Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c1
-rw-r--r--fs/9p/v9fs_vfs.h1
-rw-r--r--fs/9p/vfs_inode.c6
-rw-r--r--fs/9p/vfs_inode_dotl.c16
-rw-r--r--fs/9p/xattr.c13
-rw-r--r--fs/9p/xattr.h2
-rw-r--r--fs/Kconfig4
-rw-r--r--fs/Makefile1
-rw-r--r--fs/adfs/inode.c13
-rw-r--r--fs/affs/amigaffs.c4
-rw-r--r--fs/affs/inode.c17
-rw-r--r--fs/affs/namei.c1
-rw-r--r--fs/afs/cell.c6
-rw-r--r--fs/afs/dynroot.c35
-rw-r--r--fs/afs/inode.c8
-rw-r--r--fs/afs/internal.h9
-rw-r--r--fs/afs/main.c2
-rw-r--r--fs/afs/rxrpc.c2
-rw-r--r--fs/afs/server_list.c2
-rw-r--r--fs/afs/super.c4
-rw-r--r--fs/afs/vl_rotate.c10
-rw-r--r--fs/afs/volume.c26
-rw-r--r--fs/afs/write.c2
-rw-r--r--fs/afs/xattr.c2
-rw-r--r--fs/anon_inodes.c4
-rw-r--r--fs/attr.c4
-rw-r--r--fs/autofs/autofs_i.h20
-rw-r--r--fs/autofs/init.c9
-rw-r--r--fs/autofs/inode.c463
-rw-r--r--fs/autofs/root.c6
-rw-r--r--fs/bad_inode.c2
-rw-r--r--fs/bcachefs/Kconfig95
-rw-r--r--fs/bcachefs/Makefile91
-rw-r--r--fs/bcachefs/acl.c464
-rw-r--r--fs/bcachefs/acl.h60
-rw-r--r--fs/bcachefs/alloc_background.c2159
-rw-r--r--fs/bcachefs/alloc_background.h259
-rw-r--r--fs/bcachefs/alloc_foreground.c1638
-rw-r--r--fs/bcachefs/alloc_foreground.h224
-rw-r--r--fs/bcachefs/alloc_types.h126
-rw-r--r--fs/bcachefs/backpointers.c860
-rw-r--r--fs/bcachefs/backpointers.h140
-rw-r--r--fs/bcachefs/bbpos.h37
-rw-r--r--fs/bcachefs/bbpos_types.h18
-rw-r--r--fs/bcachefs/bcachefs.h1164
-rw-r--r--fs/bcachefs/bcachefs_format.h2454
-rw-r--r--fs/bcachefs/bcachefs_ioctl.h368
-rw-r--r--fs/bcachefs/bkey.c1120
-rw-r--r--fs/bcachefs/bkey.h778
-rw-r--r--fs/bcachefs/bkey_buf.h61
-rw-r--r--fs/bcachefs/bkey_cmp.h129
-rw-r--r--fs/bcachefs/bkey_methods.c459
-rw-r--r--fs/bcachefs/bkey_methods.h179
-rw-r--r--fs/bcachefs/bkey_sort.c201
-rw-r--r--fs/bcachefs/bkey_sort.h54
-rw-r--r--fs/bcachefs/bset.c1592
-rw-r--r--fs/bcachefs/bset.h541
-rw-r--r--fs/bcachefs/btree_cache.c1215
-rw-r--r--fs/bcachefs/btree_cache.h131
-rw-r--r--fs/bcachefs/btree_gc.c2146
-rw-r--r--fs/bcachefs/btree_gc.h114
-rw-r--r--fs/bcachefs/btree_io.c2297
-rw-r--r--fs/bcachefs/btree_io.h225
-rw-r--r--fs/bcachefs/btree_iter.c3261
-rw-r--r--fs/bcachefs/btree_iter.h944
-rw-r--r--fs/bcachefs/btree_journal_iter.c543
-rw-r--r--fs/bcachefs/btree_journal_iter.h65
-rw-r--r--fs/bcachefs/btree_key_cache.c1074
-rw-r--r--fs/bcachefs/btree_key_cache.h48
-rw-r--r--fs/bcachefs/btree_key_cache_types.h34
-rw-r--r--fs/bcachefs/btree_locking.c817
-rw-r--r--fs/bcachefs/btree_locking.h433
-rw-r--r--fs/bcachefs/btree_trans_commit.c1162
-rw-r--r--fs/bcachefs/btree_types.h725
-rw-r--r--fs/bcachefs/btree_update.c950
-rw-r--r--fs/bcachefs/btree_update.h340
-rw-r--r--fs/bcachefs/btree_update_interior.c2476
-rw-r--r--fs/bcachefs/btree_update_interior.h332
-rw-r--r--fs/bcachefs/btree_write_buffer.c375
-rw-r--r--fs/bcachefs/btree_write_buffer.h14
-rw-r--r--fs/bcachefs/btree_write_buffer_types.h44
-rw-r--r--fs/bcachefs/buckets.c2170
-rw-r--r--fs/bcachefs/buckets.h458
-rw-r--r--fs/bcachefs/buckets_types.h92
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal.c166
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal.h15
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal_types.h23
-rw-r--r--fs/bcachefs/chardev.c784
-rw-r--r--fs/bcachefs/chardev.h31
-rw-r--r--fs/bcachefs/checksum.c804
-rw-r--r--fs/bcachefs/checksum.h213
-rw-r--r--fs/bcachefs/clock.c193
-rw-r--r--fs/bcachefs/clock.h38
-rw-r--r--fs/bcachefs/clock_types.h37
-rw-r--r--fs/bcachefs/compress.c732
-rw-r--r--fs/bcachefs/compress.h73
-rw-r--r--fs/bcachefs/counters.c107
-rw-r--r--fs/bcachefs/counters.h17
-rw-r--r--fs/bcachefs/darray.c24
-rw-r--r--fs/bcachefs/darray.h105
-rw-r--r--fs/bcachefs/data_update.c653
-rw-r--r--fs/bcachefs/data_update.h49
-rw-r--r--fs/bcachefs/debug.c954
-rw-r--r--fs/bcachefs/debug.h32
-rw-r--r--fs/bcachefs/dirent.c580
-rw-r--r--fs/bcachefs/dirent.h71
-rw-r--r--fs/bcachefs/disk_groups.c622
-rw-r--r--fs/bcachefs/disk_groups.h111
-rw-r--r--fs/bcachefs/disk_groups_types.h18
-rw-r--r--fs/bcachefs/ec.c1981
-rw-r--r--fs/bcachefs/ec.h260
-rw-r--r--fs/bcachefs/ec_types.h41
-rw-r--r--fs/bcachefs/errcode.c68
-rw-r--r--fs/bcachefs/errcode.h273
-rw-r--r--fs/bcachefs/error.c302
-rw-r--r--fs/bcachefs/error.h242
-rw-r--r--fs/bcachefs/extent_update.c173
-rw-r--r--fs/bcachefs/extent_update.h12
-rw-r--r--fs/bcachefs/extents.c1511
-rw-r--r--fs/bcachefs/extents.h765
-rw-r--r--fs/bcachefs/extents_types.h40
-rw-r--r--fs/bcachefs/eytzinger.h281
-rw-r--r--fs/bcachefs/fifo.h127
-rw-r--r--fs/bcachefs/fs-common.c501
-rw-r--r--fs/bcachefs/fs-common.h43
-rw-r--r--fs/bcachefs/fs-io-buffered.c1106
-rw-r--r--fs/bcachefs/fs-io-buffered.h27
-rw-r--r--fs/bcachefs/fs-io-direct.c677
-rw-r--r--fs/bcachefs/fs-io-direct.h16
-rw-r--r--fs/bcachefs/fs-io-pagecache.c791
-rw-r--r--fs/bcachefs/fs-io-pagecache.h176
-rw-r--r--fs/bcachefs/fs-io.c1072
-rw-r--r--fs/bcachefs/fs-io.h184
-rw-r--r--fs/bcachefs/fs-ioctl.c570
-rw-r--r--fs/bcachefs/fs-ioctl.h81
-rw-r--r--fs/bcachefs/fs.c2010
-rw-r--r--fs/bcachefs/fs.h209
-rw-r--r--fs/bcachefs/fsck.c2490
-rw-r--r--fs/bcachefs/fsck.h15
-rw-r--r--fs/bcachefs/inode.c1205
-rw-r--r--fs/bcachefs/inode.h217
-rw-r--r--fs/bcachefs/io_misc.c524
-rw-r--r--fs/bcachefs/io_misc.h34
-rw-r--r--fs/bcachefs/io_read.c1210
-rw-r--r--fs/bcachefs/io_read.h158
-rw-r--r--fs/bcachefs/io_write.c1675
-rw-r--r--fs/bcachefs/io_write.h109
-rw-r--r--fs/bcachefs/io_write_types.h96
-rw-r--r--fs/bcachefs/journal.c1439
-rw-r--r--fs/bcachefs/journal.h450
-rw-r--r--fs/bcachefs/journal_io.c1966
-rw-r--r--fs/bcachefs/journal_io.h65
-rw-r--r--fs/bcachefs/journal_reclaim.c867
-rw-r--r--fs/bcachefs/journal_reclaim.h87
-rw-r--r--fs/bcachefs/journal_sb.c219
-rw-r--r--fs/bcachefs/journal_sb.h24
-rw-r--r--fs/bcachefs/journal_seq_blacklist.c320
-rw-r--r--fs/bcachefs/journal_seq_blacklist.h22
-rw-r--r--fs/bcachefs/journal_types.h319
-rw-r--r--fs/bcachefs/keylist.c52
-rw-r--r--fs/bcachefs/keylist.h74
-rw-r--r--fs/bcachefs/keylist_types.h16
-rw-r--r--fs/bcachefs/logged_ops.c112
-rw-r--r--fs/bcachefs/logged_ops.h20
-rw-r--r--fs/bcachefs/lru.c164
-rw-r--r--fs/bcachefs/lru.h69
-rw-r--r--fs/bcachefs/mean_and_variance.c159
-rw-r--r--fs/bcachefs/mean_and_variance.h198
-rw-r--r--fs/bcachefs/mean_and_variance_test.c240
-rw-r--r--fs/bcachefs/migrate.c179
-rw-r--r--fs/bcachefs/migrate.h7
-rw-r--r--fs/bcachefs/move.c1154
-rw-r--r--fs/bcachefs/move.h158
-rw-r--r--fs/bcachefs/move_types.h36
-rw-r--r--fs/bcachefs/movinggc.c431
-rw-r--r--fs/bcachefs/movinggc.h12
-rw-r--r--fs/bcachefs/nocow_locking.c144
-rw-r--r--fs/bcachefs/nocow_locking.h50
-rw-r--r--fs/bcachefs/nocow_locking_types.h20
-rw-r--r--fs/bcachefs/opts.c602
-rw-r--r--fs/bcachefs/opts.h564
-rw-r--r--fs/bcachefs/printbuf.c447
-rw-r--r--fs/bcachefs/printbuf.h286
-rw-r--r--fs/bcachefs/quota.c979
-rw-r--r--fs/bcachefs/quota.h74
-rw-r--r--fs/bcachefs/quota_types.h43
-rw-r--r--fs/bcachefs/rebalance.c464
-rw-r--r--fs/bcachefs/rebalance.h27
-rw-r--r--fs/bcachefs/rebalance_types.h37
-rw-r--r--fs/bcachefs/recovery.c1157
-rw-r--r--fs/bcachefs/recovery.h39
-rw-r--r--fs/bcachefs/recovery_types.h65
-rw-r--r--fs/bcachefs/reflink.c414
-rw-r--r--fs/bcachefs/reflink.h81
-rw-r--r--fs/bcachefs/replicas.c1059
-rw-r--r--fs/bcachefs/replicas.h93
-rw-r--r--fs/bcachefs/replicas_types.h27
-rw-r--r--fs/bcachefs/sb-clean.c396
-rw-r--r--fs/bcachefs/sb-clean.h16
-rw-r--r--fs/bcachefs/sb-downgrade.c188
-rw-r--r--fs/bcachefs/sb-downgrade.h10
-rw-r--r--fs/bcachefs/sb-errors.c170
-rw-r--r--fs/bcachefs/sb-errors.h19
-rw-r--r--fs/bcachefs/sb-errors_types.h269
-rw-r--r--fs/bcachefs/sb-members.c420
-rw-r--r--fs/bcachefs/sb-members.h227
-rw-r--r--fs/bcachefs/seqmutex.h48
-rw-r--r--fs/bcachefs/siphash.c173
-rw-r--r--fs/bcachefs/siphash.h87
-rw-r--r--fs/bcachefs/six.c920
-rw-r--r--fs/bcachefs/six.h393
-rw-r--r--fs/bcachefs/snapshot.c1713
-rw-r--r--fs/bcachefs/snapshot.h268
-rw-r--r--fs/bcachefs/str_hash.h370
-rw-r--r--fs/bcachefs/subvolume.c455
-rw-r--r--fs/bcachefs/subvolume.h38
-rw-r--r--fs/bcachefs/subvolume_types.h31
-rw-r--r--fs/bcachefs/super-io.c1353
-rw-r--r--fs/bcachefs/super-io.h104
-rw-r--r--fs/bcachefs/super.c2030
-rw-r--r--fs/bcachefs/super.h52
-rw-r--r--fs/bcachefs/super_types.h41
-rw-r--r--fs/bcachefs/sysfs.c1034
-rw-r--r--fs/bcachefs/sysfs.h48
-rw-r--r--fs/bcachefs/tests.c919
-rw-r--r--fs/bcachefs/tests.h15
-rw-r--r--fs/bcachefs/trace.c17
-rw-r--r--fs/bcachefs/trace.h1327
-rw-r--r--fs/bcachefs/two_state_shared_lock.c8
-rw-r--r--fs/bcachefs/two_state_shared_lock.h59
-rw-r--r--fs/bcachefs/util.c1159
-rw-r--r--fs/bcachefs/util.h834
-rw-r--r--fs/bcachefs/varint.c129
-rw-r--r--fs/bcachefs/varint.h11
-rw-r--r--fs/bcachefs/vstructs.h63
-rw-r--r--fs/bcachefs/xattr.c653
-rw-r--r--fs/bcachefs/xattr.h50
-rw-r--r--fs/befs/linuxvfs.c11
-rw-r--r--fs/bfs/dir.c9
-rw-r--r--fs/bfs/inode.c12
-rw-r--r--fs/binfmt_elf.c215
-rw-r--r--fs/binfmt_elf_fdpic.c20
-rw-r--r--fs/binfmt_misc.c388
-rw-r--r--fs/btrfs/Kconfig21
-rw-r--r--fs/btrfs/Makefile3
-rw-r--r--fs/btrfs/accessors.h16
-rw-r--r--fs/btrfs/async-thread.c12
-rw-r--r--fs/btrfs/async-thread.h6
-rw-r--r--fs/btrfs/backref.c5
-rw-r--r--fs/btrfs/backref.h10
-rw-r--r--fs/btrfs/bio.c47
-rw-r--r--fs/btrfs/block-group.c174
-rw-r--r--fs/btrfs/block-rsv.c24
-rw-r--r--fs/btrfs/btrfs_inode.h80
-rw-r--r--fs/btrfs/check-integrity.c2871
-rw-r--r--fs/btrfs/check-integrity.h20
-rw-r--r--fs/btrfs/compression.c6
-rw-r--r--fs/btrfs/ctree.c342
-rw-r--r--fs/btrfs/ctree.h142
-rw-r--r--fs/btrfs/defrag.c152
-rw-r--r--fs/btrfs/defrag.h2
-rw-r--r--fs/btrfs/delalloc-space.c8
-rw-r--r--fs/btrfs/delayed-inode.c47
-rw-r--r--fs/btrfs/delayed-inode.h1
-rw-r--r--fs/btrfs/delayed-ref.c199
-rw-r--r--fs/btrfs/delayed-ref.h70
-rw-r--r--fs/btrfs/dev-replace.c17
-rw-r--r--fs/btrfs/dir-item.c8
-rw-r--r--fs/btrfs/dir-item.h9
-rw-r--r--fs/btrfs/disk-io.c171
-rw-r--r--fs/btrfs/disk-io.h3
-rw-r--r--fs/btrfs/extent-io-tree.c272
-rw-r--r--fs/btrfs/extent-io-tree.h7
-rw-r--r--fs/btrfs/extent-tree.c577
-rw-r--r--fs/btrfs/extent-tree.h18
-rw-r--r--fs/btrfs/extent_io.c53
-rw-r--r--fs/btrfs/extent_io.h4
-rw-r--r--fs/btrfs/file-item.c17
-rw-r--r--fs/btrfs/file.c81
-rw-r--r--fs/btrfs/free-space-cache.c28
-rw-r--r--fs/btrfs/free-space-tree.c17
-rw-r--r--fs/btrfs/fs.h69
-rw-r--r--fs/btrfs/inode-item.c21
-rw-r--r--fs/btrfs/inode-item.h8
-rw-r--r--fs/btrfs/inode.c249
-rw-r--r--fs/btrfs/ioctl.c61
-rw-r--r--fs/btrfs/locking.c20
-rw-r--r--fs/btrfs/messages.c32
-rw-r--r--fs/btrfs/messages.h14
-rw-r--r--fs/btrfs/ordered-data.c138
-rw-r--r--fs/btrfs/ordered-data.h17
-rw-r--r--fs/btrfs/print-tree.c35
-rw-r--r--fs/btrfs/props.c1
-rw-r--r--fs/btrfs/qgroup.c914
-rw-r--r--fs/btrfs/qgroup.h152
-rw-r--r--fs/btrfs/raid-stripe-tree.c274
-rw-r--r--fs/btrfs/raid-stripe-tree.h50
-rw-r--r--fs/btrfs/ref-verify.c11
-rw-r--r--fs/btrfs/reflink.c5
-rw-r--r--fs/btrfs/relocation.c208
-rw-r--r--fs/btrfs/relocation.h9
-rw-r--r--fs/btrfs/root-tree.c12
-rw-r--r--fs/btrfs/root-tree.h8
-rw-r--r--fs/btrfs/scrub.c88
-rw-r--r--fs/btrfs/send.c8
-rw-r--r--fs/btrfs/space-info.c64
-rw-r--r--fs/btrfs/space-info.h3
-rw-r--r--fs/btrfs/super.c94
-rw-r--r--fs/btrfs/sysfs.c53
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c6
-rw-r--r--fs/btrfs/tests/inode-tests.c12
-rw-r--r--fs/btrfs/transaction.c234
-rw-r--r--fs/btrfs/transaction.h23
-rw-r--r--fs/btrfs/tree-checker.c87
-rw-r--r--fs/btrfs/tree-log.c93
-rw-r--r--fs/btrfs/ulist.c3
-rw-r--r--fs/btrfs/uuid-tree.c6
-rw-r--r--fs/btrfs/verity.c4
-rw-r--r--fs/btrfs/volumes.c533
-rw-r--r--fs/btrfs/volumes.h45
-rw-r--r--fs/btrfs/xattr.c14
-rw-r--r--fs/btrfs/xattr.h2
-rw-r--r--fs/btrfs/zoned.c459
-rw-r--r--fs/btrfs/zstd.c11
-rw-r--r--fs/buffer.c129
-rw-r--r--fs/ceph/acl.c12
-rw-r--r--fs/ceph/addr.c309
-rw-r--r--fs/ceph/cache.c2
-rw-r--r--fs/ceph/caps.c767
-rw-r--r--fs/ceph/crypto.c44
-rw-r--r--fs/ceph/debugfs.c10
-rw-r--r--fs/ceph/dir.c242
-rw-r--r--fs/ceph/export.c49
-rw-r--r--fs/ceph/file.c284
-rw-r--r--fs/ceph/inode.c567
-rw-r--r--fs/ceph/ioctl.c21
-rw-r--r--fs/ceph/locks.c57
-rw-r--r--fs/ceph/mds_client.c686
-rw-r--r--fs/ceph/mds_client.h13
-rw-r--r--fs/ceph/mdsmap.c29
-rw-r--r--fs/ceph/mdsmap.h75
-rw-r--r--fs/ceph/metric.c5
-rw-r--r--fs/ceph/quota.c29
-rw-r--r--fs/ceph/snap.c196
-rw-r--r--fs/ceph/super.c99
-rw-r--r--fs/ceph/super.h25
-rw-r--r--fs/ceph/xattr.c110
-rw-r--r--fs/char_dev.c4
-rw-r--r--fs/coda/coda_linux.c6
-rw-r--r--fs/coda/dir.c2
-rw-r--r--fs/coda/file.c2
-rw-r--r--fs/configfs/inode.c8
-rw-r--r--fs/cramfs/inode.c6
-rw-r--r--fs/crypto/bio.c39
-rw-r--r--fs/crypto/crypto.c163
-rw-r--r--fs/crypto/fname.c6
-rw-r--r--fs/crypto/fscrypt_private.h164
-rw-r--r--fs/crypto/hooks.c4
-rw-r--r--fs/crypto/inline_crypt.c32
-rw-r--r--fs/crypto/keyring.c82
-rw-r--r--fs/crypto/keysetup.c62
-rw-r--r--fs/crypto/keysetup_v1.c22
-rw-r--r--fs/crypto/policy.c83
-rw-r--r--fs/dax.c24
-rw-r--r--fs/dcache.c20
-rw-r--r--fs/debugfs/file.c94
-rw-r--r--fs/debugfs/inode.c66
-rw-r--r--fs/debugfs/internal.h15
-rw-r--r--fs/devpts/inode.c6
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/dlm/debug_fs.c18
-rw-r--r--fs/dlm/lowcomms.c12
-rw-r--r--fs/dlm/midcomms.c39
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h2
-rw-r--r--fs/ecryptfs/inode.c14
-rw-r--r--fs/efivarfs/file.c2
-rw-r--r--fs/efivarfs/inode.c6
-rw-r--r--fs/efivarfs/internal.h9
-rw-r--r--fs/efivarfs/super.c70
-rw-r--r--fs/efs/inode.c5
-rw-r--r--fs/efs/super.c1
-rw-r--r--fs/erofs/Kconfig9
-rw-r--r--fs/erofs/compress.h6
-rw-r--r--fs/erofs/data.c7
-rw-r--r--fs/erofs/decompressor.c63
-rw-r--r--fs/erofs/decompressor_deflate.c6
-rw-r--r--fs/erofs/decompressor_lzma.c7
-rw-r--r--fs/erofs/inode.c101
-rw-r--r--fs/erofs/internal.h44
-rw-r--r--fs/erofs/super.c102
-rw-r--r--fs/erofs/utils.c27
-rw-r--r--fs/erofs/xattr.c2
-rw-r--r--fs/erofs/xattr.h4
-rw-r--r--fs/erofs/zdata.c1
-rw-r--r--fs/eventpoll.c6
-rw-r--r--fs/exec.c4
-rw-r--r--fs/exfat/dir.c20
-rw-r--r--fs/exfat/exfat_fs.h15
-rw-r--r--fs/exfat/exfat_raw.h19
-rw-r--r--fs/exfat/file.c103
-rw-r--r--fs/exfat/inode.c37
-rw-r--r--fs/exfat/misc.c8
-rw-r--r--fs/exfat/namei.c83
-rw-r--r--fs/exfat/super.c15
-rw-r--r--fs/exportfs/expfs.c57
-rw-r--r--fs/ext2/dir.c220
-rw-r--r--fs/ext2/ext2.h23
-rw-r--r--fs/ext2/file.c1
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/inode.c13
-rw-r--r--fs/ext2/namei.c32
-rw-r--r--fs/ext2/super.c3
-rw-r--r--fs/ext2/xattr.c4
-rw-r--r--fs/ext2/xattr.h2
-rw-r--r--fs/ext4/acl.h5
-rw-r--r--fs/ext4/balloc.c16
-rw-r--r--fs/ext4/crypto.c13
-rw-r--r--fs/ext4/ext4.h32
-rw-r--r--fs/ext4/extents.c25
-rw-r--r--fs/ext4/extents_status.c150
-rw-r--r--fs/ext4/fast_commit.c8
-rw-r--r--fs/ext4/file.c171
-rw-r--r--fs/ext4/fsmap.c9
-rw-r--r--fs/ext4/ialloc.c4
-rw-r--r--fs/ext4/inline.c4
-rw-r--r--fs/ext4/inode.c47
-rw-r--r--fs/ext4/ioctl.c13
-rw-r--r--fs/ext4/mballoc-test.c349
-rw-r--r--fs/ext4/mballoc.c575
-rw-r--r--fs/ext4/move_extent.c11
-rw-r--r--fs/ext4/namei.c13
-rw-r--r--fs/ext4/readpage.c14
-rw-r--r--fs/ext4/resize.c94
-rw-r--r--fs/ext4/super.c85
-rw-r--r--fs/ext4/xattr.c10
-rw-r--r--fs/ext4/xattr.h2
-rw-r--r--fs/f2fs/compress.c63
-rw-r--r--fs/f2fs/data.c35
-rw-r--r--fs/f2fs/dir.c6
-rw-r--r--fs/f2fs/extent_cache.c53
-rw-r--r--fs/f2fs/f2fs.h11
-rw-r--r--fs/f2fs/file.c31
-rw-r--r--fs/f2fs/inline.c2
-rw-r--r--fs/f2fs/inode.c26
-rw-r--r--fs/f2fs/namei.c4
-rw-r--r--fs/f2fs/node.c20
-rw-r--r--fs/f2fs/recovery.c8
-rw-r--r--fs/f2fs/segment.c92
-rw-r--r--fs/f2fs/segment.h4
-rw-r--r--fs/f2fs/super.c158
-rw-r--r--fs/f2fs/xattr.c24
-rw-r--r--fs/f2fs/xattr.h2
-rw-r--r--fs/fat/inode.c25
-rw-r--r--fs/fat/misc.c6
-rw-r--r--fs/fat/nfs.c1
-rw-r--r--fs/fcntl.c2
-rw-r--r--fs/fhandle.c6
-rw-r--r--fs/file.c153
-rw-r--r--fs/file_table.c51
-rw-r--r--fs/freevxfs/vxfs_inode.c6
-rw-r--r--fs/freevxfs/vxfs_super.c2
-rw-r--r--fs/fs-writeback.c41
-rw-r--r--fs/fsopen.c1
-rw-r--r--fs/fuse/control.c2
-rw-r--r--fs/fuse/dax.c1
-rw-r--r--fs/fuse/dir.c10
-rw-r--r--fs/fuse/file.c8
-rw-r--r--fs/fuse/fuse_i.h21
-rw-r--r--fs/fuse/inode.c117
-rw-r--r--fs/fuse/readdir.c6
-rw-r--r--fs/fuse/xattr.c2
-rw-r--r--fs/gfs2/acl.h8
-rw-r--r--fs/gfs2/aops.c74
-rw-r--r--fs/gfs2/aops.h6
-rw-r--r--fs/gfs2/bmap.c75
-rw-r--r--fs/gfs2/bmap.h38
-rw-r--r--fs/gfs2/dir.c12
-rw-r--r--fs/gfs2/dir.h38
-rw-r--r--fs/gfs2/file.c18
-rw-r--r--fs/gfs2/glock.c31
-rw-r--r--fs/gfs2/glock.h113
-rw-r--r--fs/gfs2/glops.c24
-rw-r--r--fs/gfs2/glops.h4
-rw-r--r--fs/gfs2/incore.h2
-rw-r--r--fs/gfs2/inode.c40
-rw-r--r--fs/gfs2/inode.h60
-rw-r--r--fs/gfs2/log.h46
-rw-r--r--fs/gfs2/lops.h22
-rw-r--r--fs/gfs2/main.c6
-rw-r--r--fs/gfs2/meta_io.c61
-rw-r--r--fs/gfs2/meta_io.h20
-rw-r--r--fs/gfs2/ops_fstype.c37
-rw-r--r--fs/gfs2/quota.c95
-rw-r--r--fs/gfs2/quota.h41
-rw-r--r--fs/gfs2/recovery.h18
-rw-r--r--fs/gfs2/rgrp.c12
-rw-r--r--fs/gfs2/rgrp.h85
-rw-r--r--fs/gfs2/super.c41
-rw-r--r--fs/gfs2/super.h54
-rw-r--r--fs/gfs2/trans.h24
-rw-r--r--fs/gfs2/util.h8
-rw-r--r--fs/gfs2/xattr.c10
-rw-r--r--fs/gfs2/xattr.h12
-rw-r--r--fs/hfs/attr.c2
-rw-r--r--fs/hfs/catalog.c8
-rw-r--r--fs/hfs/hfs_fs.h2
-rw-r--r--fs/hfs/inode.c16
-rw-r--r--fs/hfs/sysdep.c10
-rw-r--r--fs/hfsplus/catalog.c8
-rw-r--r--fs/hfsplus/inode.c22
-rw-r--r--fs/hfsplus/xattr.c2
-rw-r--r--fs/hfsplus/xattr.h2
-rw-r--r--fs/hostfs/hostfs_kern.c12
-rw-r--r--fs/hpfs/dir.c12
-rw-r--r--fs/hpfs/inode.c16
-rw-r--r--fs/hpfs/namei.c22
-rw-r--r--fs/hpfs/super.c10
-rw-r--r--fs/hugetlbfs/inode.c96
-rw-r--r--fs/init.c6
-rw-r--r--fs/inode.c53
-rw-r--r--fs/internal.h22
-rw-r--r--fs/iomap/buffered-io.c57
-rw-r--r--fs/isofs/inode.c4
-rw-r--r--fs/isofs/rock.c18
-rw-r--r--fs/jbd2/commit.c10
-rw-r--r--fs/jbd2/journal.c53
-rw-r--r--fs/jbd2/recovery.c13
-rw-r--r--fs/jffs2/dir.c35
-rw-r--r--fs/jffs2/file.c4
-rw-r--r--fs/jffs2/fs.c20
-rw-r--r--fs/jffs2/os-linux.h4
-rw-r--r--fs/jffs2/super.c1
-rw-r--r--fs/jffs2/xattr.c2
-rw-r--r--fs/jffs2/xattr.h2
-rw-r--r--fs/jfs/inode.c2
-rw-r--r--fs/jfs/jfs_dinode.h2
-rw-r--r--fs/jfs/jfs_dmap.c23
-rw-r--r--fs/jfs/jfs_imap.c31
-rw-r--r--fs/jfs/jfs_incore.h2
-rw-r--r--fs/jfs/jfs_inode.c4
-rw-r--r--fs/jfs/jfs_logmgr.c33
-rw-r--r--fs/jfs/jfs_logmgr.h2
-rw-r--r--fs/jfs/jfs_mount.c3
-rw-r--r--fs/jfs/jfs_txnmgr.c4
-rw-r--r--fs/jfs/jfs_xattr.h2
-rw-r--r--fs/jfs/jfs_xtree.c4
-rw-r--r--fs/jfs/jfs_xtree.h37
-rw-r--r--fs/jfs/namei.c20
-rw-r--r--fs/jfs/super.c3
-rw-r--r--fs/jfs/xattr.c2
-rw-r--r--fs/kernfs/file.c78
-rw-r--r--fs/kernfs/inode.c8
-rw-r--r--fs/kernfs/kernfs-internal.h2
-rw-r--r--fs/kernfs/mount.c7
-rw-r--r--fs/libfs.c99
-rw-r--r--fs/lockd/svc.c7
-rw-r--r--fs/lockd/svclock.c43
-rw-r--r--fs/locks.c16
-rw-r--r--fs/mbcache.c22
-rw-r--r--fs/minix/bitmap.c2
-rw-r--r--fs/minix/dir.c6
-rw-r--r--fs/minix/inode.c17
-rw-r--r--fs/minix/itree_common.c2
-rw-r--r--fs/mnt_idmapping.c2
-rw-r--r--fs/mpage.c3
-rw-r--r--fs/namei.c31
-rw-r--r--fs/namespace.c56
-rw-r--r--fs/nfs/Kconfig2
-rw-r--r--fs/nfs/blocklayout/blocklayout.h2
-rw-r--r--fs/nfs/blocklayout/dev.c76
-rw-r--r--fs/nfs/callback.c46
-rw-r--r--fs/nfs/callback_proc.c2
-rw-r--r--fs/nfs/delegation.c7
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/dir.c29
-rw-r--r--fs/nfs/filelayout/filelayout.h2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h2
-rw-r--r--fs/nfs/fscache.h4
-rw-r--r--fs/nfs/inode.c30
-rw-r--r--fs/nfs/nfs.h2
-rw-r--r--fs/nfs/nfs3proc.c3
-rw-r--r--fs/nfs/nfs42xattr.c87
-rw-r--r--fs/nfs/nfs4_fs.h4
-rw-r--r--fs/nfs/nfs4proc.c64
-rw-r--r--fs/nfs/pnfs.c8
-rw-r--r--fs/nfs/pnfs.h5
-rw-r--r--fs/nfs/proc.c3
-rw-r--r--fs/nfs/super.c31
-rw-r--r--fs/nfs/write.c2
-rw-r--r--fs/nfsd/Makefile3
-rw-r--r--fs/nfsd/auth.c4
-rw-r--r--fs/nfsd/blocklayout.c3
-rw-r--r--fs/nfsd/blocklayoutxdr.c6
-rw-r--r--fs/nfsd/blocklayoutxdr.h4
-rw-r--r--fs/nfsd/cache.h4
-rw-r--r--fs/nfsd/export.c35
-rw-r--r--fs/nfsd/export.h4
-rw-r--r--fs/nfsd/filecache.c50
-rw-r--r--fs/nfsd/flexfilelayoutxdr.c6
-rw-r--r--fs/nfsd/flexfilelayoutxdr.h4
-rw-r--r--fs/nfsd/netlink.c32
-rw-r--r--fs/nfsd/netlink.h22
-rw-r--r--fs/nfsd/netns.h4
-rw-r--r--fs/nfsd/nfs3proc.c9
-rw-r--r--fs/nfsd/nfs4layouts.c6
-rw-r--r--fs/nfsd/nfs4proc.c40
-rw-r--r--fs/nfsd/nfs4state.c63
-rw-r--r--fs/nfsd/nfs4xdr.c2631
-rw-r--r--fs/nfsd/nfscache.c118
-rw-r--r--fs/nfsd/nfsctl.c234
-rw-r--r--fs/nfsd/nfsd.h25
-rw-r--r--fs/nfsd/nfsfh.c2
-rw-r--r--fs/nfsd/nfsfh.h3
-rw-r--r--fs/nfsd/nfssvc.c59
-rw-r--r--fs/nfsd/pnfs.h6
-rw-r--r--fs/nfsd/state.h2
-rw-r--r--fs/nfsd/stats.c4
-rw-r--r--fs/nfsd/stats.h18
-rw-r--r--fs/nfsd/trace.h87
-rw-r--r--fs/nfsd/vfs.c70
-rw-r--r--fs/nfsd/vfs.h4
-rw-r--r--fs/nfsd/xdr4.h154
-rw-r--r--fs/nilfs2/dir.c6
-rw-r--r--fs/nilfs2/inode.c20
-rw-r--r--fs/nilfs2/mdt.c66
-rw-r--r--fs/nilfs2/page.c76
-rw-r--r--fs/nilfs2/page.h11
-rw-r--r--fs/nilfs2/segment.c7
-rw-r--r--fs/nilfs2/sufile.c42
-rw-r--r--fs/nilfs2/the_nilfs.c6
-rw-r--r--fs/notify/dnotify/dnotify.c12
-rw-r--r--fs/notify/fanotify/fanotify.h4
-rw-r--r--fs/notify/fanotify/fanotify_user.c12
-rw-r--r--fs/notify/inotify/inotify_user.c2
-rw-r--r--fs/nsfs.c2
-rw-r--r--fs/ntfs/aops.c255
-rw-r--r--fs/ntfs/file.c89
-rw-r--r--fs/ntfs/inode.c25
-rw-r--r--fs/ntfs/mft.c2
-rw-r--r--fs/ntfs/namei.c1
-rw-r--r--fs/ntfs3/file.c37
-rw-r--r--fs/ntfs3/frecord.c11
-rw-r--r--fs/ntfs3/inode.c25
-rw-r--r--fs/ntfs3/namei.c4
-rw-r--r--fs/ntfs3/ntfs_fs.h2
-rw-r--r--fs/ntfs3/super.c1
-rw-r--r--fs/ntfs3/xattr.c2
-rw-r--r--fs/ocfs2/acl.c4
-rw-r--r--fs/ocfs2/alloc.c17
-rw-r--r--fs/ocfs2/aops.c25
-rw-r--r--fs/ocfs2/buffer_head_io.c4
-rw-r--r--fs/ocfs2/cluster/heartbeat.c81
-rw-r--r--fs/ocfs2/dir.c9
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c7
-rw-r--r--fs/ocfs2/dlmglue.c29
-rw-r--r--fs/ocfs2/file.c30
-rw-r--r--fs/ocfs2/inode.c28
-rw-r--r--fs/ocfs2/journal.c2
-rw-r--r--fs/ocfs2/move_extents.c4
-rw-r--r--fs/ocfs2/namei.c24
-rw-r--r--fs/ocfs2/quota_local.c4
-rw-r--r--fs/ocfs2/refcounttree.c12
-rw-r--r--fs/ocfs2/slot_map.c2
-rw-r--r--fs/ocfs2/xattr.c8
-rw-r--r--fs/ocfs2/xattr.h2
-rw-r--r--fs/omfs/inode.c12
-rw-r--r--fs/open.c55
-rw-r--r--fs/openpromfs/inode.c4
-rw-r--r--fs/orangefs/orangefs-kernel.h2
-rw-r--r--fs/orangefs/orangefs-utils.c16
-rw-r--r--fs/orangefs/xattr.c2
-rw-r--r--fs/overlayfs/Makefile2
-rw-r--r--fs/overlayfs/copy_up.c143
-rw-r--r--fs/overlayfs/dir.c64
-rw-r--r--fs/overlayfs/export.c7
-rw-r--r--fs/overlayfs/file.c97
-rw-r--r--fs/overlayfs/inode.c178
-rw-r--r--fs/overlayfs/namei.c52
-rw-r--r--fs/overlayfs/overlayfs.h80
-rw-r--r--fs/overlayfs/params.c333
-rw-r--r--fs/overlayfs/params.h1
-rw-r--r--fs/overlayfs/readdir.c27
-rw-r--r--fs/overlayfs/super.c118
-rw-r--r--fs/overlayfs/util.c123
-rw-r--r--fs/overlayfs/xattrs.c271
-rw-r--r--fs/pipe.c68
-rw-r--r--fs/proc/array.c7
-rw-r--r--fs/proc/base.c56
-rw-r--r--fs/proc/bootconfig.c6
-rw-r--r--fs/proc/fd.c11
-rw-r--r--fs/proc/inode.c13
-rw-r--r--fs/proc/kcore.c3
-rw-r--r--fs/proc/nommu.c2
-rw-r--r--fs/proc/proc_sysctl.c10
-rw-r--r--fs/proc/root.c2
-rw-r--r--fs/proc/self.c2
-rw-r--r--fs/proc/task_mmu.c764
-rw-r--r--fs/proc/task_nommu.c2
-rw-r--r--fs/proc/thread_self.c2
-rw-r--r--fs/pstore/inode.c5
-rw-r--r--fs/pstore/platform.c9
-rw-r--r--fs/qnx4/inode.c6
-rw-r--r--fs/qnx6/inode.c6
-rw-r--r--fs/quota/dquot.c31
-rw-r--r--fs/ramfs/inode.c7
-rw-r--r--fs/reiserfs/inode.c106
-rw-r--r--fs/reiserfs/journal.c56
-rw-r--r--fs/reiserfs/namei.c8
-rw-r--r--fs/reiserfs/procfs.c2
-rw-r--r--fs/reiserfs/reiserfs.h13
-rw-r--r--fs/reiserfs/stree.c5
-rw-r--r--fs/reiserfs/super.c2
-rw-r--r--fs/reiserfs/xattr.c4
-rw-r--r--fs/romfs/super.c5
-rw-r--r--fs/smb/client/cached_dir.c99
-rw-r--r--fs/smb/client/cifs_debug.c45
-rw-r--r--fs/smb/client/cifs_ioctl.h6
-rw-r--r--fs/smb/client/cifs_spnego.c4
-rw-r--r--fs/smb/client/cifsfs.c175
-rw-r--r--fs/smb/client/cifsfs.h6
-rw-r--r--fs/smb/client/cifsglob.h34
-rw-r--r--fs/smb/client/cifspdu.h30
-rw-r--r--fs/smb/client/cifsproto.h28
-rw-r--r--fs/smb/client/cifssmb.c199
-rw-r--r--fs/smb/client/connect.c97
-rw-r--r--fs/smb/client/dfs.c18
-rw-r--r--fs/smb/client/export.c11
-rw-r--r--fs/smb/client/file.c18
-rw-r--r--fs/smb/client/fs_context.h1
-rw-r--r--fs/smb/client/fscache.h6
-rw-r--r--fs/smb/client/inode.c99
-rw-r--r--fs/smb/client/ioctl.c26
-rw-r--r--fs/smb/client/link.c16
-rw-r--r--fs/smb/client/misc.c4
-rw-r--r--fs/smb/client/namespace.c17
-rw-r--r--fs/smb/client/ntlmssp.h4
-rw-r--r--fs/smb/client/readdir.c6
-rw-r--r--fs/smb/client/sess.c298
-rw-r--r--fs/smb/client/smb1ops.c153
-rw-r--r--fs/smb/client/smb2inode.c2
-rw-r--r--fs/smb/client/smb2misc.c58
-rw-r--r--fs/smb/client/smb2ops.c351
-rw-r--r--fs/smb/client/smb2pdu.c262
-rw-r--r--fs/smb/client/smb2pdu.h16
-rw-r--r--fs/smb/client/smb2proto.h12
-rw-r--r--fs/smb/client/smb2transport.c13
-rw-r--r--fs/smb/client/transport.c13
-rw-r--r--fs/smb/client/xattr.c7
-rw-r--r--fs/smb/common/smb2pdu.h44
-rw-r--r--fs/smb/server/connection.c16
-rw-r--r--fs/smb/server/ksmbd_spnego_negtokeninit.asn18
-rw-r--r--fs/smb/server/ksmbd_spnego_negtokentarg.asn17
-rw-r--r--fs/smb/server/ksmbd_work.c51
-rw-r--r--fs/smb/server/mgmt/user_config.h1
-rw-r--r--fs/smb/server/oplock.c118
-rw-r--r--fs/smb/server/oplock.h8
-rw-r--r--fs/smb/server/smb2misc.c15
-rw-r--r--fs/smb/server/smb2ops.c9
-rw-r--r--fs/smb/server/smb2pdu.c164
-rw-r--r--fs/smb/server/smb_common.c11
-rw-r--r--fs/smb/server/smbacl.c36
-rw-r--r--fs/smb/server/smbacl.h2
-rw-r--r--fs/smb/server/transport_rdma.c40
-rw-r--r--fs/smb/server/unicode.c187
-rw-r--r--fs/smb/server/vfs.c105
-rw-r--r--fs/smb/server/vfs.h10
-rw-r--r--fs/smb/server/vfs_cache.c30
-rw-r--r--fs/smb/server/vfs_cache.h9
-rw-r--r--fs/squashfs/block.c2
-rw-r--r--fs/squashfs/export.c1
-rw-r--r--fs/squashfs/inode.c6
-rw-r--r--fs/squashfs/squashfs.h2
-rw-r--r--fs/squashfs/xattr.c2
-rw-r--r--fs/stack.c4
-rw-r--r--fs/stat.c10
-rw-r--r--fs/super.c102
-rw-r--r--fs/sysfs/file.c13
-rw-r--r--fs/sysv/dir.c6
-rw-r--r--fs/sysv/ialloc.c2
-rw-r--r--fs/sysv/inode.c12
-rw-r--r--fs/sysv/itree.c2
-rw-r--r--fs/tracefs/event_inode.c1197
-rw-r--r--fs/tracefs/inode.c27
-rw-r--r--fs/tracefs/internal.h56
-rw-r--r--fs/ubifs/auth.c3
-rw-r--r--fs/ubifs/crypto.c3
-rw-r--r--fs/ubifs/debug.c12
-rw-r--r--fs/ubifs/dir.c27
-rw-r--r--fs/ubifs/file.c19
-rw-r--r--fs/ubifs/journal.c13
-rw-r--r--fs/ubifs/replay.c1
-rw-r--r--fs/ubifs/super.c34
-rw-r--r--fs/ubifs/tnc.c1
-rw-r--r--fs/ubifs/ubifs.h4
-rw-r--r--fs/ubifs/xattr.c2
-rw-r--r--fs/udf/ecma_167.h2
-rw-r--r--fs/udf/ialloc.c4
-rw-r--r--fs/udf/inode.c38
-rw-r--r--fs/udf/namei.c16
-rw-r--r--fs/udf/udf_sb.h2
-rw-r--r--fs/ufs/balloc.c20
-rw-r--r--fs/ufs/dir.c6
-rw-r--r--fs/ufs/ialloc.c2
-rw-r--r--fs/ufs/inode.c67
-rw-r--r--fs/ufs/super.c1
-rw-r--r--fs/ufs/util.c34
-rw-r--r--fs/ufs/util.h10
-rw-r--r--fs/userfaultfd.c98
-rw-r--r--fs/vboxsf/utils.c15
-rw-r--r--fs/xattr.c6
-rw-r--r--fs/xfs/Kconfig2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c27
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c45
-rw-r--r--fs/xfs/libxfs/xfs_defer.c28
-rw-r--r--fs/xfs/libxfs/xfs_defer.h2
-rw-r--r--fs/xfs/libxfs/xfs_format.h34
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c13
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c809
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.h383
-rw-r--r--fs/xfs/libxfs/xfs_sb.c2
-rw-r--r--fs/xfs/libxfs/xfs_sb.h2
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c2
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c10
-rw-r--r--fs/xfs/libxfs/xfs_types.c4
-rw-r--r--fs/xfs/libxfs/xfs_types.h10
-rw-r--r--fs/xfs/scrub/bmap.c2
-rw-r--r--fs/xfs/scrub/fscounters.c2
-rw-r--r--fs/xfs/scrub/inode.c3
-rw-r--r--fs/xfs/scrub/rtbitmap.c28
-rw-r--r--fs/xfs/scrub/rtsummary.c72
-rw-r--r--fs/xfs/scrub/trace.c1
-rw-r--r--fs/xfs/scrub/trace.h15
-rw-r--r--fs/xfs/xfs_bmap_util.c81
-rw-r--r--fs/xfs/xfs_buf.c46
-rw-r--r--fs/xfs/xfs_buf.h5
-rw-r--r--fs/xfs/xfs_dquot.c5
-rw-r--r--fs/xfs/xfs_dquot_item_recover.c21
-rw-r--r--fs/xfs/xfs_file.c63
-rw-r--r--fs/xfs/xfs_fsmap.c15
-rw-r--r--fs/xfs/xfs_icache.c26
-rw-r--r--fs/xfs/xfs_inode.c28
-rw-r--r--fs/xfs/xfs_inode.h17
-rw-r--r--fs/xfs/xfs_inode_item.c7
-rw-r--r--fs/xfs/xfs_inode_item_recover.c46
-rw-r--r--fs/xfs/xfs_ioctl.c35
-rw-r--r--fs/xfs/xfs_ioctl32.h2
-rw-r--r--fs/xfs/xfs_iops.c15
-rw-r--r--fs/xfs/xfs_itable.c12
-rw-r--r--fs/xfs/xfs_linux.h12
-rw-r--r--fs/xfs/xfs_log.c23
-rw-r--r--fs/xfs/xfs_log_recover.c2
-rw-r--r--fs/xfs/xfs_mount.c4
-rw-r--r--fs/xfs/xfs_mount.h10
-rw-r--r--fs/xfs/xfs_ondisk.h4
-rw-r--r--fs/xfs/xfs_qm.c27
-rw-r--r--fs/xfs/xfs_qm.h2
-rw-r--r--fs/xfs/xfs_reflink.c5
-rw-r--r--fs/xfs/xfs_rtalloc.c650
-rw-r--r--fs/xfs/xfs_rtalloc.h94
-rw-r--r--fs/xfs/xfs_super.c45
-rw-r--r--fs/xfs/xfs_trans.c7
-rw-r--r--fs/xfs/xfs_xattr.c2
-rw-r--r--fs/xfs/xfs_xattr.h2
-rw-r--r--fs/zonefs/super.c10
864 files changed, 118289 insertions, 16327 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index d525957594b6..61dbe52bb3a3 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -732,4 +732,5 @@ module_exit(exit_v9fs)
MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>");
MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>");
+MODULE_DESCRIPTION("9P Client File System");
MODULE_LICENSE("GPL");
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index cdf441f22e07..731e3d14b67d 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -52,7 +52,6 @@ void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
unsigned int flags);
int v9fs_dir_release(struct inode *inode, struct file *filp);
int v9fs_file_open(struct inode *inode, struct file *file);
-void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
int v9fs_uflags2omode(int uflags, int extended);
void v9fs_blank_wstat(struct p9_wstat *wstat);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 0d28ecf668d0..b845ee18a80b 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -260,7 +260,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
inode->i_blocks = 0;
inode->i_rdev = rdev;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mapping->a_ops = &v9fs_addr_operations;
inode->i_private = NULL;
@@ -1150,8 +1150,8 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
set_nlink(inode, 1);
- inode->i_atime.tv_sec = stat->atime;
- inode->i_mtime.tv_sec = stat->mtime;
+ inode_set_atime(inode, stat->atime, 0);
+ inode_set_mtime(inode, stat->mtime, 0);
inode_set_ctime(inode, stat->mtime, 0);
inode->i_uid = v9ses->dfltuid;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 1312f68965ac..c7319af2f471 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -641,10 +641,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
struct v9fs_inode *v9inode = V9FS_I(inode);
if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
- inode->i_atime.tv_sec = stat->st_atime_sec;
- inode->i_atime.tv_nsec = stat->st_atime_nsec;
- inode->i_mtime.tv_sec = stat->st_mtime_sec;
- inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+ inode_set_atime(inode, stat->st_atime_sec,
+ stat->st_atime_nsec);
+ inode_set_mtime(inode, stat->st_mtime_sec,
+ stat->st_mtime_nsec);
inode_set_ctime(inode, stat->st_ctime_sec,
stat->st_ctime_nsec);
inode->i_uid = stat->st_uid;
@@ -660,12 +660,12 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
inode->i_blocks = stat->st_blocks;
} else {
if (stat->st_result_mask & P9_STATS_ATIME) {
- inode->i_atime.tv_sec = stat->st_atime_sec;
- inode->i_atime.tv_nsec = stat->st_atime_nsec;
+ inode_set_atime(inode, stat->st_atime_sec,
+ stat->st_atime_nsec);
}
if (stat->st_result_mask & P9_STATS_MTIME) {
- inode->i_mtime.tv_sec = stat->st_mtime_sec;
- inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+ inode_set_mtime(inode, stat->st_mtime_sec,
+ stat->st_mtime_nsec);
}
if (stat->st_result_mask & P9_STATS_CTIME) {
inode_set_ctime(inode, stat->st_ctime_sec,
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index e00cf8109b3f..8604e3377ee7 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -68,7 +68,7 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
struct p9_fid *fid;
int ret;
- p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu\n",
+ p9_debug(P9_DEBUG_VFS, "name = '%s' value_len = %zu\n",
name, buffer_size);
fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
@@ -139,7 +139,8 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
- return v9fs_xattr_get(dentry, NULL, buffer, buffer_size);
+ /* Txattrwalk with an empty string lists xattrs instead */
+ return v9fs_xattr_get(dentry, "", buffer, buffer_size);
}
static int v9fs_xattr_handler_get(const struct xattr_handler *handler,
@@ -162,27 +163,27 @@ static int v9fs_xattr_handler_set(const struct xattr_handler *handler,
return v9fs_xattr_set(dentry, full_name, value, size, flags);
}
-static struct xattr_handler v9fs_xattr_user_handler = {
+static const struct xattr_handler v9fs_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.get = v9fs_xattr_handler_get,
.set = v9fs_xattr_handler_set,
};
-static struct xattr_handler v9fs_xattr_trusted_handler = {
+static const struct xattr_handler v9fs_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.get = v9fs_xattr_handler_get,
.set = v9fs_xattr_handler_set,
};
#ifdef CONFIG_9P_FS_SECURITY
-static struct xattr_handler v9fs_xattr_security_handler = {
+static const struct xattr_handler v9fs_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.get = v9fs_xattr_handler_get,
.set = v9fs_xattr_handler_set,
};
#endif
-const struct xattr_handler *v9fs_xattr_handlers[] = {
+const struct xattr_handler * const v9fs_xattr_handlers[] = {
&v9fs_xattr_user_handler,
&v9fs_xattr_trusted_handler,
#ifdef CONFIG_9P_FS_SECURITY
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index b5636e544c8a..3ad5a802352a 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -10,7 +10,7 @@
#include <net/9p/9p.h>
#include <net/9p/client.h>
-extern const struct xattr_handler *v9fs_xattr_handlers[];
+extern const struct xattr_handler * const v9fs_xattr_handlers[];
ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
void *buffer, size_t buffer_size);
diff --git a/fs/Kconfig b/fs/Kconfig
index aa7e03cc1941..42837617a55b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -48,6 +48,7 @@ source "fs/ocfs2/Kconfig"
source "fs/btrfs/Kconfig"
source "fs/nilfs2/Kconfig"
source "fs/f2fs/Kconfig"
+source "fs/bcachefs/Kconfig"
source "fs/zonefs/Kconfig"
endif # BLOCK
@@ -255,7 +256,7 @@ config ARCH_SUPPORTS_HUGETLBFS
config HUGETLBFS
bool "HugeTLB file system support"
- depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
+ depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
depends on (SYSFS || SYSCTL)
select MEMFD_CREATE
help
@@ -267,6 +268,7 @@ config HUGETLBFS
config HUGETLB_PAGE
def_bool HUGETLBFS
+ select XARRAY_MULTI
config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
def_bool HUGETLB_PAGE
diff --git a/fs/Makefile b/fs/Makefile
index f9541f40be4e..75522f88e763 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -123,6 +123,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
obj-$(CONFIG_F2FS_FS) += f2fs/
+obj-$(CONFIG_BCACHEFS_FS) += bcachefs/
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
obj-$(CONFIG_EFIVAR_FS) += efivarfs/
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 20963002578a..3081edb09e46 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -242,6 +242,7 @@ struct inode *
adfs_iget(struct super_block *sb, struct object_info *obj)
{
struct inode *inode;
+ struct timespec64 ts;
inode = new_inode(sb);
if (!inode)
@@ -268,9 +269,10 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
ADFS_I(inode)->attr = obj->attr;
inode->i_mode = adfs_atts2mode(sb, inode);
- adfs_adfs2unix_time(&inode->i_mtime, inode);
- inode->i_atime = inode->i_mtime;
- inode_set_ctime_to_ts(inode, inode->i_mtime);
+ adfs_adfs2unix_time(&ts, inode);
+ inode_set_atime_to_ts(inode, ts);
+ inode_set_mtime_to_ts(inode, ts);
+ inode_set_ctime_to_ts(inode, ts);
if (S_ISDIR(inode->i_mode)) {
inode->i_op = &adfs_dir_inode_operations;
@@ -321,7 +323,8 @@ adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
if (ia_valid & ATTR_MTIME && adfs_inode_is_stamped(inode)) {
adfs_unix2adfs_time(inode, &attr->ia_mtime);
- adfs_adfs2unix_time(&inode->i_mtime, inode);
+ adfs_adfs2unix_time(&attr->ia_mtime, inode);
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
}
/*
@@ -329,7 +332,7 @@ adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
* have the ability to represent them in our filesystem?
*/
if (ia_valid & ATTR_ATIME)
- inode->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(inode, attr->ia_atime);
if (ia_valid & ATTR_CTIME)
inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (ia_valid & ATTR_MODE) {
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 7ba93efc1143..fd669daa4e7b 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -60,7 +60,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh)
mark_buffer_dirty_inode(dir_bh, dir);
affs_brelse(dir_bh);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_inc_iversion(dir);
mark_inode_dirty(dir);
@@ -114,7 +114,7 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
affs_brelse(bh);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_inc_iversion(dir);
mark_inode_dirty(dir);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 060746c63151..0210df8d3500 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -149,13 +149,9 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
break;
}
- inode->i_mtime.tv_sec = inode->i_atime.tv_sec =
- inode_set_ctime(inode,
- (be32_to_cpu(tail->change.days) * 86400LL +
- be32_to_cpu(tail->change.mins) * 60 +
- be32_to_cpu(tail->change.ticks) / 50 + AFFS_EPOCH_DELTA)
- + sys_tz.tz_minuteswest * 60, 0).tv_sec;
- inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = 0;
+ inode_set_mtime(inode,
+ inode_set_atime(inode, inode_set_ctime(inode, (be32_to_cpu(tail->change.days) * 86400LL + be32_to_cpu(tail->change.mins) * 60 + be32_to_cpu(tail->change.ticks) / 50 + AFFS_EPOCH_DELTA) + sys_tz.tz_minuteswest * 60, 0).tv_sec, 0).tv_sec,
+ 0);
affs_brelse(bh);
unlock_new_inode(inode);
return inode;
@@ -187,12 +183,13 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc)
}
tail = AFFS_TAIL(sb, bh);
if (tail->stype == cpu_to_be32(ST_ROOT)) {
- affs_secs_to_datestamp(inode->i_mtime.tv_sec,
+ affs_secs_to_datestamp(inode_get_mtime_sec(inode),
&AFFS_ROOT_TAIL(sb, bh)->root_change);
} else {
tail->protect = cpu_to_be32(AFFS_I(inode)->i_protect);
tail->size = cpu_to_be32(inode->i_size);
- affs_secs_to_datestamp(inode->i_mtime.tv_sec, &tail->change);
+ affs_secs_to_datestamp(inode_get_mtime_sec(inode),
+ &tail->change);
if (!(inode->i_ino == AFFS_SB(sb)->s_root_block)) {
uid = i_uid_read(inode);
gid = i_gid_read(inode);
@@ -314,7 +311,7 @@ affs_new_inode(struct inode *dir)
inode->i_gid = current_fsgid();
inode->i_ino = block;
set_nlink(inode, 1);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
atomic_set(&AFFS_I(inode)->i_opencnt, 0);
AFFS_I(inode)->i_blkcnt = 0;
AFFS_I(inode)->i_lc = NULL;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 2fe4a5832fcf..d6b9758ee23d 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -568,6 +568,7 @@ static struct dentry *affs_fh_to_parent(struct super_block *sb, struct fid *fid,
}
const struct export_operations affs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = affs_fh_to_dentry,
.fh_to_parent = affs_fh_to_parent,
.get_parent = affs_get_parent,
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 988c2ac7cece..926cb1188eba 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -409,10 +409,12 @@ static int afs_update_cell(struct afs_cell *cell)
if (ret == -ENOMEM)
goto out_wake;
- ret = -ENOMEM;
vllist = afs_alloc_vlserver_list(0);
- if (!vllist)
+ if (!vllist) {
+ if (ret >= 0)
+ ret = -ENOMEM;
goto out_wake;
+ }
switch (ret) {
case -ENODATA:
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 95bcbd7654d1..1f656005018e 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -88,7 +88,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
set_nlink(inode, 2);
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_blocks = 0;
inode->i_generation = 0;
@@ -114,6 +114,7 @@ static int afs_probe_cell_name(struct dentry *dentry)
struct afs_net *net = afs_d2net(dentry);
const char *name = dentry->d_name.name;
size_t len = dentry->d_name.len;
+ char *result = NULL;
int ret;
/* Names prefixed with a dot are R/W mounts. */
@@ -131,9 +132,22 @@ static int afs_probe_cell_name(struct dentry *dentry)
}
ret = dns_query(net->net, "afsdb", name, len, "srv=1",
- NULL, NULL, false);
- if (ret == -ENODATA)
- ret = -EDESTADDRREQ;
+ &result, NULL, false);
+ if (ret == -ENODATA || ret == -ENOKEY || ret == 0)
+ ret = -ENOENT;
+ if (ret > 0 && ret >= sizeof(struct dns_server_list_v1_header)) {
+ struct dns_server_list_v1_header *v1 = (void *)result;
+
+ if (v1->hdr.zero == 0 &&
+ v1->hdr.content == DNS_PAYLOAD_IS_SERVER_LIST &&
+ v1->hdr.version == 1 &&
+ (v1->status != DNS_LOOKUP_GOOD &&
+ v1->status != DNS_LOOKUP_GOOD_WITH_BAD))
+ return -ENOENT;
+
+ }
+
+ kfree(result);
return ret;
}
@@ -252,20 +266,9 @@ static int afs_dynroot_d_revalidate(struct dentry *dentry, unsigned int flags)
return 1;
}
-/*
- * Allow the VFS to enquire as to whether a dentry should be unhashed (mustn't
- * sleep)
- * - called from dput() when d_count is going to 0.
- * - return 1 to request dentry be unhashed, 0 otherwise
- */
-static int afs_dynroot_d_delete(const struct dentry *dentry)
-{
- return d_really_is_positive(dentry);
-}
-
const struct dentry_operations afs_dynroot_dentry_operations = {
.d_revalidate = afs_dynroot_d_revalidate,
- .d_delete = afs_dynroot_d_delete,
+ .d_delete = always_delete_dentry,
.d_release = afs_d_release,
.d_automount = afs_d_automount,
};
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 1c794a1896aa..78efc9719349 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -91,8 +91,8 @@ static int afs_inode_init_from_status(struct afs_operation *op,
t = status->mtime_client;
inode_set_ctime_to_ts(inode, t);
- inode->i_mtime = t;
- inode->i_atime = t;
+ inode_set_mtime_to_ts(inode, t);
+ inode_set_atime_to_ts(inode, t);
inode->i_flags |= S_NOATIME;
inode->i_uid = make_kuid(&init_user_ns, status->owner);
inode->i_gid = make_kgid(&init_user_ns, status->group);
@@ -204,7 +204,7 @@ static void afs_apply_status(struct afs_operation *op,
}
t = status->mtime_client;
- inode->i_mtime = t;
+ inode_set_mtime_to_ts(inode, t);
if (vp->update_ctime)
inode_set_ctime_to_ts(inode, op->ctime);
@@ -253,7 +253,7 @@ static void afs_apply_status(struct afs_operation *op,
if (change_size) {
afs_set_i_size(vnode, status->size);
inode_set_ctime_to_ts(inode, t);
- inode->i_atime = t;
+ inode_set_atime_to_ts(inode, t);
}
}
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index da73b97e19a9..7385d62c8cf5 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -87,7 +87,7 @@ struct afs_addr_list {
enum dns_lookup_status status:8;
unsigned long failed; /* Mask of addrs that failed locally/ICMP */
unsigned long responded; /* Mask of addrs that responded */
- struct sockaddr_rxrpc addrs[];
+ struct sockaddr_rxrpc addrs[] __counted_by(max_addrs);
#define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8))
};
@@ -553,6 +553,7 @@ struct afs_server_entry {
};
struct afs_server_list {
+ struct rcu_head rcu;
afs_volid_t vids[AFS_MAXTYPES]; /* Volume IDs */
refcount_t usage;
unsigned char nr_servers;
@@ -585,6 +586,7 @@ struct afs_volume {
#define AFS_VOLUME_OFFLINE 4 /* - T if volume offline notice given */
#define AFS_VOLUME_BUSY 5 /* - T if volume busy notice given */
#define AFS_VOLUME_MAYBE_NO_IBULK 6 /* - T if some servers don't have InlineBulkStatus */
+#define AFS_VOLUME_RM_TREE 7 /* - Set if volume removed from cell->volumes */
#ifdef CONFIG_AFS_FSCACHE
struct fscache_volume *cache; /* Caching cookie */
#endif
@@ -705,7 +707,7 @@ struct afs_permits {
refcount_t usage;
unsigned short nr_permits; /* Number of records */
bool invalidated; /* Invalidated due to key change */
- struct afs_permit permits[]; /* List of permits sorted by key pointer */
+ struct afs_permit permits[] __counted_by(nr_permits); /* List of permits sorted by key pointer */
};
/*
@@ -1512,6 +1514,7 @@ extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *,
extern struct afs_volume *afs_create_volume(struct afs_fs_context *);
extern int afs_activate_volume(struct afs_volume *);
extern void afs_deactivate_volume(struct afs_volume *);
+bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason);
extern struct afs_volume *afs_get_volume(struct afs_volume *, enum afs_volume_trace);
extern void afs_put_volume(struct afs_net *, struct afs_volume *, enum afs_volume_trace);
extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *);
@@ -1541,7 +1544,7 @@ int afs_launder_folio(struct folio *);
/*
* xattr.c
*/
-extern const struct xattr_handler *afs_xattr_handlers[];
+extern const struct xattr_handler * const afs_xattr_handlers[];
/*
* yfsclient.c
diff --git a/fs/afs/main.c b/fs/afs/main.c
index eae288c8d40a..6425c81d07de 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -41,8 +41,6 @@ const char afs_init_sysname[] = "arm_linux26";
const char afs_init_sysname[] = "aarch64_linux26";
#elif defined(CONFIG_X86_32)
const char afs_init_sysname[] = "i386_linux26";
-#elif defined(CONFIG_IA64)
-const char afs_init_sysname[] = "ia64_linux26";
#elif defined(CONFIG_PPC64)
const char afs_init_sysname[] = "ppc64_linux26";
#elif defined(CONFIG_PPC32)
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index ed1644e7683f..d642d06a453b 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -424,7 +424,7 @@ error_kill_call:
if (call->async) {
if (cancel_work_sync(&call->async_work))
afs_put_call(call);
- afs_put_call(call);
+ afs_set_call_complete(call, ret, 0);
}
ac->error = ret;
diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c
index ed9056703505..b59896b1de0a 100644
--- a/fs/afs/server_list.c
+++ b/fs/afs/server_list.c
@@ -17,7 +17,7 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist)
for (i = 0; i < slist->nr_servers; i++)
afs_unuse_server(net, slist->servers[i].server,
afs_server_trace_put_slist);
- kfree(slist);
+ kfree_rcu(slist, rcu);
}
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 95d713074dc8..a01a0fb2cdbb 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -407,6 +407,10 @@ static int afs_validate_fc(struct fs_context *fc)
return PTR_ERR(volume);
ctx->volume = volume;
+ if (volume->type != AFSVL_RWVOL) {
+ ctx->flock_mode = afs_flock_mode_local;
+ fc->sb_flags |= SB_RDONLY;
+ }
}
return 0;
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
index 488e58490b16..eb415ce56360 100644
--- a/fs/afs/vl_rotate.c
+++ b/fs/afs/vl_rotate.c
@@ -58,6 +58,12 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
}
/* Status load is ordered after lookup counter load */
+ if (cell->dns_status == DNS_LOOKUP_GOT_NOT_FOUND) {
+ pr_warn("No record of cell %s\n", cell->name);
+ vc->error = -ENOENT;
+ return false;
+ }
+
if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
vc->error = -EDESTADDRREQ;
return false;
@@ -285,6 +291,7 @@ failed:
*/
static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
{
+ struct afs_cell *cell = vc->cell;
static int count;
int i;
@@ -294,6 +301,9 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
rcu_read_lock();
pr_notice("EDESTADDR occurred\n");
+ pr_notice("CELL: %s err=%d\n", cell->name, cell->error);
+ pr_notice("DNS: src=%u st=%u lc=%x\n",
+ cell->dns_source, cell->dns_status, cell->dns_lookup_count);
pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n",
vc->untried, vc->index, vc->nr_iterations, vc->flags, vc->error);
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 29d483c80281..115c081a8e2c 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -32,8 +32,13 @@ static struct afs_volume *afs_insert_volume_into_cell(struct afs_cell *cell,
} else if (p->vid > volume->vid) {
pp = &(*pp)->rb_right;
} else {
- volume = afs_get_volume(p, afs_volume_trace_get_cell_insert);
- goto found;
+ if (afs_try_get_volume(p, afs_volume_trace_get_cell_insert)) {
+ volume = p;
+ goto found;
+ }
+
+ set_bit(AFS_VOLUME_RM_TREE, &volume->flags);
+ rb_replace_node_rcu(&p->cell_node, &volume->cell_node, &cell->volumes);
}
}
@@ -56,7 +61,8 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume)
afs_volume_trace_remove);
write_seqlock(&cell->volume_lock);
hlist_del_rcu(&volume->proc_link);
- rb_erase(&volume->cell_node, &cell->volumes);
+ if (!test_and_set_bit(AFS_VOLUME_RM_TREE, &volume->flags))
+ rb_erase(&volume->cell_node, &cell->volumes);
write_sequnlock(&cell->volume_lock);
}
}
@@ -232,6 +238,20 @@ static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume)
}
/*
+ * Try to get a reference on a volume record.
+ */
+bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason)
+{
+ int r;
+
+ if (__refcount_inc_not_zero(&volume->ref, &r)) {
+ trace_afs_volume(volume->vid, r + 1, reason);
+ return true;
+ }
+ return false;
+}
+
+/*
* Get a reference on a volume record.
*/
struct afs_volume *afs_get_volume(struct afs_volume *volume,
diff --git a/fs/afs/write.c b/fs/afs/write.c
index e1c45341719b..4a168781936b 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -424,7 +424,7 @@ try_next_key:
op->store.write_iter = iter;
op->store.i_size = max(pos + size, vnode->netfs.remote_i_size);
- op->mtime = vnode->netfs.inode.i_mtime;
+ op->mtime = inode_get_mtime(&vnode->netfs.inode);
afs_wait_for_operation(op);
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
index 9048d8ccc715..64b2c0224f62 100644
--- a/fs/afs/xattr.c
+++ b/fs/afs/xattr.c
@@ -353,7 +353,7 @@ static const struct xattr_handler afs_xattr_afs_volume_handler = {
.get = afs_xattr_get_volume,
};
-const struct xattr_handler *afs_xattr_handlers[] = {
+const struct xattr_handler * const afs_xattr_handlers[] = {
&afs_xattr_afs_acl_handler,
&afs_xattr_afs_cell_handler,
&afs_xattr_afs_fid_handler,
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 24192a7667ed..d26222b7eefe 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -24,8 +24,8 @@
#include <linux/uaccess.h>
-static struct vfsmount *anon_inode_mnt __read_mostly;
-static struct inode *anon_inode_inode;
+static struct vfsmount *anon_inode_mnt __ro_after_init;
+static struct inode *anon_inode_inode __ro_after_init;
/*
* anon_inodefs_dname() is called from d_path().
diff --git a/fs/attr.c b/fs/attr.c
index a8ae5f6d9b16..bdf5deb06ea9 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -308,9 +308,9 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode,
i_uid_update(idmap, attr, inode);
i_gid_update(idmap, attr, inode);
if (ia_valid & ATTR_ATIME)
- inode->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(inode, attr->ia_atime);
if (ia_valid & ATTR_MTIME)
- inode->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
if (ia_valid & ATTR_CTIME)
inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (ia_valid & ATTR_MODE) {
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index d5a44fa88acf..8c1d587b3eef 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -25,6 +25,8 @@
#include <linux/completion.h>
#include <linux/file.h>
#include <linux/magic.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
/* This is the range of ioctl() numbers we claim as ours */
#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY
@@ -205,20 +207,34 @@ static inline void managed_dentry_clear_managed(struct dentry *dentry)
/* Initializing function */
-int autofs_fill_super(struct super_block *, void *, int);
+extern const struct fs_parameter_spec autofs_param_specs[];
+int autofs_init_fs_context(struct fs_context *fc);
struct autofs_info *autofs_new_ino(struct autofs_sb_info *);
void autofs_clean_ino(struct autofs_info *);
-static inline int autofs_prepare_pipe(struct file *pipe)
+static inline int autofs_check_pipe(struct file *pipe)
{
if (!(pipe->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
if (!S_ISFIFO(file_inode(pipe)->i_mode))
return -EINVAL;
+ return 0;
+}
+
+static inline void autofs_set_packet_pipe_flags(struct file *pipe)
+{
/* We want a packet pipe */
pipe->f_flags |= O_DIRECT;
/* We don't expect -EAGAIN */
pipe->f_flags &= ~O_NONBLOCK;
+}
+
+static inline int autofs_prepare_pipe(struct file *pipe)
+{
+ int ret = autofs_check_pipe(pipe);
+ if (ret < 0)
+ return ret;
+ autofs_set_packet_pipe_flags(pipe);
return 0;
}
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
index d3f55e874338..b5e4dfa04ed0 100644
--- a/fs/autofs/init.c
+++ b/fs/autofs/init.c
@@ -7,16 +7,11 @@
#include <linux/init.h>
#include "autofs_i.h"
-static struct dentry *autofs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_nodev(fs_type, flags, data, autofs_fill_super);
-}
-
struct file_system_type autofs_fs_type = {
.owner = THIS_MODULE,
.name = "autofs",
- .mount = autofs_mount,
+ .init_fs_context = autofs_init_fs_context,
+ .parameters = autofs_param_specs,
.kill_sb = autofs_kill_sb,
};
MODULE_ALIAS_FS("autofs");
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 2b49662ed237..1f5db6863663 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -6,7 +6,6 @@
#include <linux/seq_file.h>
#include <linux/pagemap.h>
-#include <linux/parser.h>
#include "autofs_i.h"
@@ -110,189 +109,179 @@ static const struct super_operations autofs_sops = {
.evict_inode = autofs_evict_inode,
};
-enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
- Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire,
- Opt_ignore};
-
-static const match_table_t tokens = {
- {Opt_fd, "fd=%u"},
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_pgrp, "pgrp=%u"},
- {Opt_minproto, "minproto=%u"},
- {Opt_maxproto, "maxproto=%u"},
- {Opt_indirect, "indirect"},
- {Opt_direct, "direct"},
- {Opt_offset, "offset"},
- {Opt_strictexpire, "strictexpire"},
- {Opt_ignore, "ignore"},
- {Opt_err, NULL}
+enum {
+ Opt_direct,
+ Opt_fd,
+ Opt_gid,
+ Opt_ignore,
+ Opt_indirect,
+ Opt_maxproto,
+ Opt_minproto,
+ Opt_offset,
+ Opt_pgrp,
+ Opt_strictexpire,
+ Opt_uid,
};
-static int parse_options(char *options,
- struct inode *root, int *pgrp, bool *pgrp_set,
- struct autofs_sb_info *sbi)
+const struct fs_parameter_spec autofs_param_specs[] = {
+ fsparam_flag ("direct", Opt_direct),
+ fsparam_fd ("fd", Opt_fd),
+ fsparam_u32 ("gid", Opt_gid),
+ fsparam_flag ("ignore", Opt_ignore),
+ fsparam_flag ("indirect", Opt_indirect),
+ fsparam_u32 ("maxproto", Opt_maxproto),
+ fsparam_u32 ("minproto", Opt_minproto),
+ fsparam_flag ("offset", Opt_offset),
+ fsparam_u32 ("pgrp", Opt_pgrp),
+ fsparam_flag ("strictexpire", Opt_strictexpire),
+ fsparam_u32 ("uid", Opt_uid),
+ {}
+};
+
+struct autofs_fs_context {
+ kuid_t uid;
+ kgid_t gid;
+ int pgrp;
+ bool pgrp_set;
+};
+
+/*
+ * Open the fd. We do it here rather than in get_tree so that it's done in the
+ * context of the system call that passed the data and not the one that
+ * triggered the superblock creation, lest the fd gets reassigned.
+ */
+static int autofs_parse_fd(struct fs_context *fc, struct autofs_sb_info *sbi,
+ struct fs_parameter *param,
+ struct fs_parse_result *result)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
- int pipefd = -1;
- kuid_t uid;
- kgid_t gid;
+ struct file *pipe;
+ int ret;
- root->i_uid = current_uid();
- root->i_gid = current_gid();
+ if (param->type == fs_value_is_file) {
+ /* came through the new api */
+ pipe = param->file;
+ param->file = NULL;
+ } else {
+ pipe = fget(result->uint_32);
+ }
+ if (!pipe) {
+ errorf(fc, "could not open pipe file descriptor");
+ return -EBADF;
+ }
- sbi->min_proto = AUTOFS_MIN_PROTO_VERSION;
- sbi->max_proto = AUTOFS_MAX_PROTO_VERSION;
+ ret = autofs_check_pipe(pipe);
+ if (ret < 0) {
+ errorf(fc, "Invalid/unusable pipe");
+ if (param->type != fs_value_is_file)
+ fput(pipe);
+ return -EBADF;
+ }
- sbi->pipefd = -1;
+ autofs_set_packet_pipe_flags(pipe);
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
-
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_fd:
- if (match_int(args, &pipefd))
- return 1;
- sbi->pipefd = pipefd;
- break;
- case Opt_uid:
- if (match_int(args, &option))
- return 1;
- uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(uid))
- return 1;
- root->i_uid = uid;
- break;
- case Opt_gid:
- if (match_int(args, &option))
- return 1;
- gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(gid))
- return 1;
- root->i_gid = gid;
- break;
- case Opt_pgrp:
- if (match_int(args, &option))
- return 1;
- *pgrp = option;
- *pgrp_set = true;
- break;
- case Opt_minproto:
- if (match_int(args, &option))
- return 1;
- sbi->min_proto = option;
- break;
- case Opt_maxproto:
- if (match_int(args, &option))
- return 1;
- sbi->max_proto = option;
- break;
- case Opt_indirect:
- set_autofs_type_indirect(&sbi->type);
- break;
- case Opt_direct:
- set_autofs_type_direct(&sbi->type);
- break;
- case Opt_offset:
- set_autofs_type_offset(&sbi->type);
- break;
- case Opt_strictexpire:
- sbi->flags |= AUTOFS_SBI_STRICTEXPIRE;
- break;
- case Opt_ignore:
- sbi->flags |= AUTOFS_SBI_IGNORE;
- break;
- default:
- return 1;
- }
+ if (sbi->pipe)
+ fput(sbi->pipe);
+
+ sbi->pipefd = result->uint_32;
+ sbi->pipe = pipe;
+
+ return 0;
+}
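
For reference, autofs_parse_fd() above accepts the control pipe either as a number (the legacy "fd=%u" option string) or as a file passed directly over the new mount API. A rough, hypothetical userspace sketch of the latter path, assuming the fsopen()/fsconfig()/fsmount() syscalls are reached via syscall(2); the mount target, protocol values and error handling are placeholders, not part of this patch:

	#include <fcntl.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/mount.h>

	static int mount_autofs(int pipefd, const char *target)
	{
		long fsfd, mfd;

		fsfd = syscall(SYS_fsopen, "autofs", FSOPEN_CLOEXEC);
		if (fsfd < 0)
			return -1;

		/* FSCONFIG_SET_FD hands the pipe over as a file, which is the
		 * fs_value_is_file branch in autofs_parse_fd() above. */
		if (syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FD, "fd", NULL, pipefd) ||
		    syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "minproto", "5", 0) ||
		    syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "maxproto", "5", 0) ||
		    syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FLAG, "direct", NULL, 0) ||
		    syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0))
			return -1;

		mfd = syscall(SYS_fsmount, fsfd, FSMOUNT_CLOEXEC, 0);
		if (mfd < 0)
			return -1;

		return syscall(SYS_move_mount, mfd, "", AT_FDCWD, target,
			       MOVE_MOUNT_F_EMPTY_PATH);
	}
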
+
+static int autofs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct autofs_fs_context *ctx = fc->fs_private;
+ struct autofs_sb_info *sbi = fc->s_fs_info;
+ struct fs_parse_result result;
+ kuid_t uid;
+ kgid_t gid;
+ int opt;
+
+ opt = fs_parse(fc, autofs_param_specs, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_fd:
+ return autofs_parse_fd(fc, sbi, param, &result);
+ case Opt_uid:
+ uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(uid))
+ return invalfc(fc, "Invalid uid");
+ ctx->uid = uid;
+ break;
+ case Opt_gid:
+ gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(gid))
+ return invalfc(fc, "Invalid gid");
+ ctx->gid = gid;
+ break;
+ case Opt_pgrp:
+ ctx->pgrp = result.uint_32;
+ ctx->pgrp_set = true;
+ break;
+ case Opt_minproto:
+ sbi->min_proto = result.uint_32;
+ break;
+ case Opt_maxproto:
+ sbi->max_proto = result.uint_32;
+ break;
+ case Opt_indirect:
+ set_autofs_type_indirect(&sbi->type);
+ break;
+ case Opt_direct:
+ set_autofs_type_direct(&sbi->type);
+ break;
+ case Opt_offset:
+ set_autofs_type_offset(&sbi->type);
+ break;
+ case Opt_strictexpire:
+ sbi->flags |= AUTOFS_SBI_STRICTEXPIRE;
+ break;
+ case Opt_ignore:
+ sbi->flags |= AUTOFS_SBI_IGNORE;
}
- return (sbi->pipefd < 0);
+
+ return 0;
}
-int autofs_fill_super(struct super_block *s, void *data, int silent)
+static struct autofs_sb_info *autofs_alloc_sbi(void)
{
- struct inode *root_inode;
- struct dentry *root;
- struct file *pipe;
struct autofs_sb_info *sbi;
- struct autofs_info *ino;
- int pgrp = 0;
- bool pgrp_set = false;
- int ret = -EINVAL;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
- return -ENOMEM;
- pr_debug("starting up, sbi = %p\n", sbi);
+ return NULL;
- s->s_fs_info = sbi;
sbi->magic = AUTOFS_SBI_MAGIC;
- sbi->pipefd = -1;
- sbi->pipe = NULL;
- sbi->exp_timeout = 0;
- sbi->oz_pgrp = NULL;
- sbi->sb = s;
- sbi->version = 0;
- sbi->sub_version = 0;
sbi->flags = AUTOFS_SBI_CATATONIC;
+ sbi->min_proto = AUTOFS_MIN_PROTO_VERSION;
+ sbi->max_proto = AUTOFS_MAX_PROTO_VERSION;
+ sbi->pipefd = -1;
+
set_autofs_type_indirect(&sbi->type);
- sbi->min_proto = 0;
- sbi->max_proto = 0;
mutex_init(&sbi->wq_mutex);
mutex_init(&sbi->pipe_mutex);
spin_lock_init(&sbi->fs_lock);
- sbi->queues = NULL;
spin_lock_init(&sbi->lookup_lock);
INIT_LIST_HEAD(&sbi->active_list);
INIT_LIST_HEAD(&sbi->expiring_list);
- s->s_blocksize = 1024;
- s->s_blocksize_bits = 10;
- s->s_magic = AUTOFS_SUPER_MAGIC;
- s->s_op = &autofs_sops;
- s->s_d_op = &autofs_dentry_operations;
- s->s_time_gran = 1;
-
- /*
- * Get the root inode and dentry, but defer checking for errors.
- */
- ino = autofs_new_ino(sbi);
- if (!ino) {
- ret = -ENOMEM;
- goto fail_free;
- }
- root_inode = autofs_get_inode(s, S_IFDIR | 0755);
- root = d_make_root(root_inode);
- if (!root) {
- ret = -ENOMEM;
- goto fail_ino;
- }
- pipe = NULL;
- root->d_fsdata = ino;
+ return sbi;
+}
- /* Can this call block? */
- if (parse_options(data, root_inode, &pgrp, &pgrp_set, sbi)) {
- pr_err("called with bogus options\n");
- goto fail_dput;
- }
+static int autofs_validate_protocol(struct fs_context *fc)
+{
+ struct autofs_sb_info *sbi = fc->s_fs_info;
/* Test versions first */
if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) {
- pr_err("kernel does not match daemon version "
+ errorf(fc, "kernel does not match daemon version "
"daemon (%d, %d) kernel (%d, %d)\n",
sbi->min_proto, sbi->max_proto,
AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
- goto fail_dput;
+ return -EINVAL;
}
/* Establish highest kernel protocol version */
@@ -300,62 +289,148 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
sbi->version = AUTOFS_MAX_PROTO_VERSION;
else
sbi->version = sbi->max_proto;
- sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
-
- if (pgrp_set) {
- sbi->oz_pgrp = find_get_pid(pgrp);
- if (!sbi->oz_pgrp) {
- pr_err("could not find process group %d\n",
- pgrp);
- goto fail_dput;
- }
- } else {
- sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID);
+
+ switch (sbi->version) {
+ case 4:
+ sbi->sub_version = 7;
+ break;
+ case 5:
+ sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
+ break;
+ default:
+ sbi->sub_version = 0;
}
- if (autofs_type_trigger(sbi->type))
- __managed_dentry_set_managed(root);
+ return 0;
+}
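
The negotiation above clamps the daemon's advertised protocol range against the kernel's supported range and then picks the highest mutually supported version. A minimal standalone sketch; the kernel range of 3..5 is assumed purely for illustration, standing in for AUTOFS_MIN_PROTO_VERSION/AUTOFS_MAX_PROTO_VERSION:

	#include <assert.h>

	int main(void)
	{
		int kmin = 3, kmax = 5;	/* stand-ins for the kernel's supported range */
		int dmin = 4, dmax = 7;	/* daemon's advertised range */
		int version;

		assert(!(dmax < kmin || dmin > kmax));	/* ranges must overlap */
		version = dmax > kmax ? kmax : dmax;	/* highest common version */
		assert(version == 5);
		return 0;
	}
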
+static int autofs_fill_super(struct super_block *s, struct fs_context *fc)
+{
+ struct autofs_fs_context *ctx = fc->fs_private;
+ struct autofs_sb_info *sbi = s->s_fs_info;
+ struct inode *root_inode;
+ struct autofs_info *ino;
+
+ pr_debug("starting up, sbi = %p\n", sbi);
+
+ sbi->sb = s;
+ s->s_blocksize = 1024;
+ s->s_blocksize_bits = 10;
+ s->s_magic = AUTOFS_SUPER_MAGIC;
+ s->s_op = &autofs_sops;
+ s->s_d_op = &autofs_dentry_operations;
+ s->s_time_gran = 1;
+
+ /*
+ * Get the root inode and dentry, but defer checking for errors.
+ */
+ ino = autofs_new_ino(sbi);
+ if (!ino)
+ return -ENOMEM;
+
+ root_inode = autofs_get_inode(s, S_IFDIR | 0755);
+ if (!root_inode)
+ return -ENOMEM;
+
+ root_inode->i_uid = ctx->uid;
+ root_inode->i_gid = ctx->gid;
root_inode->i_fop = &autofs_root_operations;
root_inode->i_op = &autofs_dir_inode_operations;
+ s->s_root = d_make_root(root_inode);
+ if (unlikely(!s->s_root)) {
+ autofs_free_ino(ino);
+ return -ENOMEM;
+ }
+ s->s_root->d_fsdata = ino;
+
+ if (ctx->pgrp_set) {
+ sbi->oz_pgrp = find_get_pid(ctx->pgrp);
+ if (!sbi->oz_pgrp)
+ return invalf(fc, "Could not find process group %d",
+ ctx->pgrp);
+ } else
+ sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID);
+
+ if (autofs_type_trigger(sbi->type))
+ /* s->s_root won't be contended so there's little to
+ * be gained by not taking the d_lock when setting
+		 * d_flags, even when a lot of mounts are being done.
+ */
+ managed_dentry_set_managed(s->s_root);
+
pr_debug("pipe fd = %d, pgrp = %u\n",
sbi->pipefd, pid_nr(sbi->oz_pgrp));
- pipe = fget(sbi->pipefd);
- if (!pipe) {
- pr_err("could not open pipe file descriptor\n");
- goto fail_put_pid;
- }
- ret = autofs_prepare_pipe(pipe);
- if (ret < 0)
- goto fail_fput;
- sbi->pipe = pipe;
sbi->flags &= ~AUTOFS_SBI_CATATONIC;
+ return 0;
+}
- /*
- * Success! Install the root dentry now to indicate completion.
- */
- s->s_root = root;
+/*
+ * Validate the parameters and then request a superblock.
+ */
+static int autofs_get_tree(struct fs_context *fc)
+{
+ struct autofs_sb_info *sbi = fc->s_fs_info;
+ int ret;
+
+ ret = autofs_validate_protocol(fc);
+ if (ret)
+ return ret;
+
+ if (sbi->pipefd < 0)
+ return invalf(fc, "No control pipe specified");
+
+ return get_tree_nodev(fc, autofs_fill_super);
+}
+
+static void autofs_free_fc(struct fs_context *fc)
+{
+ struct autofs_fs_context *ctx = fc->fs_private;
+ struct autofs_sb_info *sbi = fc->s_fs_info;
+
+ if (sbi) {
+ if (sbi->pipe)
+ fput(sbi->pipe);
+ kfree(sbi);
+ }
+ kfree(ctx);
+}
+
+static const struct fs_context_operations autofs_context_ops = {
+ .free = autofs_free_fc,
+ .parse_param = autofs_parse_param,
+ .get_tree = autofs_get_tree,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+int autofs_init_fs_context(struct fs_context *fc)
+{
+ struct autofs_fs_context *ctx;
+ struct autofs_sb_info *sbi;
+
+ ctx = kzalloc(sizeof(struct autofs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ goto nomem;
+
+ ctx->uid = current_uid();
+ ctx->gid = current_gid();
+
+ sbi = autofs_alloc_sbi();
+ if (!sbi)
+ goto nomem_ctx;
+
+ fc->fs_private = ctx;
+ fc->s_fs_info = sbi;
+ fc->ops = &autofs_context_ops;
return 0;
- /*
- * Failure ... clean up.
- */
-fail_fput:
- pr_err("pipe file descriptor does not contain proper ops\n");
- fput(pipe);
-fail_put_pid:
- put_pid(sbi->oz_pgrp);
-fail_dput:
- dput(root);
- goto fail_free;
-fail_ino:
- autofs_free_ino(ino);
-fail_free:
- kfree(sbi);
- s->s_fs_info = NULL;
- return ret;
+nomem_ctx:
+ kfree(ctx);
+nomem:
+ return -ENOMEM;
}
struct inode *autofs_get_inode(struct super_block *sb, umode_t mode)
@@ -370,7 +445,7 @@ struct inode *autofs_get_inode(struct super_block *sb, umode_t mode)
inode->i_uid = d_inode(sb->s_root)->i_uid;
inode->i_gid = d_inode(sb->s_root)->i_gid;
}
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_ino = get_next_ino();
if (S_ISDIR(mode)) {
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 512b9a26c63d..530d18827e35 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -600,7 +600,7 @@ static int autofs_dir_symlink(struct mnt_idmap *idmap,
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count++;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
return 0;
}
@@ -633,7 +633,7 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
d_inode(dentry)->i_size = 0;
clear_nlink(d_inode(dentry));
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
spin_lock(&sbi->lookup_lock);
__autofs_add_expiring(dentry);
@@ -749,7 +749,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap,
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count++;
inc_nlink(dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
return 0;
}
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 83f9566c973b..316d88da2ce1 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -208,7 +208,7 @@ void make_bad_inode(struct inode *inode)
remove_inode_hash(inode);
inode->i_mode = S_IFREG;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_op = &bad_inode_ops;
inode->i_opflags &= ~IOP_XATTR;
inode->i_fop = &bad_file_ops;
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
new file mode 100644
index 000000000000..fddc7be58022
--- /dev/null
+++ b/fs/bcachefs/Kconfig
@@ -0,0 +1,95 @@
+
+config BCACHEFS_FS
+ tristate "bcachefs filesystem support (EXPERIMENTAL)"
+ depends on BLOCK
+ select EXPORTFS
+ select CLOSURES
+ select LIBCRC32C
+ select CRC64
+ select FS_POSIX_ACL
+ select LZ4_COMPRESS
+ select LZ4_DECOMPRESS
+ select LZ4HC_COMPRESS
+ select LZ4HC_DECOMPRESS
+ select ZLIB_DEFLATE
+ select ZLIB_INFLATE
+ select ZSTD_COMPRESS
+ select ZSTD_DECOMPRESS
+ select CRYPTO_SHA256
+ select CRYPTO_CHACHA20
+ select CRYPTO_POLY1305
+ select KEYS
+ select RAID6_PQ
+ select XOR_BLOCKS
+ select XXHASH
+ select SRCU
+ select SYMBOLIC_ERRNAME
+ help
+	  The bcachefs filesystem - a modern, copy-on-write filesystem, with
+ support for multiple devices, compression, checksumming, etc.
+
+config BCACHEFS_QUOTA
+ bool "bcachefs quota support"
+ depends on BCACHEFS_FS
+ select QUOTACTL
+
+config BCACHEFS_ERASURE_CODING
+ bool "bcachefs erasure coding (RAID5/6) support (EXPERIMENTAL)"
+ depends on BCACHEFS_FS
+ select QUOTACTL
+ help
+	  This enables the "erasure_code" filesystem and inode option, which
+	  organizes data into Reed-Solomon stripes instead of ordinary
+ replication.
+
+	  WARNING: this feature is still undergoing on-disk format changes, and
+ should only be enabled for testing purposes.
+
+config BCACHEFS_POSIX_ACL
+ bool "bcachefs POSIX ACL support"
+ depends on BCACHEFS_FS
+ select FS_POSIX_ACL
+
+config BCACHEFS_DEBUG_TRANSACTIONS
+ bool "bcachefs runtime info"
+ depends on BCACHEFS_FS
+ help
+ This makes the list of running btree transactions available in debugfs.
+
+ This is a highly useful debugging feature but does add a small amount of overhead.
+
+config BCACHEFS_DEBUG
+ bool "bcachefs debugging"
+ depends on BCACHEFS_FS
+ help
+ Enables many extra debugging checks and assertions.
+
+ The resulting code will be significantly slower than normal; you
+ probably shouldn't select this option unless you're a developer.
+
+config BCACHEFS_TESTS
+ bool "bcachefs unit and performance tests"
+ depends on BCACHEFS_FS
+ help
+	  Include some unit and performance tests for the core btree code.
+
+config BCACHEFS_LOCK_TIME_STATS
+ bool "bcachefs lock time statistics"
+ depends on BCACHEFS_FS
+ help
+	  Expose statistics in debugfs for how long locks were held.
+
+config BCACHEFS_NO_LATENCY_ACCT
+ bool "disable latency accounting and time stats"
+ depends on BCACHEFS_FS
+ help
+	  This disables device latency tracking and time stats; intended only for performance testing.
+
+config MEAN_AND_VARIANCE_UNIT_TEST
+ tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ depends on BCACHEFS_FS
+ default KUNIT_ALL_TESTS
+ help
+	  This option enables the KUnit tests for the mean_and_variance module.
+ If unsure, say N.
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
new file mode 100644
index 000000000000..b81268418174
--- /dev/null
+++ b/fs/bcachefs/Makefile
@@ -0,0 +1,91 @@
+
+obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o
+
+bcachefs-y := \
+ acl.o \
+ alloc_background.o \
+ alloc_foreground.o \
+ backpointers.o \
+ bkey.o \
+ bkey_methods.o \
+ bkey_sort.o \
+ bset.o \
+ btree_cache.o \
+ btree_gc.o \
+ btree_io.o \
+ btree_iter.o \
+ btree_journal_iter.o \
+ btree_key_cache.o \
+ btree_locking.o \
+ btree_trans_commit.o \
+ btree_update.o \
+ btree_update_interior.o \
+ btree_write_buffer.o \
+ buckets.o \
+ buckets_waiting_for_journal.o \
+ chardev.o \
+ checksum.o \
+ clock.o \
+ compress.o \
+ counters.o \
+ darray.o \
+ debug.o \
+ dirent.o \
+ disk_groups.o \
+ data_update.o \
+ ec.o \
+ errcode.o \
+ error.o \
+ extents.o \
+ extent_update.o \
+ fs.o \
+ fs-common.o \
+ fs-ioctl.o \
+ fs-io.o \
+ fs-io-buffered.o \
+ fs-io-direct.o \
+ fs-io-pagecache.o \
+ fsck.o \
+ inode.o \
+ io_read.o \
+ io_misc.o \
+ io_write.o \
+ journal.o \
+ journal_io.o \
+ journal_reclaim.o \
+ journal_sb.o \
+ journal_seq_blacklist.o \
+ keylist.o \
+ logged_ops.o \
+ lru.o \
+ mean_and_variance.o \
+ migrate.o \
+ move.o \
+ movinggc.o \
+ nocow_locking.o \
+ opts.o \
+ printbuf.o \
+ quota.o \
+ rebalance.o \
+ recovery.o \
+ reflink.o \
+ replicas.o \
+ sb-clean.o \
+ sb-downgrade.o \
+ sb-errors.o \
+ sb-members.o \
+ siphash.o \
+ six.o \
+ snapshot.o \
+ subvolume.o \
+ super.o \
+ super-io.o \
+ sysfs.o \
+ tests.o \
+ trace.o \
+ two_state_shared_lock.o \
+ util.o \
+ varint.o \
+ xattr.o
+
+obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
new file mode 100644
index 000000000000..3640f417cce1
--- /dev/null
+++ b/fs/bcachefs/acl.c
@@ -0,0 +1,464 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+
+#include "acl.h"
+#include "xattr.h"
+
+#include <linux/posix_acl.h>
+
+static const char * const acl_types[] = {
+ [ACL_USER_OBJ] = "user_obj",
+ [ACL_USER] = "user",
+ [ACL_GROUP_OBJ] = "group_obj",
+ [ACL_GROUP] = "group",
+ [ACL_MASK] = "mask",
+ [ACL_OTHER] = "other",
+ NULL,
+};
+
+void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size)
+{
+ const void *p, *end = value + size;
+
+ if (!value ||
+ size < sizeof(bch_acl_header) ||
+ ((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION))
+ return;
+
+ p = value + sizeof(bch_acl_header);
+ while (p < end) {
+ const bch_acl_entry *in = p;
+ unsigned tag = le16_to_cpu(in->e_tag);
+
+ prt_str(out, acl_types[tag]);
+
+ switch (tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ p += sizeof(bch_acl_entry_short);
+ break;
+ case ACL_USER:
+ prt_printf(out, " uid %u", le32_to_cpu(in->e_id));
+ p += sizeof(bch_acl_entry);
+ break;
+ case ACL_GROUP:
+ prt_printf(out, " gid %u", le32_to_cpu(in->e_id));
+ p += sizeof(bch_acl_entry);
+ break;
+ }
+
+ prt_printf(out, " %o", le16_to_cpu(in->e_perm));
+
+ if (p != end)
+ prt_char(out, ' ');
+ }
+}
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+
+#include "fs.h"
+
+#include <linux/fs.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
+{
+ return sizeof(bch_acl_header) +
+ sizeof(bch_acl_entry_short) * nr_short +
+ sizeof(bch_acl_entry) * nr_long;
+}
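
As a concrete illustration of the helper above: a typical access ACL with the four required short entries (user_obj, group_obj, mask, other) plus one named user works out to 28 bytes, assuming the entry layouts declared in acl.h (4-byte header, 4-byte short entry, 8-byte full entry). A standalone sketch of that calculation:

	#include <stdio.h>

	int main(void)
	{
		unsigned hdr = 4, short_e = 4, long_e = 8;	/* sizes from acl.h */
		unsigned nr_short = 4, nr_long = 1;		/* typical access ACL */

		/* mirrors bch2_acl_size(): header + short entries + long entries */
		printf("acl xattr value: %u bytes\n",
		       hdr + short_e * nr_short + long_e * nr_long);	/* 28 */
		return 0;
	}
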
+
+static inline int acl_to_xattr_type(int type)
+{
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS;
+ case ACL_TYPE_DEFAULT:
+ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Convert from filesystem to in-memory representation.
+ */
+static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans,
+ const void *value, size_t size)
+{
+ const void *p, *end = value + size;
+ struct posix_acl *acl;
+ struct posix_acl_entry *out;
+ unsigned count = 0;
+ int ret;
+
+ if (!value)
+ return NULL;
+ if (size < sizeof(bch_acl_header))
+ goto invalid;
+ if (((bch_acl_header *)value)->a_version !=
+ cpu_to_le32(BCH_ACL_VERSION))
+ goto invalid;
+
+ p = value + sizeof(bch_acl_header);
+ while (p < end) {
+ const bch_acl_entry *entry = p;
+
+ if (p + sizeof(bch_acl_entry_short) > end)
+ goto invalid;
+
+ switch (le16_to_cpu(entry->e_tag)) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ p += sizeof(bch_acl_entry_short);
+ break;
+ case ACL_USER:
+ case ACL_GROUP:
+ p += sizeof(bch_acl_entry);
+ break;
+ default:
+ goto invalid;
+ }
+
+ count++;
+ }
+
+ if (p > end)
+ goto invalid;
+
+ if (!count)
+ return NULL;
+
+ acl = allocate_dropping_locks(trans, ret,
+ posix_acl_alloc(count, _gfp));
+ if (!acl)
+ return ERR_PTR(-ENOMEM);
+ if (ret) {
+ kfree(acl);
+ return ERR_PTR(ret);
+ }
+
+ out = acl->a_entries;
+
+ p = value + sizeof(bch_acl_header);
+ while (p < end) {
+ const bch_acl_entry *in = p;
+
+ out->e_tag = le16_to_cpu(in->e_tag);
+ out->e_perm = le16_to_cpu(in->e_perm);
+
+ switch (out->e_tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ p += sizeof(bch_acl_entry_short);
+ break;
+ case ACL_USER:
+ out->e_uid = make_kuid(&init_user_ns,
+ le32_to_cpu(in->e_id));
+ p += sizeof(bch_acl_entry);
+ break;
+ case ACL_GROUP:
+ out->e_gid = make_kgid(&init_user_ns,
+ le32_to_cpu(in->e_id));
+ p += sizeof(bch_acl_entry);
+ break;
+ }
+
+ out++;
+ }
+
+ BUG_ON(out != acl->a_entries + acl->a_count);
+
+ return acl;
+invalid:
+ pr_err("invalid acl entry");
+ return ERR_PTR(-EINVAL);
+}
+
+#define acl_for_each_entry(acl, acl_e) \
+ for (acl_e = acl->a_entries; \
+ acl_e < acl->a_entries + acl->a_count; \
+ acl_e++)
+
+/*
+ * Convert from in-memory to filesystem representation.
+ */
+static struct bkey_i_xattr *
+bch2_acl_to_xattr(struct btree_trans *trans,
+ const struct posix_acl *acl,
+ int type)
+{
+ struct bkey_i_xattr *xattr;
+ bch_acl_header *acl_header;
+ const struct posix_acl_entry *acl_e;
+ void *outptr;
+ unsigned nr_short = 0, nr_long = 0, acl_len, u64s;
+
+ acl_for_each_entry(acl, acl_e) {
+ switch (acl_e->e_tag) {
+ case ACL_USER:
+ case ACL_GROUP:
+ nr_long++;
+ break;
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ nr_short++;
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ acl_len = bch2_acl_size(nr_short, nr_long);
+ u64s = BKEY_U64s + xattr_val_u64s(0, acl_len);
+
+ if (u64s > U8_MAX)
+ return ERR_PTR(-E2BIG);
+
+ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+ if (IS_ERR(xattr))
+ return xattr;
+
+ bkey_xattr_init(&xattr->k_i);
+ xattr->k.u64s = u64s;
+ xattr->v.x_type = acl_to_xattr_type(type);
+ xattr->v.x_name_len = 0;
+ xattr->v.x_val_len = cpu_to_le16(acl_len);
+
+ acl_header = xattr_val(&xattr->v);
+ acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION);
+
+ outptr = (void *) acl_header + sizeof(*acl_header);
+
+ acl_for_each_entry(acl, acl_e) {
+ bch_acl_entry *entry = outptr;
+
+ entry->e_tag = cpu_to_le16(acl_e->e_tag);
+ entry->e_perm = cpu_to_le16(acl_e->e_perm);
+ switch (acl_e->e_tag) {
+ case ACL_USER:
+ entry->e_id = cpu_to_le32(
+ from_kuid(&init_user_ns, acl_e->e_uid));
+ outptr += sizeof(bch_acl_entry);
+ break;
+ case ACL_GROUP:
+ entry->e_id = cpu_to_le32(
+ from_kgid(&init_user_ns, acl_e->e_gid));
+ outptr += sizeof(bch_acl_entry);
+ break;
+
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ outptr += sizeof(bch_acl_entry_short);
+ break;
+ }
+ }
+
+ BUG_ON(outptr != xattr_val(&xattr->v) + acl_len);
+
+ return xattr;
+}
+
+struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
+ struct dentry *dentry, int type)
+{
+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
+ struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter = { NULL };
+ struct bkey_s_c_xattr xattr;
+ struct posix_acl *acl = NULL;
+ struct bkey_s_c k;
+ int ret;
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
+ &hash, inode_inum(inode), &search, 0);
+ if (ret) {
+ if (!bch2_err_matches(ret, ENOENT))
+ acl = ERR_PTR(ret);
+ goto out;
+ }
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret) {
+ acl = ERR_PTR(ret);
+ goto out;
+ }
+
+ xattr = bkey_s_c_to_xattr(k);
+ acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
+ le16_to_cpu(xattr.v->x_val_len));
+
+ if (!IS_ERR(acl))
+ set_cached_acl(&inode->v, type, acl);
+out:
+ if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return acl;
+}
+
+int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
+ struct bch_inode_unpacked *inode_u,
+ struct posix_acl *acl, int type)
+{
+ struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u);
+ int ret;
+
+ if (type == ACL_TYPE_DEFAULT &&
+ !S_ISDIR(inode_u->bi_mode))
+ return acl ? -EACCES : 0;
+
+ if (acl) {
+ struct bkey_i_xattr *xattr =
+ bch2_acl_to_xattr(trans, acl, type);
+ if (IS_ERR(xattr))
+ return PTR_ERR(xattr);
+
+ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info,
+ inum, &xattr->k_i, 0);
+ } else {
+ struct xattr_search_key search =
+ X_SEARCH(acl_to_xattr_type(type), "", 0);
+
+ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info,
+ inum, &search);
+ }
+
+ return bch2_err_matches(ret, ENOENT) ? 0 : ret;
+}
+
+int bch2_set_acl(struct mnt_idmap *idmap,
+ struct dentry *dentry,
+ struct posix_acl *_acl, int type)
+{
+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter inode_iter = { NULL };
+ struct bch_inode_unpacked inode_u;
+ struct posix_acl *acl;
+ umode_t mode;
+ int ret;
+
+ mutex_lock(&inode->ei_update_lock);
+retry:
+ bch2_trans_begin(trans);
+ acl = _acl;
+
+ ret = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?:
+ bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto btree_err;
+
+ mode = inode_u.bi_mode;
+
+ if (type == ACL_TYPE_ACCESS) {
+ ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl);
+ if (ret)
+ goto btree_err;
+ }
+
+ ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type);
+ if (ret)
+ goto btree_err;
+
+ inode_u.bi_ctime = bch2_current_time(c);
+ inode_u.bi_mode = mode;
+
+ ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+btree_err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+ if (unlikely(ret))
+ goto err;
+
+ bch2_inode_update_after_write(trans, inode, &inode_u,
+ ATTR_CTIME|ATTR_MODE);
+
+ set_cached_acl(&inode->v, type, acl);
+err:
+ mutex_unlock(&inode->ei_update_lock);
+ bch2_trans_put(trans);
+
+ return ret;
+}
+
+int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
+ struct bch_inode_unpacked *inode,
+ umode_t mode,
+ struct posix_acl **new_acl)
+{
+ struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode);
+ struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0);
+ struct btree_iter iter;
+ struct bkey_s_c_xattr xattr;
+ struct bkey_i_xattr *new;
+ struct posix_acl *acl = NULL;
+ struct bkey_s_c k;
+ int ret;
+
+ ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
+ &hash_info, inum, &search, BTREE_ITER_INTENT);
+ if (ret)
+ return bch2_err_matches(ret, ENOENT) ? 0 : ret;
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+ xattr = bkey_s_c_to_xattr(k);
+
+ acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
+ le16_to_cpu(xattr.v->x_val_len));
+ ret = PTR_ERR_OR_ZERO(acl);
+ if (IS_ERR_OR_NULL(acl))
+ goto err;
+
+ ret = allocate_dropping_locks_errcode(trans,
+ __posix_acl_chmod(&acl, _gfp, mode));
+ if (ret)
+ goto err;
+
+ new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
+ if (IS_ERR(new)) {
+ ret = PTR_ERR(new);
+ goto err;
+ }
+
+ new->k.p = iter.pos;
+ ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
+ *new_acl = acl;
+ acl = NULL;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ if (!IS_ERR_OR_NULL(acl))
+ kfree(acl);
+ return ret;
+}
+
+#endif /* CONFIG_BCACHEFS_POSIX_ACL */
diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h
new file mode 100644
index 000000000000..27e7eec0f278
--- /dev/null
+++ b/fs/bcachefs/acl.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ACL_H
+#define _BCACHEFS_ACL_H
+
+struct bch_inode_unpacked;
+struct bch_hash_info;
+struct bch_inode_info;
+struct posix_acl;
+
+#define BCH_ACL_VERSION 0x0001
+
+typedef struct {
+ __le16 e_tag;
+ __le16 e_perm;
+ __le32 e_id;
+} bch_acl_entry;
+
+typedef struct {
+ __le16 e_tag;
+ __le16 e_perm;
+} bch_acl_entry_short;
+
+typedef struct {
+ __le32 a_version;
+} bch_acl_header;
+
+void bch2_acl_to_text(struct printbuf *, const void *, size_t);
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+
+struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int);
+
+int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *,
+ struct posix_acl *, int);
+int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int);
+int bch2_acl_chmod(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *,
+ umode_t, struct posix_acl **);
+
+#else
+
+static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
+ struct bch_inode_unpacked *inode_u,
+ struct posix_acl *acl, int type)
+{
+ return 0;
+}
+
+static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
+ struct bch_inode_unpacked *inode,
+ umode_t mode,
+ struct posix_acl **new_acl)
+{
+ return 0;
+}
+
+#endif /* CONFIG_BCACHEFS_POSIX_ACL */
+
+#endif /* _BCACHEFS_ACL_H */
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
new file mode 100644
index 000000000000..1fec0e67891f
--- /dev/null
+++ b/fs/bcachefs/alloc_background.c
@@ -0,0 +1,2159 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "backpointers.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_gc.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
+#include "clock.h"
+#include "debug.h"
+#include "ec.h"
+#include "error.h"
+#include "lru.h"
+#include "recovery.h"
+#include "trace.h"
+#include "varint.h"
+
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/sched/task.h>
+#include <linux/sort.h>
+
+/* Persistent alloc info: */
+
+static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
+#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
+ BCH_ALLOC_FIELDS_V1()
+#undef x
+};
+
+struct bkey_alloc_unpacked {
+ u64 journal_seq;
+ u8 gen;
+ u8 oldest_gen;
+ u8 data_type;
+ bool need_discard:1;
+ bool need_inc_gen:1;
+#define x(_name, _bits) u##_bits _name;
+ BCH_ALLOC_FIELDS_V2()
+#undef x
+};
+
+static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
+ const void **p, unsigned field)
+{
+ unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
+ u64 v;
+
+ if (!(a->fields & (1 << field)))
+ return 0;
+
+ switch (bytes) {
+ case 1:
+ v = *((const u8 *) *p);
+ break;
+ case 2:
+ v = le16_to_cpup(*p);
+ break;
+ case 4:
+ v = le32_to_cpup(*p);
+ break;
+ case 8:
+ v = le64_to_cpup(*p);
+ break;
+ default:
+ BUG();
+ }
+
+ *p += bytes;
+ return v;
+}
+
+static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
+ struct bkey_s_c k)
+{
+ const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
+ const void *d = in->data;
+ unsigned idx = 0;
+
+ out->gen = in->gen;
+
+#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
+ BCH_ALLOC_FIELDS_V1()
+#undef x
+}
+
+static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
+ const u8 *in = a.v->data;
+ const u8 *end = bkey_val_end(a);
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v;
+
+ out->gen = a.v->gen;
+ out->oldest_gen = a.v->oldest_gen;
+ out->data_type = a.v->data_type;
+
+#define x(_name, _bits) \
+ if (fieldnr < a.v->nr_fields) { \
+ ret = bch2_varint_decode_fast(in, end, &v); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v = 0; \
+ } \
+ out->_name = v; \
+ if (v != out->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_ALLOC_FIELDS_V2()
+#undef x
+ return 0;
+}
+
+static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
+ const u8 *in = a.v->data;
+ const u8 *end = bkey_val_end(a);
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v;
+
+ out->gen = a.v->gen;
+ out->oldest_gen = a.v->oldest_gen;
+ out->data_type = a.v->data_type;
+ out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
+ out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
+ out->journal_seq = le64_to_cpu(a.v->journal_seq);
+
+#define x(_name, _bits) \
+ if (fieldnr < a.v->nr_fields) { \
+ ret = bch2_varint_decode_fast(in, end, &v); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v = 0; \
+ } \
+ out->_name = v; \
+ if (v != out->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_ALLOC_FIELDS_V2()
+#undef x
+ return 0;
+}
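
The per-field "if (v != out->_name) return -1;" check in the two unpack helpers above catches decoded values that do not fit their (narrower) destination field. A standalone sketch of the same technique, using a hypothetical u8 field:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t v = 300;
		uint8_t field = v;	/* narrower destination, as in the unpack macros */

		/* assigning and comparing back detects truncation: 300 -> 44 */
		if (v != field)
			printf("overflow detected: %llu does not fit\n",
			       (unsigned long long)v);
		return 0;
	}
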
+
+static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
+{
+ struct bkey_alloc_unpacked ret = { .gen = 0 };
+
+ switch (k.k->type) {
+ case KEY_TYPE_alloc:
+ bch2_alloc_unpack_v1(&ret, k);
+ break;
+ case KEY_TYPE_alloc_v2:
+ bch2_alloc_unpack_v2(&ret, k);
+ break;
+ case KEY_TYPE_alloc_v3:
+ bch2_alloc_unpack_v3(&ret, k);
+ break;
+ }
+
+ return ret;
+}
+
+static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
+{
+ unsigned i, bytes = offsetof(struct bch_alloc, data);
+
+ for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
+ if (a->fields & (1 << i))
+ bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
+
+ return DIV_ROUND_UP(bytes, sizeof(u64));
+}
+
+int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+ int ret = 0;
+
+ /* allow for unknown fields */
+ bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err,
+ alloc_v1_val_size_bad,
+ "incorrect value size (%zu < %u)",
+ bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
+fsck_err:
+ return ret;
+}
+
+int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_alloc_unpacked u;
+ int ret = 0;
+
+ bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err,
+ alloc_v2_unpack_error,
+ "unpack error");
+fsck_err:
+ return ret;
+}
+
+int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_alloc_unpacked u;
+ int ret = 0;
+
+ bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err,
+ alloc_v2_unpack_error,
+ "unpack error");
+fsck_err:
+ return ret;
+}
+
+int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags, struct printbuf *err)
+{
+ struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
+ int ret = 0;
+
+ bkey_fsck_err_on(alloc_v4_u64s(a.v) > bkey_val_u64s(k.k), c, err,
+ alloc_v4_val_size_bad,
+ "bad val size (%u > %zu)",
+ alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
+
+ bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
+ BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err,
+ alloc_v4_backpointers_start_bad,
+ "invalid backpointers_start");
+
+ bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err,
+ alloc_key_data_type_bad,
+ "invalid data type (got %u should be %u)",
+ a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
+
+ switch (a.v->data_type) {
+ case BCH_DATA_free:
+ case BCH_DATA_need_gc_gens:
+ case BCH_DATA_need_discard:
+ bkey_fsck_err_on(a.v->dirty_sectors ||
+ a.v->cached_sectors ||
+ a.v->stripe, c, err,
+ alloc_key_empty_but_have_data,
+ "empty data type free but have data");
+ break;
+ case BCH_DATA_sb:
+ case BCH_DATA_journal:
+ case BCH_DATA_btree:
+ case BCH_DATA_user:
+ case BCH_DATA_parity:
+ bkey_fsck_err_on(!a.v->dirty_sectors, c, err,
+ alloc_key_dirty_sectors_0,
+ "data_type %s but dirty_sectors==0",
+ bch2_data_types[a.v->data_type]);
+ break;
+ case BCH_DATA_cached:
+ bkey_fsck_err_on(!a.v->cached_sectors ||
+ a.v->dirty_sectors ||
+ a.v->stripe, c, err,
+ alloc_key_cached_inconsistency,
+ "data type inconsistency");
+
+ bkey_fsck_err_on(!a.v->io_time[READ] &&
+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
+ c, err,
+ alloc_key_cached_but_read_time_zero,
+ "cached bucket with read_time == 0");
+ break;
+ case BCH_DATA_stripe:
+ break;
+ }
+fsck_err:
+ return ret;
+}
+
+void bch2_alloc_v4_swab(struct bkey_s k)
+{
+ struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
+ struct bch_backpointer *bp, *bps;
+
+ a->journal_seq = swab64(a->journal_seq);
+ a->flags = swab32(a->flags);
+ a->dirty_sectors = swab32(a->dirty_sectors);
+ a->cached_sectors = swab32(a->cached_sectors);
+ a->io_time[0] = swab64(a->io_time[0]);
+ a->io_time[1] = swab64(a->io_time[1]);
+ a->stripe = swab32(a->stripe);
+ a->nr_external_backpointers = swab32(a->nr_external_backpointers);
+ a->fragmentation_lru = swab64(a->fragmentation_lru);
+
+ bps = alloc_v4_backpointers(a);
+ for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
+ bp->bucket_offset = swab40(bp->bucket_offset);
+ bp->bucket_len = swab32(bp->bucket_len);
+ bch2_bpos_swab(&bp->pos);
+ }
+}
+
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bch_alloc_v4 _a;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
+ unsigned i;
+
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "gen %u oldest_gen %u data_type %s",
+ a->gen, a->oldest_gen,
+ a->data_type < BCH_DATA_NR
+ ? bch2_data_types[a->data_type]
+ : "(invalid data type)");
+ prt_newline(out);
+ prt_printf(out, "journal_seq %llu", a->journal_seq);
+ prt_newline(out);
+ prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a));
+ prt_newline(out);
+ prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a));
+ prt_newline(out);
+ prt_printf(out, "dirty_sectors %u", a->dirty_sectors);
+ prt_newline(out);
+ prt_printf(out, "cached_sectors %u", a->cached_sectors);
+ prt_newline(out);
+ prt_printf(out, "stripe %u", a->stripe);
+ prt_newline(out);
+ prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy);
+ prt_newline(out);
+ prt_printf(out, "io_time[READ] %llu", a->io_time[READ]);
+ prt_newline(out);
+ prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]);
+ prt_newline(out);
+ prt_printf(out, "fragmentation %llu", a->fragmentation_lru);
+ prt_newline(out);
+ prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
+ prt_newline(out);
+
+ if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) {
+ struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k);
+ const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v);
+
+ prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v));
+ printbuf_indent_add(out, 2);
+
+ for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) {
+ prt_newline(out);
+ bch2_backpointer_to_text(out, &bps[i]);
+ }
+
+ printbuf_indent_sub(out, 2);
+ }
+
+ printbuf_indent_sub(out, 2);
+}
+
+void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
+{
+ if (k.k->type == KEY_TYPE_alloc_v4) {
+ void *src, *dst;
+
+ *out = *bkey_s_c_to_alloc_v4(k).v;
+
+ src = alloc_v4_backpointers(out);
+ SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
+ dst = alloc_v4_backpointers(out);
+
+ if (src < dst)
+ memset(src, 0, dst - src);
+
+ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
+ } else {
+ struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
+
+ *out = (struct bch_alloc_v4) {
+ .journal_seq = u.journal_seq,
+ .flags = u.need_discard,
+ .gen = u.gen,
+ .oldest_gen = u.oldest_gen,
+ .data_type = u.data_type,
+ .stripe_redundancy = u.stripe_redundancy,
+ .dirty_sectors = u.dirty_sectors,
+ .cached_sectors = u.cached_sectors,
+ .io_time[READ] = u.read_time,
+ .io_time[WRITE] = u.write_time,
+ .stripe = u.stripe,
+ };
+
+ SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
+ }
+}
+
+static noinline struct bkey_i_alloc_v4 *
+__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bkey_i_alloc_v4 *ret;
+
+ ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4)));
+ if (IS_ERR(ret))
+ return ret;
+
+ if (k.k->type == KEY_TYPE_alloc_v4) {
+ void *src, *dst;
+
+ bkey_reassemble(&ret->k_i, k);
+
+ src = alloc_v4_backpointers(&ret->v);
+ SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
+ dst = alloc_v4_backpointers(&ret->v);
+
+ if (src < dst)
+ memset(src, 0, dst - src);
+
+ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
+ set_alloc_v4_u64s(ret);
+ } else {
+ bkey_alloc_v4_init(&ret->k_i);
+ ret->k.p = k.k->p;
+ bch2_alloc_to_v4(k, &ret->v);
+ }
+ return ret;
+}
+
+static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bkey_s_c_alloc_v4 a;
+
+ if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
+ ((a = bkey_s_c_to_alloc_v4(k), true) &&
+ BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0))
+ return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4);
+
+ return __bch2_alloc_to_v4_mut(trans, k);
+}
+
+struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
+{
+ return bch2_alloc_to_v4_mut_inlined(trans, k);
+}
+
+struct bkey_i_alloc_v4 *
+bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos pos)
+{
+ struct bkey_s_c k;
+ struct bkey_i_alloc_v4 *a;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
+ BTREE_ITER_WITH_UPDATES|
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ ret = bkey_err(k);
+ if (unlikely(ret))
+ return ERR_PTR(ret);
+
+ a = bch2_alloc_to_v4_mut_inlined(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (unlikely(ret))
+ goto err;
+ return a;
+err:
+ bch2_trans_iter_exit(trans, iter);
+ return ERR_PTR(ret);
+}
+
+static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
+{
+ *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;
+
+ pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
+ return pos;
+}
+
+static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
+{
+ pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
+ pos.offset += offset;
+ return pos;
+}
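
alloc_gens_pos() and bucket_gens_pos_to_alloc() above split a bucket number into a bucket_gens key position plus an index into that key's gens array, and invert the split. A round-trip sketch; the shift width of 8 here is an assumption standing in for KEY_TYPE_BUCKET_GENS_BITS:

	#include <assert.h>
	#include <stdint.h>

	#define GENS_BITS	8			/* assumed for illustration */
	#define GENS_MASK	((1U << GENS_BITS) - 1)

	int main(void)
	{
		uint64_t bucket = 123456;
		unsigned offset = bucket & GENS_MASK;	/* slot within the gens array */
		uint64_t key_off = bucket >> GENS_BITS;	/* bucket_gens key offset */

		/* the two conversions invert each other */
		assert(((key_off << GENS_BITS) + offset) == bucket);
		return 0;
	}
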
+
+static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
+{
+ return k.k->type == KEY_TYPE_bucket_gens
+ ? bkey_s_c_to_bucket_gens(k).v->gens[offset]
+ : 0;
+}
+
+int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err,
+ bucket_gens_val_size_bad,
+ "bad val size (%zu != %zu)",
+ bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
+fsck_err:
+ return ret;
+}
+
+void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
+ if (i)
+ prt_char(out, ' ');
+ prt_printf(out, "%u", g.v->gens[i]);
+ }
+}
+
+int bch2_bucket_gens_init(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_alloc_v4 a;
+ struct bkey_i_bucket_gens g;
+ bool have_bucket_gens_key = false;
+ unsigned offset;
+ struct bpos pos;
+ u8 gen;
+ int ret;
+
+ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ /*
+ * Not a fsck error because this is checked/repaired by
+ * bch2_check_alloc_key() which runs later:
+ */
+ if (!bch2_dev_bucket_exists(c, k.k->p))
+ continue;
+
+ gen = bch2_alloc_to_v4(k, &a)->gen;
+ pos = alloc_gens_pos(iter.pos, &offset);
+
+ if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
+ if (ret)
+ break;
+ have_bucket_gens_key = false;
+ }
+
+ if (!have_bucket_gens_key) {
+ bkey_bucket_gens_init(&g.k_i);
+ g.k.p = pos;
+ have_bucket_gens_key = true;
+ }
+
+ g.v.gens[offset] = gen;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (have_bucket_gens_key && !ret)
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
+
+ bch2_trans_put(trans);
+
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_alloc_read(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_dev *ca;
+ int ret;
+
+ down_read(&c->gc_lock);
+
+ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
+ const struct bch_bucket_gens *g;
+ u64 b;
+
+ for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
+ u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
+
+ if (k.k->type != KEY_TYPE_bucket_gens)
+ continue;
+
+ g = bkey_s_c_to_bucket_gens(k).v;
+
+ /*
+ * Not a fsck error because this is checked/repaired by
+ * bch2_check_alloc_key() which runs later:
+ */
+ if (!bch2_dev_exists2(c, k.k->p.inode))
+ continue;
+
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+
+ for (b = max_t(u64, ca->mi.first_bucket, start);
+ b < min_t(u64, ca->mi.nbuckets, end);
+ b++)
+ *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ } else {
+ struct bch_alloc_v4 a;
+
+ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ /*
+ * Not a fsck error because this is checked/repaired by
+ * bch2_check_alloc_key() which runs later:
+ */
+ if (!bch2_dev_bucket_exists(c, k.k->p))
+ continue;
+
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+
+ *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ bch2_trans_put(trans);
+ up_read(&c->gc_lock);
+
+ if (ret)
+ bch_err_fn(c, ret);
+
+ return ret;
+}
+
+/* Free space/discard btree: */
+
+static int bch2_bucket_do_index(struct btree_trans *trans,
+ struct bkey_s_c alloc_k,
+ const struct bch_alloc_v4 *a,
+ bool set)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
+ struct btree_iter iter;
+ struct bkey_s_c old;
+ struct bkey_i *k;
+ enum btree_id btree;
+ enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ if (a->data_type != BCH_DATA_free &&
+ a->data_type != BCH_DATA_need_discard)
+ return 0;
+
+ k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
+ if (IS_ERR(k))
+ return PTR_ERR(k);
+
+ bkey_init(&k->k);
+ k->k.type = new_type;
+
+ switch (a->data_type) {
+ case BCH_DATA_free:
+ btree = BTREE_ID_freespace;
+ k->k.p = alloc_freespace_pos(alloc_k.k->p, *a);
+ bch2_key_resize(&k->k, 1);
+ break;
+ case BCH_DATA_need_discard:
+ btree = BTREE_ID_need_discard;
+ k->k.p = alloc_k.k->p;
+ break;
+ default:
+ return 0;
+ }
+
+ old = bch2_bkey_get_iter(trans, &iter, btree,
+ bkey_start_pos(&k->k),
+ BTREE_ITER_INTENT);
+ ret = bkey_err(old);
+ if (ret)
+ return ret;
+
+ if (ca->mi.freespace_initialized &&
+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info &&
+ bch2_trans_inconsistent_on(old.k->type != old_type, trans,
+ "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
+ " for %s",
+ set ? "setting" : "clearing",
+ bch2_btree_id_str(btree),
+ iter.pos.inode,
+ iter.pos.offset,
+ bch2_bkey_types[old.k->type],
+ bch2_bkey_types[old_type],
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ ret = -EIO;
+ goto err;
+ }
+
+ ret = bch2_trans_update(trans, &iter, k, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
+ struct bpos bucket, u8 gen)
+{
+ struct btree_iter iter;
+ unsigned offset;
+ struct bpos pos = alloc_gens_pos(bucket, &offset);
+ struct bkey_i_bucket_gens *g;
+ struct bkey_s_c k;
+ int ret;
+
+ g = bch2_trans_kmalloc(trans, sizeof(*g));
+ ret = PTR_ERR_OR_ZERO(g);
+ if (ret)
+ return ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_WITH_UPDATES);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (k.k->type != KEY_TYPE_bucket_gens) {
+ bkey_bucket_gens_init(&g->k_i);
+ g->k.p = iter.pos;
+ } else {
+ bkey_reassemble(&g->k_i, k);
+ }
+
+ g->v.gens[offset] = gen;
+
+ ret = bch2_trans_update(trans, &iter, &g->k_i, 0);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_trans_mark_alloc(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_alloc_v4 old_a_convert, *new_a;
+ const struct bch_alloc_v4 *old_a;
+ u64 old_lru, new_lru;
+ int ret = 0;
+
+ /*
+ * Deletion only happens in the device removal path, with
+ * BTREE_TRIGGER_NORUN:
+ */
+ BUG_ON(new->k.type != KEY_TYPE_alloc_v4);
+
+ old_a = bch2_alloc_to_v4(old, &old_a_convert);
+ new_a = &bkey_i_to_alloc_v4(new)->v;
+
+ new_a->data_type = alloc_data_type(*new_a, new_a->data_type);
+
+ if (new_a->dirty_sectors > old_a->dirty_sectors ||
+ new_a->cached_sectors > old_a->cached_sectors) {
+ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+ new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
+ SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
+ }
+
+ if (data_type_is_empty(new_a->data_type) &&
+ BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
+ !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) {
+ new_a->gen++;
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
+ }
+
+ if (old_a->data_type != new_a->data_type ||
+ (new_a->data_type == BCH_DATA_free &&
+ alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
+ ret = bch2_bucket_do_index(trans, old, old_a, false) ?:
+ bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true);
+ if (ret)
+ return ret;
+ }
+
+ if (new_a->data_type == BCH_DATA_cached &&
+ !new_a->io_time[READ])
+ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+
+ old_lru = alloc_lru_idx_read(*old_a);
+ new_lru = alloc_lru_idx_read(*new_a);
+
+ if (old_lru != new_lru) {
+ ret = bch2_lru_change(trans, new->k.p.inode,
+ bucket_to_u64(new->k.p),
+ old_lru, new_lru);
+ if (ret)
+ return ret;
+ }
+
+ new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a,
+ bch_dev_bkey_exists(c, new->k.p.inode));
+
+ if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
+ ret = bch2_lru_change(trans,
+ BCH_LRU_FRAGMENTATION_START,
+ bucket_to_u64(new->k.p),
+ old_a->fragmentation_lru, new_a->fragmentation_lru);
+ if (ret)
+ return ret;
+ }
+
+ if (old_a->gen != new_a->gen) {
+ ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for
+ * extents-style btrees, but works on non-extents btrees:
+ */
+static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
+{
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+
+ if (bkey_err(k))
+ return k;
+
+ if (k.k->type) {
+ return k;
+ } else {
+ struct btree_iter iter2;
+ struct bpos next;
+
+ bch2_trans_copy_iter(&iter2, iter);
+
+ if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX))
+ end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p));
+
+ end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));
+
+ /*
+		 * btree node min/max is a closed interval; peek_upto takes a
+		 * half-open interval:
+ */
+ k = bch2_btree_iter_peek_upto(&iter2, end);
+ next = iter2.pos;
+ bch2_trans_iter_exit(iter->trans, &iter2);
+
+ BUG_ON(next.offset >= iter->pos.offset + U32_MAX);
+
+ if (bkey_err(k))
+ return k;
+
+ bkey_init(hole);
+ hole->p = iter->pos;
+
+ bch2_key_resize(hole, next.offset - iter->pos.offset);
+ return (struct bkey_s_c) { hole, NULL };
+ }
+}
+
+static bool next_bucket(struct bch_fs *c, struct bpos *bucket)
+{
+ struct bch_dev *ca;
+ unsigned iter;
+
+ if (bch2_dev_bucket_exists(c, *bucket))
+ return true;
+
+ if (bch2_dev_exists2(c, bucket->inode)) {
+ ca = bch_dev_bkey_exists(c, bucket->inode);
+
+ if (bucket->offset < ca->mi.first_bucket) {
+ bucket->offset = ca->mi.first_bucket;
+ return true;
+ }
+
+ bucket->inode++;
+ bucket->offset = 0;
+ }
+
+ rcu_read_lock();
+ iter = bucket->inode;
+ ca = __bch2_next_dev(c, &iter, NULL);
+ if (ca)
+ *bucket = POS(ca->dev_idx, ca->mi.first_bucket);
+ rcu_read_unlock();
+
+ return ca != NULL;
+}
+
+static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole)
+{
+ struct bch_fs *c = iter->trans->c;
+ struct bkey_s_c k;
+again:
+ k = bch2_get_key_or_hole(iter, POS_MAX, hole);
+ if (bkey_err(k))
+ return k;
+
+ if (!k.k->type) {
+ struct bpos bucket = bkey_start_pos(k.k);
+
+ if (!bch2_dev_bucket_exists(c, bucket)) {
+ if (!next_bucket(c, &bucket))
+ return bkey_s_c_null;
+
+ bch2_btree_iter_set_pos(iter, bucket);
+ goto again;
+ }
+
+ if (!bch2_dev_bucket_exists(c, k.k->p)) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
+
+ bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset);
+ }
+ }
+
+ return k;
+}
+
+static noinline_for_stack
+int bch2_check_alloc_key(struct btree_trans *trans,
+ struct bkey_s_c alloc_k,
+ struct btree_iter *alloc_iter,
+ struct btree_iter *discard_iter,
+ struct btree_iter *freespace_iter,
+ struct btree_iter *bucket_gens_iter)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ unsigned discard_key_type, freespace_key_type;
+ unsigned gens_offset;
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c,
+ alloc_key_to_missing_dev_bucket,
+ "alloc key for invalid device:bucket %llu:%llu",
+ alloc_k.k->p.inode, alloc_k.k->p.offset))
+ return bch2_btree_delete_at(trans, alloc_iter, 0);
+
+ ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
+ if (!ca->mi.freespace_initialized)
+ return 0;
+
+ a = bch2_alloc_to_v4(alloc_k, &a_convert);
+
+ discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0;
+ bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
+ k = bch2_btree_iter_peek_slot(discard_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != discard_key_type &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, need_discard_key_wrong,
+ "incorrect key in need_discard btree (got %s should be %s)\n"
+ " %s",
+ bch2_bkey_types[k.k->type],
+ bch2_bkey_types[discard_key_type],
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ struct bkey_i *update =
+ bch2_trans_kmalloc(trans, sizeof(*update));
+
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ bkey_init(&update->k);
+ update->k.type = discard_key_type;
+ update->k.p = discard_iter->pos;
+
+ ret = bch2_trans_update(trans, discard_iter, update, 0);
+ if (ret)
+ goto err;
+ }
+
+ freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0;
+ bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
+ k = bch2_btree_iter_peek_slot(freespace_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != freespace_key_type &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, freespace_key_wrong,
+ "incorrect key in freespace btree (got %s should be %s)\n"
+ " %s",
+ bch2_bkey_types[k.k->type],
+ bch2_bkey_types[freespace_key_type],
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ struct bkey_i *update =
+ bch2_trans_kmalloc(trans, sizeof(*update));
+
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ bkey_init(&update->k);
+ update->k.type = freespace_key_type;
+ update->k.p = freespace_iter->pos;
+ bch2_key_resize(&update->k, 1);
+
+ ret = bch2_trans_update(trans, freespace_iter, update, 0);
+ if (ret)
+ goto err;
+ }
+
+ bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
+ k = bch2_btree_iter_peek_slot(bucket_gens_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (a->gen != alloc_gen(k, gens_offset) &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, bucket_gens_key_wrong,
+ "incorrect gen in bucket_gens btree (got %u should be %u)\n"
+ " %s",
+ alloc_gen(k, gens_offset), a->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+ struct bkey_i_bucket_gens *g =
+ bch2_trans_kmalloc(trans, sizeof(*g));
+
+ ret = PTR_ERR_OR_ZERO(g);
+ if (ret)
+ goto err;
+
+ if (k.k->type == KEY_TYPE_bucket_gens) {
+ bkey_reassemble(&g->k_i, k);
+ } else {
+ bkey_bucket_gens_init(&g->k_i);
+ g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
+ }
+
+ g->v.gens[gens_offset] = a->gen;
+
+ ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
+ if (ret)
+ goto err;
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static noinline_for_stack
+int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
+ struct bpos start,
+ struct bpos *end,
+ struct btree_iter *freespace_iter)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca;
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ ca = bch_dev_bkey_exists(c, start.inode);
+ if (!ca->mi.freespace_initialized)
+ return 0;
+
+ bch2_btree_iter_set_pos(freespace_iter, start);
+
+ k = bch2_btree_iter_peek_slot(freespace_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ *end = bkey_min(k.k->p, *end);
+
+ if (k.k->type != KEY_TYPE_set &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, freespace_hole_missing,
+ "hole in alloc btree missing in freespace btree\n"
+ " device %llu buckets %llu-%llu",
+ freespace_iter->pos.inode,
+ freespace_iter->pos.offset,
+ end->offset))) {
+ struct bkey_i *update =
+ bch2_trans_kmalloc(trans, sizeof(*update));
+
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ bkey_init(&update->k);
+ update->k.type = KEY_TYPE_set;
+ update->k.p = freespace_iter->pos;
+ bch2_key_resize(&update->k,
+ min_t(u64, U32_MAX, end->offset -
+ freespace_iter->pos.offset));
+
+ ret = bch2_trans_update(trans, freespace_iter, update, 0);
+ if (ret)
+ goto err;
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static noinline_for_stack
+int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
+ struct bpos start,
+ struct bpos *end,
+ struct btree_iter *bucket_gens_iter)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ unsigned i, gens_offset, gens_end_offset;
+ int ret;
+
+ if (c->sb.version < bcachefs_metadata_version_bucket_gens)
+ return 0;
+
+ bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
+
+ k = bch2_btree_iter_peek_slot(bucket_gens_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
+ alloc_gens_pos(*end, &gens_end_offset)))
+ gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;
+
+ if (k.k->type == KEY_TYPE_bucket_gens) {
+ struct bkey_i_bucket_gens g;
+ bool need_update = false;
+
+ bkey_reassemble(&g.k_i, k);
+
+ for (i = gens_offset; i < gens_end_offset; i++) {
+ if (fsck_err_on(g.v.gens[i], c,
+ bucket_gens_hole_wrong,
+ "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
+ bucket_gens_pos_to_alloc(k.k->p, i).inode,
+ bucket_gens_pos_to_alloc(k.k->p, i).offset,
+ g.v.gens[i])) {
+ g.v.gens[i] = 0;
+ need_update = true;
+ }
+ }
+
+ if (need_update) {
+ struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
+
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto err;
+
+ memcpy(u, &g, sizeof(g));
+
+ ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
+ if (ret)
+ goto err;
+ }
+ }
+
+ *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter alloc_iter;
+ struct bkey_s_c alloc_k;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ u64 genbits;
+ struct bpos pos;
+ enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
+ ? BCH_DATA_need_discard
+ : BCH_DATA_free;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ pos = iter->pos;
+ pos.offset &= ~(~0ULL << 56);
+ genbits = iter->pos.offset & (~0ULL << 56);
+
+ alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
+ ret = bkey_err(alloc_k);
+ if (ret)
+ return ret;
+
+ if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
+ need_discard_freespace_key_to_invalid_dev_bucket,
+ "entry in %s btree for nonexistant dev:bucket %llu:%llu",
+ bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset))
+ goto delete;
+
+ a = bch2_alloc_to_v4(alloc_k, &a_convert);
+
+ if (fsck_err_on(a->data_type != state ||
+ (state == BCH_DATA_free &&
+ genbits != alloc_freespace_genbits(*a)), c,
+ need_discard_freespace_key_bad,
+ "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
+ bch2_btree_id_str(iter->btree_id),
+ iter->pos.inode,
+ iter->pos.offset,
+ a->data_type == state,
+ genbits >> 56, alloc_freespace_genbits(*a) >> 56))
+ goto delete;
+out:
+fsck_err:
+ set_btree_iter_dontneed(&alloc_iter);
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ printbuf_exit(&buf);
+ return ret;
+delete:
+ ret = bch2_btree_delete_extent_at(trans, iter,
+ iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW);
+ goto out;
+}
+
+static int bch2_check_discard_freespace_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos end)
+{
+ if (!btree_id_is_extents(iter->btree_id)) {
+ return __bch2_check_discard_freespace_key(trans, iter);
+ } else {
+ int ret = 0;
+
+ while (!bkey_eq(iter->pos, end) &&
+ !(ret = btree_trans_too_many_iters(trans) ?:
+ __bch2_check_discard_freespace_key(trans, iter)))
+ bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
+
+ return ret;
+ }
+}
+
+/*
+ * We've already checked that generation numbers in the bucket_gens btree are
+ * valid for buckets that exist; this just checks for keys for nonexistent
+ * buckets.
+ */
+static noinline_for_stack
+int bch2_check_bucket_gens_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_bucket_gens g;
+ struct bch_dev *ca;
+ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
+ u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
+ u64 b;
+ bool need_update = false, dev_exists;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
+ bkey_reassemble(&g.k_i, k);
+
+ /* if no bch_dev, skip out whether we repair or not */
+ dev_exists = bch2_dev_exists2(c, k.k->p.inode);
+ if (!dev_exists) {
+ if (fsck_err_on(!dev_exists, c,
+ bucket_gens_to_invalid_dev,
+ "bucket_gens key for invalid device:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter, 0);
+ }
+ goto out;
+ }
+
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ if (fsck_err_on(end <= ca->mi.first_bucket ||
+ start >= ca->mi.nbuckets, c,
+ bucket_gens_to_invalid_buckets,
+ "bucket_gens key for invalid buckets:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter, 0);
+ goto out;
+ }
+
+ for (b = start; b < ca->mi.first_bucket; b++)
+ if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
+ bucket_gens_nonzero_for_invalid_buckets,
+ "bucket_gens key has nonzero gen for invalid bucket")) {
+ g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
+ need_update = true;
+ }
+
+ for (b = ca->mi.nbuckets; b < end; b++)
+ if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
+ bucket_gens_nonzero_for_invalid_buckets,
+ "bucket_gens key has nonzero gen for invalid bucket")) {
+ g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
+ need_update = true;
+ }
+
+ if (need_update) {
+ struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
+
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto out;
+
+ memcpy(u, &g, sizeof(g));
+ ret = bch2_trans_update(trans, iter, u, 0);
+ }
+out:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_check_alloc_info(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
+ struct bkey hole;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH);
+ bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
+ BTREE_ITER_PREFETCH);
+ bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
+ BTREE_ITER_PREFETCH);
+ bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
+ BTREE_ITER_PREFETCH);
+
+ while (1) {
+ struct bpos next;
+
+ bch2_trans_begin(trans);
+
+ k = bch2_get_key_or_real_bucket_hole(&iter, &hole);
+ ret = bkey_err(k);
+ if (ret)
+ goto bkey_err;
+
+ if (!k.k)
+ break;
+
+ if (k.k->type) {
+ next = bpos_nosnap_successor(k.k->p);
+
+ ret = bch2_check_alloc_key(trans,
+ k, &iter,
+ &discard_iter,
+ &freespace_iter,
+ &bucket_gens_iter);
+ if (ret)
+ goto bkey_err;
+ } else {
+ next = k.k->p;
+
+ ret = bch2_check_alloc_hole_freespace(trans,
+ bkey_start_pos(k.k),
+ &next,
+ &freespace_iter) ?:
+ bch2_check_alloc_hole_bucket_gens(trans,
+ bkey_start_pos(k.k),
+ &next,
+ &bucket_gens_iter);
+ if (ret)
+ goto bkey_err;
+ }
+
+ ret = bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW);
+ if (ret)
+ goto bkey_err;
+
+ bch2_btree_iter_set_pos(&iter, next);
+bkey_err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &bucket_gens_iter);
+ bch2_trans_iter_exit(trans, &freespace_iter);
+ bch2_trans_iter_exit(trans, &discard_iter);
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret < 0)
+ goto err;
+
+ ret = for_each_btree_key2(trans, iter,
+ BTREE_ID_need_discard, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
+ for_each_btree_key2(trans, iter,
+ BTREE_ID_freespace, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_bucket_gens, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ bch2_check_bucket_gens_key(trans, &iter, k));
+err:
+ bch2_trans_put(trans);
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
+ struct btree_iter *alloc_iter)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter lru_iter;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ struct bkey_s_c alloc_k, lru_k;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ alloc_k = bch2_btree_iter_peek(alloc_iter);
+ if (!alloc_k.k)
+ return 0;
+
+ ret = bkey_err(alloc_k);
+ if (ret)
+ return ret;
+
+ a = bch2_alloc_to_v4(alloc_k, &a_convert);
+
+ if (a->data_type != BCH_DATA_cached)
+ return 0;
+
+ lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
+ lru_pos(alloc_k.k->p.inode,
+ bucket_to_u64(alloc_k.k->p),
+ a->io_time[READ]), 0);
+ ret = bkey_err(lru_k);
+ if (ret)
+ return ret;
+
+ if (fsck_err_on(!a->io_time[READ], c,
+ alloc_key_cached_but_read_time_zero,
+ "cached bucket with read_time 0\n"
+ " %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
+ fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
+ alloc_key_to_missing_lru_entry,
+ "missing lru entry\n"
+ " %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ u64 read_time = a->io_time[READ] ?:
+ atomic64_read(&c->io_clock[READ].now);
+
+ ret = bch2_lru_set(trans,
+ alloc_k.k->p.inode,
+ bucket_to_u64(alloc_k.k->p),
+ read_time);
+ if (ret)
+ goto err;
+
+ if (a->io_time[READ] != read_time) {
+ struct bkey_i_alloc_v4 *a_mut =
+ bch2_alloc_to_v4_mut(trans, alloc_k);
+ ret = PTR_ERR_OR_ZERO(a_mut);
+ if (ret)
+ goto err;
+
+ a_mut->v.io_time[READ] = read_time;
+ ret = bch2_trans_update(trans, alloc_iter,
+ &a_mut->k_i, BTREE_TRIGGER_NORUN);
+ if (ret)
+ goto err;
+ }
+ }
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &lru_iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
+ POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ bch2_check_alloc_to_lru_ref(trans, &iter)));
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int bch2_discard_one_bucket(struct btree_trans *trans,
+ struct btree_iter *need_discard_iter,
+ struct bpos *discard_pos_done,
+ u64 *seen,
+ u64 *open,
+ u64 *need_journal_commit,
+ u64 *discarded)
+{
+ struct bch_fs *c = trans->c;
+ struct bpos pos = need_discard_iter->pos;
+ struct btree_iter iter = { NULL };
+ struct bkey_s_c k;
+ struct bch_dev *ca;
+ struct bkey_i_alloc_v4 *a;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ ca = bch_dev_bkey_exists(c, pos.inode);
+ if (!percpu_ref_tryget(&ca->io_ref)) {
+ bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
+ return 0;
+ }
+
+ if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
+ (*open)++;
+ goto out;
+ }
+
+ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk,
+ pos.inode, pos.offset)) {
+ (*need_journal_commit)++;
+ goto out;
+ }
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
+ need_discard_iter->pos,
+ BTREE_ITER_CACHED);
+ ret = bkey_err(k);
+ if (ret)
+ goto out;
+
+ a = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ goto out;
+
+ if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
+ a->v.gen++;
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+ goto write;
+ }
+
+ if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
+ bch2_trans_inconsistent(trans,
+ "clearing need_discard but journal_seq %llu > flushed_seq %llu\n"
+ "%s",
+ a->v.journal_seq,
+ c->journal.flushed_seq_ondisk,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ }
+ goto out;
+ }
+
+ if (a->v.data_type != BCH_DATA_need_discard) {
+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
+ bch2_trans_inconsistent(trans,
+ "bucket incorrectly set in need_discard btree\n"
+ "%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ }
+
+ goto out;
+ }
+
+ if (!bkey_eq(*discard_pos_done, iter.pos) &&
+ ca->mi.discard && !c->opts.nochanges) {
+ /*
+ * This works without any other locks because this is the only
+ * thread that removes items from the need_discard tree
+ */
+ bch2_trans_unlock(trans);
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ k.k->p.offset * ca->mi.bucket_size,
+ ca->mi.bucket_size,
+ GFP_KERNEL);
+ *discard_pos_done = iter.pos;
+
+ ret = bch2_trans_relock_notrace(trans);
+ if (ret)
+ goto out;
+ }
+
+ SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
+ a->v.data_type = alloc_data_type(a->v, a->v.data_type);
+write:
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_WATERMARK_btree|
+ BTREE_INSERT_NOFAIL);
+ if (ret)
+ goto out;
+
+ this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]);
+ (*discarded)++;
+out:
+ (*seen)++;
+ bch2_trans_iter_exit(trans, &iter);
+ percpu_ref_put(&ca->io_ref);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static void bch2_do_discards_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
+ struct bpos discard_pos_done = POS_MAX;
+ int ret;
+
+ /*
+ * We're doing the commit in bch2_discard_one_bucket instead of using
+ * for_each_btree_key_commit() so that we can increment counters after
+ * successful commit:
+ */
+ ret = bch2_trans_run(c,
+ for_each_btree_key2(trans, iter,
+ BTREE_ID_need_discard, POS_MIN, 0, k,
+ bch2_discard_one_bucket(trans, &iter, &discard_pos_done,
+ &seen,
+ &open,
+ &need_journal_commit,
+ &discarded)));
+
+ if (need_journal_commit * 2 > seen)
+ bch2_journal_flush_async(&c->journal, NULL);
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+
+ trace_discard_buckets(c, seen, open, need_journal_commit, discarded,
+ bch2_err_str(ret));
+}
+
+void bch2_do_discards(struct bch_fs *c)
+{
+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) &&
+ !queue_work(c->write_ref_wq, &c->discard_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+}
+
+static int invalidate_one_bucket(struct btree_trans *trans,
+ struct btree_iter *lru_iter,
+ struct bkey_s_c lru_k,
+ s64 *nr_to_invalidate)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter alloc_iter = { NULL };
+ struct bkey_i_alloc_v4 *a = NULL;
+ struct printbuf buf = PRINTBUF;
+ struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
+ unsigned cached_sectors;
+ int ret = 0;
+
+ if (*nr_to_invalidate <= 0)
+ return 1;
+
+ if (!bch2_dev_bucket_exists(c, bucket)) {
+ prt_str(&buf, "lru entry points to invalid bucket");
+ goto err;
+ }
+
+ if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
+ return 0;
+
+ a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ goto out;
+
+ /* We expect harmless races here due to the btree write buffer: */
+ if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
+ goto out;
+
+ BUG_ON(a->v.data_type != BCH_DATA_cached);
+
+ if (!a->v.cached_sectors)
+ bch_err(c, "invalidating empty bucket, confused");
+
+ cached_sectors = a->v.cached_sectors;
+
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+ a->v.gen++;
+ a->v.data_type = 0;
+ a->v.dirty_sectors = 0;
+ a->v.cached_sectors = 0;
+ a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
+ a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now);
+
+ ret = bch2_trans_update(trans, &alloc_iter, &a->k_i,
+ BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_WATERMARK_btree|
+ BTREE_INSERT_NOFAIL);
+ if (ret)
+ goto out;
+
+ trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
+ --*nr_to_invalidate;
+out:
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ printbuf_exit(&buf);
+ return ret;
+err:
+ prt_str(&buf, "\n lru key: ");
+ bch2_bkey_val_to_text(&buf, c, lru_k);
+
+ prt_str(&buf, "\n lru entry: ");
+ bch2_lru_pos_to_text(&buf, lru_iter->pos);
+
+ prt_str(&buf, "\n alloc key: ");
+ if (!a)
+ bch2_bpos_to_text(&buf, bucket);
+ else
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
+
+ bch_err(c, "%s", buf.buf);
+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) {
+ bch2_inconsistent_error(c);
+ ret = -EINVAL;
+ }
+
+ goto out;
+}
+
+static void bch2_do_invalidates_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
+ struct bch_dev *ca;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ unsigned i;
+ int ret = 0;
+
+ ret = bch2_btree_write_buffer_flush(trans);
+ if (ret)
+ goto err;
+
+ for_each_member_device(ca, c, i) {
+ s64 nr_to_invalidate =
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
+
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
+ lru_pos(ca->dev_idx, 0, 0),
+ lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
+ BTREE_ITER_INTENT, k,
+ invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate));
+
+ if (ret < 0) {
+ percpu_ref_put(&ca->ref);
+ break;
+ }
+ }
+err:
+ bch2_trans_put(trans);
+ bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+}
+
+void bch2_do_invalidates(struct bch_fs *c)
+{
+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) &&
+ !queue_work(c->write_ref_wq, &c->invalidate_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+}
+
+int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
+ u64 bucket_start, u64 bucket_end)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey hole;
+ struct bpos end = POS(ca->dev_idx, bucket_end);
+ struct bch_member *m;
+ unsigned long last_updated = jiffies;
+ int ret;
+
+ BUG_ON(bucket_start > bucket_end);
+ BUG_ON(bucket_end > ca->mi.nbuckets);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
+ BTREE_ITER_PREFETCH);
+ /*
+ * Scan the alloc btree for every bucket on @ca, and add buckets to the
+ * freespace/need_discard/need_gc_gens btrees as needed:
+ */
+ while (1) {
+ if (last_updated + HZ * 10 < jiffies) {
+ bch_info(ca, "%s: currently at %llu/%llu",
+ __func__, iter.pos.offset, ca->mi.nbuckets);
+ last_updated = jiffies;
+ }
+
+ bch2_trans_begin(trans);
+
+ if (bkey_ge(iter.pos, end)) {
+ ret = 0;
+ break;
+ }
+
+ k = bch2_get_key_or_hole(&iter, end, &hole);
+ ret = bkey_err(k);
+ if (ret)
+ goto bkey_err;
+
+ if (k.k->type) {
+ /*
+ * We process live keys in the alloc btree one at a
+ * time:
+ */
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
+
+ ret = bch2_bucket_do_index(trans, k, a, true) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL);
+ if (ret)
+ goto bkey_err;
+
+ bch2_btree_iter_advance(&iter);
+ } else {
+ struct bkey_i *freespace;
+
+ freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
+ ret = PTR_ERR_OR_ZERO(freespace);
+ if (ret)
+ goto bkey_err;
+
+ bkey_init(&freespace->k);
+ freespace->k.type = KEY_TYPE_set;
+ freespace->k.p = k.k->p;
+ freespace->k.size = k.k->size;
+
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL);
+ if (ret)
+ goto bkey_err;
+
+ bch2_btree_iter_set_pos(&iter, k.k->p);
+ }
+bkey_err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+
+ if (ret < 0) {
+ bch_err_msg(ca, ret, "initializing free space");
+ return ret;
+ }
+
+ mutex_lock(&c->sb_lock);
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+}
+
+int bch2_fs_freespace_init(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+ int ret = 0;
+ bool doing_init = false;
+
+ /*
+ * We can crash during the device add path, so we need to check this on
+ * every mount:
+ */
+
+ for_each_member_device(ca, c, i) {
+ if (ca->mi.freespace_initialized)
+ continue;
+
+ if (!doing_init) {
+ bch_info(c, "initializing freespace");
+ doing_init = true;
+ }
+
+ ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
+ if (ret) {
+ percpu_ref_put(&ca->ref);
+ bch_err_fn(c, ret);
+ return ret;
+ }
+ }
+
+ if (doing_init) {
+ mutex_lock(&c->sb_lock);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ bch_verbose(c, "done initializing freespace");
+ }
+
+ return 0;
+}
+
+/* Bucket IO clocks: */
+
+int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
+ size_t bucket_nr, int rw)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a;
+ u64 now;
+ int ret = 0;
+
+ a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr));
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ret;
+
+ now = atomic64_read(&c->io_clock[rw].now);
+ if (a->v.io_time[rw] == now)
+ goto out;
+
+ a->v.io_time[rw] = now;
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/* Startup/shutdown (ro/rw): */
+
+void bch2_recalc_capacity(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ u64 capacity = 0, reserved_sectors = 0, gc_reserve;
+ unsigned bucket_size_max = 0;
+ unsigned long ra_pages = 0;
+ unsigned i;
+
+ lockdep_assert_held(&c->state_lock);
+
+ for_each_online_member(ca, c, i) {
+ struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
+
+ ra_pages += bdi->ra_pages;
+ }
+
+ bch2_set_ra_pages(c, ra_pages);
+
+ for_each_rw_member(ca, c, i) {
+ u64 dev_reserve = 0;
+
+ /*
+ * We need to reserve buckets (from the number
+ * of currently available buckets) against
+ * foreground writes so that mainly copygc can
+ * make forward progress.
+ *
+ * We need enough to refill the various reserves
+ * from scratch - copygc will use its entire
+ * reserve all at once, then run again once its
+ * reserve has been refilled (from the formerly
+ * available buckets).
+ *
+ * This reserve is just used when considering if
+ * allocations for foreground writes must wait -
+ * not -ENOSPC calculations.
+ */
+
+ dev_reserve += ca->nr_btree_reserve * 2;
+ dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
+
+ dev_reserve += 1; /* btree write point */
+ dev_reserve += 1; /* copygc write point */
+ dev_reserve += 1; /* rebalance write point */
+
+ dev_reserve *= ca->mi.bucket_size;
+
+ capacity += bucket_to_sector(ca, ca->mi.nbuckets -
+ ca->mi.first_bucket);
+
+ reserved_sectors += dev_reserve * 2;
+
+ bucket_size_max = max_t(unsigned, bucket_size_max,
+ ca->mi.bucket_size);
+ }
+
+ gc_reserve = c->opts.gc_reserve_bytes
+ ? c->opts.gc_reserve_bytes >> 9
+ : div64_u64(capacity * c->opts.gc_reserve_percent, 100);
+
+ reserved_sectors = max(gc_reserve, reserved_sectors);
+
+ reserved_sectors = min(reserved_sectors, capacity);
+
+ c->capacity = capacity - reserved_sectors;
+
+ c->bucket_size_max = bucket_size_max;
+
+ /* Wake up in case someone was waiting for buckets */
+ closure_wake_up(&c->freelist_wait);
+}
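+
+/*
+ * A rough illustration of the per-device reserve computed above, with
+ * hypothetical numbers (nr_btree_reserve = 7, nbuckets = 1 << 20, 512 sector
+ * buckets):
+ *
+ *    dev_reserve = (7 * 2 + ((1 << 20) >> 6) + 3) * 512
+ *                = (14 + 16384 + 3) * 512 = 8397312 sectors (~4 GiB)
+ *
+ * reserved_sectors then accumulates twice that per rw device, before being
+ * raised to at least gc_reserve and capped at the total capacity.
+ */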
+
+u64 bch2_min_rw_member_capacity(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+ u64 ret = U64_MAX;
+
+ for_each_rw_member(ca, c, i)
+ ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
+ return ret;
+}
+
+static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct open_bucket *ob;
+ bool ret = false;
+
+ for (ob = c->open_buckets;
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+ ob++) {
+ spin_lock(&ob->lock);
+ if (ob->valid && !ob->on_partial_list &&
+ ob->dev == ca->dev_idx)
+ ret = true;
+ spin_unlock(&ob->lock);
+ }
+
+ return ret;
+}
+
+/* device goes ro: */
+void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
+{
+ unsigned i;
+
+ /* First, remove device from allocation groups: */
+
+ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+ clear_bit(ca->dev_idx, c->rw_devs[i].d);
+
+ /*
+ * Capacity is calculated based off of devices in allocation groups:
+ */
+ bch2_recalc_capacity(c);
+
+ bch2_open_buckets_stop(c, ca, false);
+
+ /*
+ * Wake up threads that were blocked on allocation, so they can notice
+ * the device can no longer be allocated from and the capacity has changed:
+ */
+ closure_wake_up(&c->freelist_wait);
+
+ /*
+ * journal_res_get() can block waiting for free space in the journal -
+ * it needs to notice there may not be devices to allocate from anymore:
+ */
+ wake_up(&c->journal.wait);
+
+ /* Now wait for any in flight writes: */
+
+ closure_wait_event(&c->open_buckets_wait,
+ !bch2_dev_has_open_write_point(c, ca));
+}
+
+/* device goes rw: */
+void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+ if (ca->mi.data_allowed & (1 << i))
+ set_bit(ca->dev_idx, c->rw_devs[i].d);
+}
+
+void bch2_fs_allocator_background_init(struct bch_fs *c)
+{
+ spin_lock_init(&c->freelist_lock);
+ INIT_WORK(&c->discard_work, bch2_do_discards_work);
+ INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
+}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
new file mode 100644
index 000000000000..73faf99a222a
--- /dev/null
+++ b/fs/bcachefs/alloc_background.h
@@ -0,0 +1,259 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_BACKGROUND_H
+#define _BCACHEFS_ALLOC_BACKGROUND_H
+
+#include "bcachefs.h"
+#include "alloc_types.h"
+#include "buckets.h"
+#include "debug.h"
+#include "super.h"
+
+enum bkey_invalid_flags;
+
+/* How out of date a pointer gen is allowed to be: */
+#define BUCKET_GC_GEN_MAX 96U
+
+static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
+{
+ struct bch_dev *ca;
+
+ if (!bch2_dev_exists2(c, pos.inode))
+ return false;
+
+ ca = bch_dev_bkey_exists(c, pos.inode);
+ return pos.offset >= ca->mi.first_bucket &&
+ pos.offset < ca->mi.nbuckets;
+}
+
+static inline u64 bucket_to_u64(struct bpos bucket)
+{
+ return (bucket.inode << 48) | bucket.offset;
+}
+
+static inline struct bpos u64_to_bucket(u64 bucket)
+{
+ return POS(bucket >> 48, bucket & ~(~0ULL << 48));
+}
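+
+/*
+ * bucket_to_u64()/u64_to_bucket() pack a device:bucket position into a single
+ * u64: device index in the top 16 bits, bucket offset in the low 48.  A
+ * sketch of the round trip (assuming offsets stay below 1ULL << 48):
+ *
+ *    struct bpos b = POS(2, 1234);
+ *    u64 packed = bucket_to_u64(b);    // (2ULL << 48) | 1234
+ *    b = u64_to_bucket(packed);        // POS(2, 1234) again
+ */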
+
+static inline u8 alloc_gc_gen(struct bch_alloc_v4 a)
+{
+ return a.gen - a.oldest_gen;
+}
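+
+/*
+ * Note that alloc_gc_gen() relies on u8 wraparound, so it stays correct when
+ * the gen counter rolls over - e.g. gen = 5, oldest_gen = 250 gives
+ * (u8)(5 - 250) = 11 generations of staleness.
+ */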
+
+static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors,
+ u32 cached_sectors,
+ u32 stripe,
+ struct bch_alloc_v4 a,
+ enum bch_data_type data_type)
+{
+ if (stripe)
+ return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
+ if (dirty_sectors)
+ return data_type;
+ if (cached_sectors)
+ return BCH_DATA_cached;
+ if (BCH_ALLOC_V4_NEED_DISCARD(&a))
+ return BCH_DATA_need_discard;
+ if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX)
+ return BCH_DATA_need_gc_gens;
+ return BCH_DATA_free;
+}
+
+static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
+ enum bch_data_type data_type)
+{
+ return __alloc_data_type(a.dirty_sectors, a.cached_sectors,
+ a.stripe, a, data_type);
+}
+
+static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type)
+{
+ return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type;
+}
+
+static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
+{
+ return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0;
+}
+
+#define DATA_TYPES_MOVABLE \
+ ((1U << BCH_DATA_btree)| \
+ (1U << BCH_DATA_user)| \
+ (1U << BCH_DATA_stripe))
+
+static inline bool data_type_movable(enum bch_data_type type)
+{
+ return (1U << type) & DATA_TYPES_MOVABLE;
+}
+
+static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
+ struct bch_dev *ca)
+{
+ if (!data_type_movable(a.data_type) ||
+ a.dirty_sectors >= ca->mi.bucket_size)
+ return 0;
+
+ return div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size);
+}
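+
+/*
+ * i.e. the fragmentation LRU index maps a movable bucket's dirty fraction
+ * onto a 31 bit fixed point scale - a worked example with illustrative
+ * numbers, for a 512 sector bucket holding 128 dirty sectors:
+ *
+ *    128 * (1ULL << 31) / 512 = 1 << 29
+ *
+ * i.e. a quarter of the way up the scale; completely full (or non-movable)
+ * buckets return 0.
+ */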
+
+static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)
+{
+ return ((u64) alloc_gc_gen(a) >> 4) << 56;
+}
+
+static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a)
+{
+ pos.offset |= alloc_freespace_genbits(a);
+ return pos;
+}
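+
+/*
+ * Freespace btree keys encode the bucket in the low 56 bits of the key's
+ * offset and fold (alloc_gc_gen() >> 4) into the top 8 bits, so stale entries
+ * can be detected.  A sketch of the round trip (assuming bucket offsets stay
+ * below 1ULL << 56):
+ *
+ *    struct bpos p = alloc_freespace_pos(POS(dev, bucket), *a);
+ *    u64 b       = p.offset & ~(~0ULL << 56);    // == bucket
+ *    u64 genbits = p.offset &  (~0ULL << 56);    // == alloc_freespace_genbits(*a)
+ */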
+
+static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
+{
+ unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
+ BCH_ALLOC_V4_U64s_V0) +
+ BCH_ALLOC_V4_NR_BACKPOINTERS(a) *
+ (sizeof(struct bch_backpointer) / sizeof(u64));
+
+ BUG_ON(ret > U8_MAX - BKEY_U64s);
+ return ret;
+}
+
+static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a)
+{
+ set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v));
+}
+
+struct bkey_i_alloc_v4 *
+bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos);
+
+void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *);
+
+static inline const struct bch_alloc_v4 *bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *convert)
+{
+ const struct bch_alloc_v4 *ret;
+
+ if (unlikely(k.k->type != KEY_TYPE_alloc_v4))
+ goto slowpath;
+
+ ret = bkey_s_c_to_alloc_v4(k).v;
+ if (BCH_ALLOC_V4_BACKPOINTERS_START(ret) != BCH_ALLOC_V4_U64s)
+ goto slowpath;
+
+ return ret;
+slowpath:
+ __bch2_alloc_to_v4(k, convert);
+ return convert;
+}
+
+struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c);
+
+int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
+
+int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_alloc_v4_swab(struct bkey_s);
+void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_alloc ((struct bkey_ops) { \
+ .key_invalid = bch2_alloc_v1_invalid, \
+ .val_to_text = bch2_alloc_to_text, \
+ .trans_trigger = bch2_trans_mark_alloc, \
+ .atomic_trigger = bch2_mark_alloc, \
+ .min_val_size = 8, \
+})
+
+#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \
+ .key_invalid = bch2_alloc_v2_invalid, \
+ .val_to_text = bch2_alloc_to_text, \
+ .trans_trigger = bch2_trans_mark_alloc, \
+ .atomic_trigger = bch2_mark_alloc, \
+ .min_val_size = 8, \
+})
+
+#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \
+ .key_invalid = bch2_alloc_v3_invalid, \
+ .val_to_text = bch2_alloc_to_text, \
+ .trans_trigger = bch2_trans_mark_alloc, \
+ .atomic_trigger = bch2_mark_alloc, \
+ .min_val_size = 16, \
+})
+
+#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \
+ .key_invalid = bch2_alloc_v4_invalid, \
+ .val_to_text = bch2_alloc_to_text, \
+ .swab = bch2_alloc_v4_swab, \
+ .trans_trigger = bch2_trans_mark_alloc, \
+ .atomic_trigger = bch2_mark_alloc, \
+ .min_val_size = 48, \
+})
+
+int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \
+ .key_invalid = bch2_bucket_gens_invalid, \
+ .val_to_text = bch2_bucket_gens_to_text, \
+})
+
+int bch2_bucket_gens_init(struct bch_fs *);
+
+static inline bool bkey_is_alloc(const struct bkey *k)
+{
+ return k->type == KEY_TYPE_alloc ||
+ k->type == KEY_TYPE_alloc_v2 ||
+ k->type == KEY_TYPE_alloc_v3;
+}
+
+int bch2_alloc_read(struct bch_fs *);
+
+int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_check_alloc_info(struct bch_fs *);
+int bch2_check_alloc_to_lru_refs(struct bch_fs *);
+void bch2_do_discards(struct bch_fs *);
+
+static inline u64 should_invalidate_buckets(struct bch_dev *ca,
+ struct bch_dev_usage u)
+{
+ u64 want_free = ca->mi.nbuckets >> 7;
+ u64 free = max_t(s64, 0,
+ u.d[BCH_DATA_free].buckets
+ + u.d[BCH_DATA_need_discard].buckets
+ - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe));
+
+ return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
+}
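+
+/*
+ * i.e. aim to keep roughly 1/128th of the device's buckets free or awaiting
+ * discard, invalidating at most that many cached buckets per pass.  A worked
+ * example with illustrative numbers - nbuckets = 1 << 20, 4096 free buckets,
+ * nothing needing discard, no reserve:
+ *
+ *    want_free = 1048576 >> 7 = 8192
+ *    return clamp(8192 - 4096, 0, nr cached buckets)
+ */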
+
+void bch2_do_invalidates(struct bch_fs *);
+
+static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a)
+{
+ return (void *) ((u64 *) &a->v +
+ (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
+ BCH_ALLOC_V4_U64s_V0));
+}
+
+static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a)
+{
+ return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a));
+}
+
+int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64);
+int bch2_fs_freespace_init(struct bch_fs *);
+
+void bch2_recalc_capacity(struct bch_fs *);
+u64 bch2_min_rw_member_capacity(struct bch_fs *);
+
+void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
+void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
+
+void bch2_fs_allocator_background_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
new file mode 100644
index 000000000000..0e6157982607
--- /dev/null
+++ b/fs/bcachefs/alloc_foreground.c
@@ -0,0 +1,1638 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2012 Google, Inc.
+ *
+ * Foreground allocator code: allocate buckets from freelist, and allocate in
+ * sector granularity from writepoints.
+ *
+ * bch2_bucket_alloc() allocates a single bucket from a specific device.
+ *
+ * bch2_bucket_alloc_set() allocates one or more buckets from different devices
+ * in a given filesystem.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "backpointers.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
+#include "clock.h"
+#include "debug.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "io_write.h"
+#include "journal.h"
+#include "movinggc.h"
+#include "nocow_locking.h"
+#include "trace.h"
+
+#include <linux/math64.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+
+static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
+ struct mutex *lock)
+{
+ if (!mutex_trylock(lock)) {
+ bch2_trans_unlock(trans);
+ mutex_lock(lock);
+ }
+}
+
+const char * const bch2_watermarks[] = {
+#define x(t) #t,
+ BCH_WATERMARKS()
+#undef x
+ NULL
+};
+
+/*
+ * An open bucket represents a bucket that's currently being allocated from. Open buckets
+ * serve two purposes:
+ *
+ * - They track buckets that have been partially allocated, allowing for
+ * sub-bucket sized allocations - they're used by the sector allocator below
+ *
+ * - They provide a reference to the buckets they own that mark and sweep GC
+ * can find, until the new allocation has a pointer to it inserted into the
+ * btree
+ *
+ * When allocating some space with the sector allocator, the allocation comes
+ * with a reference to an open bucket - the caller is required to put that
+ * reference _after_ doing the index update that makes its allocation reachable.
+ */
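+
+/*
+ * A minimal sketch of the usage pattern described above (hypothetical caller,
+ * error handling omitted; the put helper is assumed to be the usual
+ * bch2_open_bucket_put() from alloc_foreground.h):
+ *
+ *    ob = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, cl);
+ *    // write to the bucket, insert the btree key pointing at it...
+ *    bch2_open_bucket_put(c, ob);    // only after the index update
+ */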
+
+void bch2_reset_alloc_cursors(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL)
+ ca->alloc_cursor = 0;
+ rcu_read_unlock();
+}
+
+static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
+{
+ open_bucket_idx_t idx = ob - c->open_buckets;
+ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
+
+ ob->hash = *slot;
+ *slot = idx;
+}
+
+static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
+{
+ open_bucket_idx_t idx = ob - c->open_buckets;
+ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
+
+ while (*slot != idx) {
+ BUG_ON(!*slot);
+ slot = &c->open_buckets[*slot].hash;
+ }
+
+ *slot = ob->hash;
+ ob->hash = 0;
+}
+
+void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+
+ if (ob->ec) {
+ ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
+ return;
+ }
+
+ percpu_down_read(&c->mark_lock);
+ spin_lock(&ob->lock);
+
+ ob->valid = false;
+ ob->data_type = 0;
+
+ spin_unlock(&ob->lock);
+ percpu_up_read(&c->mark_lock);
+
+ spin_lock(&c->freelist_lock);
+ bch2_open_bucket_hash_remove(c, ob);
+
+ ob->freelist = c->open_buckets_freelist;
+ c->open_buckets_freelist = ob - c->open_buckets;
+
+ c->open_buckets_nr_free++;
+ ca->nr_open_buckets--;
+ spin_unlock(&c->freelist_lock);
+
+ closure_wake_up(&c->open_buckets_wait);
+}
+
+void bch2_open_bucket_write_error(struct bch_fs *c,
+ struct open_buckets *obs,
+ unsigned dev)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, obs, ob, i)
+ if (ob->dev == dev && ob->ec)
+ bch2_ec_bucket_cancel(c, ob);
+}
+
+static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
+{
+ struct open_bucket *ob;
+
+ BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
+
+ ob = c->open_buckets + c->open_buckets_freelist;
+ c->open_buckets_freelist = ob->freelist;
+ atomic_set(&ob->pin, 1);
+ ob->data_type = 0;
+
+ c->open_buckets_nr_free--;
+ return ob;
+}
+
+static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
+{
+ BUG_ON(c->open_buckets_partial_nr >=
+ ARRAY_SIZE(c->open_buckets_partial));
+
+ spin_lock(&c->freelist_lock);
+ ob->on_partial_list = true;
+ c->open_buckets_partial[c->open_buckets_partial_nr++] =
+ ob - c->open_buckets;
+ spin_unlock(&c->freelist_lock);
+
+ closure_wake_up(&c->open_buckets_wait);
+ closure_wake_up(&c->freelist_wait);
+}
+
+/* _only_ for allocating the journal on a new device: */
+long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
+{
+ while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
+ u64 b = ca->new_fs_bucket_idx++;
+
+ if (!is_superblock_bucket(ca, b) &&
+ (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)))
+ return b;
+ }
+
+ return -1;
+}
+
+static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
+{
+ switch (watermark) {
+ case BCH_WATERMARK_reclaim:
+ return 0;
+ case BCH_WATERMARK_btree:
+ case BCH_WATERMARK_btree_copygc:
+ return OPEN_BUCKETS_COUNT / 4;
+ case BCH_WATERMARK_copygc:
+ return OPEN_BUCKETS_COUNT / 3;
+ default:
+ return OPEN_BUCKETS_COUNT / 2;
+ }
+}
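+
+/*
+ * e.g. assuming OPEN_BUCKETS_COUNT is 1024 (illustrative - the real constant
+ * is defined elsewhere), __try_alloc_bucket() below fails with
+ * -BCH_ERR_open_buckets_empty for a default watermark allocation once 512 or
+ * fewer open buckets remain free, for copygc at 341 or fewer, while
+ * BCH_WATERMARK_reclaim may take the very last one.
+ */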
+
+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+ u64 bucket,
+ enum bch_watermark watermark,
+ const struct bch_alloc_v4 *a,
+ struct bucket_alloc_state *s,
+ struct closure *cl)
+{
+ struct open_bucket *ob;
+
+ if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
+ s->skipped_nouse++;
+ return NULL;
+ }
+
+ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+ s->skipped_open++;
+ return NULL;
+ }
+
+ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
+ s->skipped_need_journal_commit++;
+ return NULL;
+ }
+
+ if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) {
+ s->skipped_nocow++;
+ return NULL;
+ }
+
+ spin_lock(&c->freelist_lock);
+
+ if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
+ if (cl)
+ closure_wait(&c->open_buckets_wait, cl);
+
+ if (!c->blocked_allocate_open_bucket)
+ c->blocked_allocate_open_bucket = local_clock();
+
+ spin_unlock(&c->freelist_lock);
+ return ERR_PTR(-BCH_ERR_open_buckets_empty);
+ }
+
+ /* Recheck under lock: */
+ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
+ spin_unlock(&c->freelist_lock);
+ s->skipped_open++;
+ return NULL;
+ }
+
+ ob = bch2_open_bucket_alloc(c);
+
+ spin_lock(&ob->lock);
+
+ ob->valid = true;
+ ob->sectors_free = ca->mi.bucket_size;
+ ob->dev = ca->dev_idx;
+ ob->gen = a->gen;
+ ob->bucket = bucket;
+ spin_unlock(&ob->lock);
+
+ ca->nr_open_buckets++;
+ bch2_open_bucket_hash_add(c, ob);
+
+ if (c->blocked_allocate_open_bucket) {
+ bch2_time_stats_update(
+ &c->times[BCH_TIME_blocked_allocate_open_bucket],
+ c->blocked_allocate_open_bucket);
+ c->blocked_allocate_open_bucket = 0;
+ }
+
+ if (c->blocked_allocate) {
+ bch2_time_stats_update(
+ &c->times[BCH_TIME_blocked_allocate],
+ c->blocked_allocate);
+ c->blocked_allocate = 0;
+ }
+
+ spin_unlock(&c->freelist_lock);
+ return ob;
+}
+
+static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
+ enum bch_watermark watermark, u64 free_entry,
+ struct bucket_alloc_state *s,
+ struct bkey_s_c freespace_k,
+ struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter = { NULL };
+ struct bkey_s_c k;
+ struct open_bucket *ob;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ u64 b = free_entry & ~(~0ULL << 56);
+ unsigned genbits = free_entry >> 56;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
+ prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
+ " freespace key ",
+ ca->mi.first_bucket, ca->mi.nbuckets);
+ bch2_bkey_val_to_text(&buf, c, freespace_k);
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ ob = ERR_PTR(-EIO);
+ goto err;
+ }
+
+ k = bch2_bkey_get_iter(trans, &iter,
+ BTREE_ID_alloc, POS(ca->dev_idx, b),
+ BTREE_ITER_CACHED);
+ ret = bkey_err(k);
+ if (ret) {
+ ob = ERR_PTR(ret);
+ goto err;
+ }
+
+ a = bch2_alloc_to_v4(k, &a_convert);
+
+ if (a->data_type != BCH_DATA_free) {
+ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
+ ob = NULL;
+ goto err;
+ }
+
+ prt_printf(&buf, "non free bucket in freespace btree\n"
+ " freespace key ");
+ bch2_bkey_val_to_text(&buf, c, freespace_k);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ ob = ERR_PTR(-EIO);
+ goto err;
+ }
+
+ if (genbits != (alloc_freespace_genbits(*a) >> 56) &&
+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
+ prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
+ " freespace key ",
+ genbits, alloc_freespace_genbits(*a) >> 56);
+ bch2_bkey_val_to_text(&buf, c, freespace_k);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+ ob = ERR_PTR(-EIO);
+ goto err;
+ }
+
+ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) {
+ struct bch_backpointer bp;
+ struct bpos bp_pos = POS_MIN;
+
+ ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
+ &bp_pos, &bp,
+ BTREE_ITER_NOPRESERVE);
+ if (ret) {
+ ob = ERR_PTR(ret);
+ goto err;
+ }
+
+ if (!bkey_eq(bp_pos, POS_MAX)) {
+ /*
+ * Bucket may have data in it - we don't call
+ * bch2_trans_inconsistent() because fsck hasn't
+ * finished yet
+ */
+ ob = NULL;
+ goto err;
+ }
+ }
+
+ ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
+ if (!ob)
+ iter.path->preserve = false;
+err:
+ if (iter.trans && iter.path)
+ set_btree_iter_dontneed(&iter);
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ob;
+}
+
+/*
+ * This path is for before the freespace btree is initialized:
+ *
+ * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
+ * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
+ */
+static noinline struct open_bucket *
+bch2_bucket_alloc_early(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum bch_watermark watermark,
+ struct bucket_alloc_state *s,
+ struct closure *cl)
+{
+ struct btree_iter iter, citer;
+ struct bkey_s_c k, ck;
+ struct open_bucket *ob = NULL;
+ u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
+ u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
+ u64 alloc_cursor = alloc_start;
+ int ret;
+
+ /*
+ * Scan with an uncached iterator to avoid polluting the key cache. An
+ * uncached iter will return a cached key if one exists, but if not
+ * there is no other underlying protection for the associated key cache
+ * slot. To avoid racing bucket allocations, look up the cached key slot
+ * of any likely allocation candidate before attempting to proceed with
+ * the allocation. This provides proper exclusion on the associated
+ * bucket.
+ */
+again:
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
+ BTREE_ITER_SLOTS, k, ret) {
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+
+ if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
+ break;
+
+ if (ca->new_fs_bucket_idx &&
+ is_superblock_bucket(ca, k.k->p.offset))
+ continue;
+
+ a = bch2_alloc_to_v4(k, &a_convert);
+ if (a->data_type != BCH_DATA_free)
+ continue;
+
+ /* now check the cached key to serialize concurrent allocs of the bucket */
+ ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
+ ret = bkey_err(ck);
+ if (ret)
+ break;
+
+ a = bch2_alloc_to_v4(ck, &a_convert);
+ if (a->data_type != BCH_DATA_free)
+ goto next;
+
+ s->buckets_seen++;
+
+ ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
+next:
+ citer.path->preserve = false;
+ bch2_trans_iter_exit(trans, &citer);
+ if (ob)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ alloc_cursor = iter.pos.offset;
+ ca->alloc_cursor = alloc_cursor;
+
+ if (!ob && ret)
+ ob = ERR_PTR(ret);
+
+ if (!ob && alloc_start > first_bucket) {
+ alloc_cursor = alloc_start = first_bucket;
+ goto again;
+ }
+
+ return ob;
+}
+
+static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum bch_watermark watermark,
+ struct bucket_alloc_state *s,
+ struct closure *cl)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct open_bucket *ob = NULL;
+ u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
+ u64 alloc_cursor = alloc_start;
+ int ret;
+
+ BUG_ON(ca->new_fs_bucket_idx);
+again:
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
+ POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
+ if (k.k->p.inode != ca->dev_idx)
+ break;
+
+ for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
+ alloc_cursor < k.k->p.offset;
+ alloc_cursor++) {
+ ret = btree_trans_too_many_iters(trans);
+ if (ret) {
+ ob = ERR_PTR(ret);
+ break;
+ }
+
+ s->buckets_seen++;
+
+ ob = try_alloc_bucket(trans, ca, watermark,
+ alloc_cursor, s, k, cl);
+ if (ob) {
+ iter.path->preserve = false;
+ break;
+ }
+ }
+
+ if (ob || ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ ca->alloc_cursor = alloc_cursor;
+
+ if (!ob && ret)
+ ob = ERR_PTR(ret);
+
+ if (!ob && alloc_start > ca->mi.first_bucket) {
+ alloc_cursor = alloc_start = ca->mi.first_bucket;
+ goto again;
+ }
+
+ return ob;
+}
+
+/**
+ * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
+ * @trans: transaction object
+ * @ca: device to allocate from
+ * @watermark: how important is this allocation?
+ * @cl: if not NULL, closure to be used to wait if buckets not available
+ * @usage: also used to return the current device usage
+ *
+ * Returns: an open_bucket on success, or an ERR_PTR() on failure.
+ */
+static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
+ struct bch_dev *ca,
+ enum bch_watermark watermark,
+ struct closure *cl,
+ struct bch_dev_usage *usage)
+{
+ struct bch_fs *c = trans->c;
+ struct open_bucket *ob = NULL;
+ bool freespace = READ_ONCE(ca->mi.freespace_initialized);
+ u64 avail;
+ struct bucket_alloc_state s = { 0 };
+ bool waiting = false;
+again:
+ bch2_dev_usage_read_fast(ca, usage);
+ avail = dev_buckets_free(ca, *usage, watermark);
+
+ if (usage->d[BCH_DATA_need_discard].buckets > avail)
+ bch2_do_discards(c);
+
+ if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
+ bch2_do_gc_gens(c);
+
+ if (should_invalidate_buckets(ca, *usage))
+ bch2_do_invalidates(c);
+
+ if (!avail) {
+ if (cl && !waiting) {
+ closure_wait(&c->freelist_wait, cl);
+ waiting = true;
+ goto again;
+ }
+
+ if (!c->blocked_allocate)
+ c->blocked_allocate = local_clock();
+
+ ob = ERR_PTR(-BCH_ERR_freelist_empty);
+ goto err;
+ }
+
+ if (waiting)
+ closure_wake_up(&c->freelist_wait);
+alloc:
+ ob = likely(freespace)
+ ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
+ : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);
+
+ if (s.skipped_need_journal_commit * 2 > avail)
+ bch2_journal_flush_async(&c->journal, NULL);
+
+ if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
+ freespace = false;
+ goto alloc;
+ }
+err:
+ if (!ob)
+ ob = ERR_PTR(-BCH_ERR_no_buckets_found);
+
+ if (!IS_ERR(ob))
+ trace_and_count(c, bucket_alloc, ca,
+ bch2_watermarks[watermark],
+ ob->bucket,
+ usage->d[BCH_DATA_free].buckets,
+ avail,
+ bch2_copygc_wait_amount(c),
+ c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
+ &s,
+ cl == NULL,
+ "");
+ else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
+ trace_and_count(c, bucket_alloc_fail, ca,
+ bch2_watermarks[watermark],
+ 0,
+ usage->d[BCH_DATA_free].buckets,
+ avail,
+ bch2_copygc_wait_amount(c),
+ c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
+ &s,
+ cl == NULL,
+ bch2_err_str(PTR_ERR(ob)));
+
+ return ob;
+}
+
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
+ enum bch_watermark watermark,
+ struct closure *cl)
+{
+ struct bch_dev_usage usage;
+ struct open_bucket *ob;
+
+ bch2_trans_do(c, NULL, NULL, 0,
+ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
+ cl, &usage)));
+ return ob;
+}
+
+static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
+ unsigned l, unsigned r)
+{
+ return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
+ (stripe->next_alloc[l] < stripe->next_alloc[r]));
+}
+
+#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
+
+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
+ struct dev_stripe_state *stripe,
+ struct bch_devs_mask *devs)
+{
+ struct dev_alloc_list ret = { .nr = 0 };
+ unsigned i;
+
+ for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
+ ret.devs[ret.nr++] = i;
+
+ bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
+ return ret;
+}
+
+static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
+ struct dev_stripe_state *stripe,
+ struct bch_dev_usage *usage)
+{
+ u64 *v = stripe->next_alloc + ca->dev_idx;
+ u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
+ u64 free_space_inv = free_space
+ ? div64_u64(1ULL << 48, free_space)
+ : 1ULL << 48;
+ u64 scale = *v / 4;
+
+ if (*v + free_space_inv >= *v)
+ *v += free_space_inv;
+ else
+ *v = U64_MAX;
+
+ for (v = stripe->next_alloc;
+ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
+ *v = *v < scale ? 0 : *v - scale;
+}
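+
+/*
+ * A sketch of how the stripe weights behave: devices with more free space get
+ * a smaller increment, so they keep sorting earlier in bch2_dev_alloc_list()
+ * and receive proportionally more allocations.  With illustrative numbers:
+ *
+ *    free_space = 1 << 20 buckets: next_alloc += (1ULL << 48) >> 20 = 1 << 28
+ *    free_space = 1 << 10 buckets: next_alloc += (1ULL << 48) >> 10 = 1 << 38
+ *
+ * The loop at the end then subtracts a quarter of this device's previous
+ * counter from every device (flooring at zero), keeping the counters bounded.
+ */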
+
+void bch2_dev_stripe_increment(struct bch_dev *ca,
+ struct dev_stripe_state *stripe)
+{
+ struct bch_dev_usage usage;
+
+ bch2_dev_usage_read_fast(ca, &usage);
+ bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
+}
+
+static int add_new_bucket(struct bch_fs *c,
+ struct open_buckets *ptrs,
+ struct bch_devs_mask *devs_may_alloc,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ unsigned flags,
+ struct open_bucket *ob)
+{
+ unsigned durability =
+ bch_dev_bkey_exists(c, ob->dev)->mi.durability;
+
+ BUG_ON(*nr_effective >= nr_replicas);
+ BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
+
+ __clear_bit(ob->dev, devs_may_alloc->d);
+ *nr_effective += (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
+ ? durability : 1;
+ *have_cache |= !durability;
+
+ ob_push(c, ptrs, ob);
+
+ if (*nr_effective >= nr_replicas)
+ return 1;
+ if (ob->ec)
+ return 1;
+ return 0;
+}
+
+int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
+ struct open_buckets *ptrs,
+ struct dev_stripe_state *stripe,
+ struct bch_devs_mask *devs_may_alloc,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ unsigned flags,
+ enum bch_data_type data_type,
+ enum bch_watermark watermark,
+ struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ struct dev_alloc_list devs_sorted =
+ bch2_dev_alloc_list(c, stripe, devs_may_alloc);
+ unsigned dev;
+ struct bch_dev *ca;
+ int ret = -BCH_ERR_insufficient_devices;
+ unsigned i;
+
+ BUG_ON(*nr_effective >= nr_replicas);
+
+ for (i = 0; i < devs_sorted.nr; i++) {
+ struct bch_dev_usage usage;
+ struct open_bucket *ob;
+
+ dev = devs_sorted.devs[i];
+
+ rcu_read_lock();
+ ca = rcu_dereference(c->devs[dev]);
+ if (ca)
+ percpu_ref_get(&ca->ref);
+ rcu_read_unlock();
+
+ if (!ca)
+ continue;
+
+ if (!ca->mi.durability && *have_cache) {
+ percpu_ref_put(&ca->ref);
+ continue;
+ }
+
+ ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage);
+ if (!IS_ERR(ob))
+ bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
+ percpu_ref_put(&ca->ref);
+
+ if (IS_ERR(ob)) {
+ ret = PTR_ERR(ob);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
+ break;
+ continue;
+ }
+
+ ob->data_type = data_type;
+
+ if (add_new_bucket(c, ptrs, devs_may_alloc,
+ nr_replicas, nr_effective,
+ have_cache, flags, ob)) {
+ ret = 0;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/* Allocate from stripes: */
+
+/*
+ * if we can't allocate a new stripe because there are already too many
+ * partially filled stripes, force allocating from an existing stripe even when
+ * it's to a device we don't want:
+ */
+
+static int bucket_alloc_from_stripe(struct btree_trans *trans,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_mask *devs_may_alloc,
+ u16 target,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ enum bch_watermark watermark,
+ unsigned flags,
+ struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ struct dev_alloc_list devs_sorted;
+ struct ec_stripe_head *h;
+ struct open_bucket *ob;
+ unsigned i, ec_idx;
+ int ret = 0;
+
+ if (nr_replicas < 2)
+ return 0;
+
+ if (ec_open_bucket(c, ptrs))
+ return 0;
+
+ h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
+ if (IS_ERR(h))
+ return PTR_ERR(h);
+ if (!h)
+ return 0;
+
+ devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
+
+ for (i = 0; i < devs_sorted.nr; i++)
+ for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
+ if (!h->s->blocks[ec_idx])
+ continue;
+
+ ob = c->open_buckets + h->s->blocks[ec_idx];
+ if (ob->dev == devs_sorted.devs[i] &&
+ !test_and_set_bit(ec_idx, h->s->blocks_allocated))
+ goto got_bucket;
+ }
+ goto out_put_head;
+got_bucket:
+ ob->ec_idx = ec_idx;
+ ob->ec = h->s;
+ ec_stripe_new_get(h->s, STRIPE_REF_io);
+
+ ret = add_new_bucket(c, ptrs, devs_may_alloc,
+ nr_replicas, nr_effective,
+ have_cache, flags, ob);
+out_put_head:
+ bch2_ec_stripe_head_put(c, h);
+ return ret;
+}
+
+/* Sector allocator */
+
+static bool want_bucket(struct bch_fs *c,
+ struct write_point *wp,
+ struct bch_devs_mask *devs_may_alloc,
+ bool *have_cache, bool ec,
+ struct open_bucket *ob)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+
+ if (!test_bit(ob->dev, devs_may_alloc->d))
+ return false;
+
+ if (ob->data_type != wp->data_type)
+ return false;
+
+ if (!ca->mi.durability &&
+ (wp->data_type == BCH_DATA_btree || ec || *have_cache))
+ return false;
+
+ if (ec != (ob->ec != NULL))
+ return false;
+
+ return true;
+}
+
+static int bucket_alloc_set_writepoint(struct bch_fs *c,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_mask *devs_may_alloc,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ bool ec, unsigned flags)
+{
+ struct open_buckets ptrs_skip = { .nr = 0 };
+ struct open_bucket *ob;
+ unsigned i;
+ int ret = 0;
+
+ open_bucket_for_each(c, &wp->ptrs, ob, i) {
+ if (!ret && want_bucket(c, wp, devs_may_alloc,
+ have_cache, ec, ob))
+ ret = add_new_bucket(c, ptrs, devs_may_alloc,
+ nr_replicas, nr_effective,
+ have_cache, flags, ob);
+ else
+ ob_push(c, &ptrs_skip, ob);
+ }
+ wp->ptrs = ptrs_skip;
+
+ return ret;
+}
+
+static int bucket_alloc_set_partial(struct bch_fs *c,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_mask *devs_may_alloc,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache, bool ec,
+ enum bch_watermark watermark,
+ unsigned flags)
+{
+ int i, ret = 0;
+
+ if (!c->open_buckets_partial_nr)
+ return 0;
+
+ spin_lock(&c->freelist_lock);
+
+ if (!c->open_buckets_partial_nr)
+ goto unlock;
+
+ for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
+ struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
+
+ if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_dev_usage usage;
+ u64 avail;
+
+ bch2_dev_usage_read_fast(ca, &usage);
+ avail = dev_buckets_free(ca, usage, watermark);
+ if (!avail)
+ continue;
+
+ array_remove_item(c->open_buckets_partial,
+ c->open_buckets_partial_nr,
+ i);
+ ob->on_partial_list = false;
+
+ ret = add_new_bucket(c, ptrs, devs_may_alloc,
+ nr_replicas, nr_effective,
+ have_cache, flags, ob);
+ if (ret)
+ break;
+ }
+ }
+unlock:
+ spin_unlock(&c->freelist_lock);
+ return ret;
+}
+
+static int __open_bucket_add_buckets(struct btree_trans *trans,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_list *devs_have,
+ u16 target,
+ bool erasure_code,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ enum bch_watermark watermark,
+ unsigned flags,
+ struct closure *_cl)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_devs_mask devs;
+ struct open_bucket *ob;
+ struct closure *cl = NULL;
+ unsigned i;
+ int ret;
+
+ devs = target_rw_devs(c, wp->data_type, target);
+
+ /* Don't allocate from devices we already have pointers to: */
+ for (i = 0; i < devs_have->nr; i++)
+ __clear_bit(devs_have->devs[i], devs.d);
+
+ open_bucket_for_each(c, ptrs, ob, i)
+ __clear_bit(ob->dev, devs.d);
+
+ if (erasure_code && ec_open_bucket(c, ptrs))
+ return 0;
+
+ ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
+ nr_replicas, nr_effective,
+ have_cache, erasure_code, flags);
+ if (ret)
+ return ret;
+
+ ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
+ nr_replicas, nr_effective,
+ have_cache, erasure_code, watermark, flags);
+ if (ret)
+ return ret;
+
+ if (erasure_code) {
+ ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
+ target,
+ nr_replicas, nr_effective,
+ have_cache,
+ watermark, flags, _cl);
+ } else {
+retry_blocking:
+ /*
+ * Try nonblocking first, so that if one device is full we'll try from
+ * other devices:
+ */
+ ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
+ nr_replicas, nr_effective, have_cache,
+ flags, wp->data_type, watermark, cl);
+ if (ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+ !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
+ !cl && _cl) {
+ cl = _cl;
+ goto retry_blocking;
+ }
+ }
+
+ return ret;
+}
+
+static int open_bucket_add_buckets(struct btree_trans *trans,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_list *devs_have,
+ u16 target,
+ unsigned erasure_code,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ enum bch_watermark watermark,
+ unsigned flags,
+ struct closure *cl)
+{
+ int ret;
+
+ if (erasure_code) {
+ ret = __open_bucket_add_buckets(trans, ptrs, wp,
+ devs_have, target, erasure_code,
+ nr_replicas, nr_effective, have_cache,
+ watermark, flags, cl);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
+ bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
+ bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+ return ret;
+ if (*nr_effective >= nr_replicas)
+ return 0;
+ }
+
+ ret = __open_bucket_add_buckets(trans, ptrs, wp,
+ devs_have, target, false,
+ nr_replicas, nr_effective, have_cache,
+ watermark, flags, cl);
+ return ret < 0 ? ret : 0;
+}
+
+/**
+ * should_drop_bucket - check if this open_bucket should go away
+ * @ob: open_bucket to check
+ * @c: filesystem handle
+ * @ca: if set, we're killing buckets for a particular device
+ * @ec: if true, we're shutting down erasure coding and killing all ec
+ * open_buckets
+ * Returns: true if we should kill this open_bucket
+ *
+ * We're killing open_buckets because we're shutting down a device, erasure
+ * coding, or the entire filesystem - check if this open_bucket matches:
+ */
+static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
+ struct bch_dev *ca, bool ec)
+{
+ if (ec) {
+ return ob->ec != NULL;
+ } else if (ca) {
+ bool drop = ob->dev == ca->dev_idx;
+ struct open_bucket *ob2;
+ unsigned i;
+
+ if (!drop && ob->ec) {
+ unsigned nr_blocks;
+
+ mutex_lock(&ob->ec->lock);
+ nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;
+
+ for (i = 0; i < nr_blocks; i++) {
+ if (!ob->ec->blocks[i])
+ continue;
+
+ ob2 = c->open_buckets + ob->ec->blocks[i];
+ drop |= ob2->dev == ca->dev_idx;
+ }
+ mutex_unlock(&ob->ec->lock);
+ }
+
+ return drop;
+ } else {
+ return true;
+ }
+}
+
+static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
+ bool ec, struct write_point *wp)
+{
+ struct open_buckets ptrs = { .nr = 0 };
+ struct open_bucket *ob;
+ unsigned i;
+
+ mutex_lock(&wp->lock);
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ if (should_drop_bucket(ob, c, ca, ec))
+ bch2_open_bucket_put(c, ob);
+ else
+ ob_push(c, &ptrs, ob);
+ wp->ptrs = ptrs;
+ mutex_unlock(&wp->lock);
+}
+
+void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
+ bool ec)
+{
+ unsigned i;
+
+ /* Next, close write points that point to this device... */
+ for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
+ bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
+
+ bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
+ bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
+ bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
+
+ mutex_lock(&c->btree_reserve_cache_lock);
+ while (c->btree_reserve_cache_nr) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
+
+ bch2_open_buckets_put(c, &a->ob);
+ }
+ mutex_unlock(&c->btree_reserve_cache_lock);
+
+ spin_lock(&c->freelist_lock);
+ i = 0;
+ while (i < c->open_buckets_partial_nr) {
+ struct open_bucket *ob =
+ c->open_buckets + c->open_buckets_partial[i];
+
+ if (should_drop_bucket(ob, c, ca, ec)) {
+ --c->open_buckets_partial_nr;
+ swap(c->open_buckets_partial[i],
+ c->open_buckets_partial[c->open_buckets_partial_nr]);
+ ob->on_partial_list = false;
+ spin_unlock(&c->freelist_lock);
+ bch2_open_bucket_put(c, ob);
+ spin_lock(&c->freelist_lock);
+ } else {
+ i++;
+ }
+ }
+ spin_unlock(&c->freelist_lock);
+
+ bch2_ec_stop_dev(c, ca);
+}
+
+static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
+ unsigned long write_point)
+{
+ unsigned hash =
+ hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
+
+ return &c->write_points_hash[hash];
+}
+
+static struct write_point *__writepoint_find(struct hlist_head *head,
+ unsigned long write_point)
+{
+ struct write_point *wp;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(wp, head, node)
+ if (wp->write_point == write_point)
+ goto out;
+ wp = NULL;
+out:
+ rcu_read_unlock();
+ return wp;
+}
+
+static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
+{
+ u64 stranded = c->write_points_nr * c->bucket_size_max;
+ u64 free = bch2_fs_usage_read_short(c).free;
+
+ return stranded * factor > free;
+}
+
+static bool try_increase_writepoints(struct bch_fs *c)
+{
+ struct write_point *wp;
+
+ if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
+ too_many_writepoints(c, 32))
+ return false;
+
+ wp = c->write_points + c->write_points_nr++;
+ hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
+ return true;
+}
+
+static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
+{
+ struct bch_fs *c = trans->c;
+ struct write_point *wp;
+ struct open_bucket *ob;
+ unsigned i;
+
+ mutex_lock(&c->write_points_hash_lock);
+ if (c->write_points_nr < old_nr) {
+ mutex_unlock(&c->write_points_hash_lock);
+ return true;
+ }
+
+ if (c->write_points_nr == 1 ||
+ !too_many_writepoints(c, 8)) {
+ mutex_unlock(&c->write_points_hash_lock);
+ return false;
+ }
+
+ wp = c->write_points + --c->write_points_nr;
+
+ hlist_del_rcu(&wp->node);
+ mutex_unlock(&c->write_points_hash_lock);
+
+ bch2_trans_mutex_lock_norelock(trans, &wp->lock);
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ open_bucket_free_unused(c, ob);
+ wp->ptrs.nr = 0;
+ mutex_unlock(&wp->lock);
+ return true;
+}
+
+static struct write_point *writepoint_find(struct btree_trans *trans,
+ unsigned long write_point)
+{
+ struct bch_fs *c = trans->c;
+ struct write_point *wp, *oldest;
+ struct hlist_head *head;
+
+ if (!(write_point & 1UL)) {
+ wp = (struct write_point *) write_point;
+ bch2_trans_mutex_lock_norelock(trans, &wp->lock);
+ return wp;
+ }
+
+ head = writepoint_hash(c, write_point);
+restart_find:
+ wp = __writepoint_find(head, write_point);
+ if (wp) {
+lock_wp:
+ bch2_trans_mutex_lock_norelock(trans, &wp->lock);
+ if (wp->write_point == write_point)
+ goto out;
+ mutex_unlock(&wp->lock);
+ goto restart_find;
+ }
+restart_find_oldest:
+ oldest = NULL;
+ for (wp = c->write_points;
+ wp < c->write_points + c->write_points_nr; wp++)
+ if (!oldest || time_before64(wp->last_used, oldest->last_used))
+ oldest = wp;
+
+ bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
+ bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
+ if (oldest >= c->write_points + c->write_points_nr ||
+ try_increase_writepoints(c)) {
+ mutex_unlock(&c->write_points_hash_lock);
+ mutex_unlock(&oldest->lock);
+ goto restart_find_oldest;
+ }
+
+ wp = __writepoint_find(head, write_point);
+ if (wp && wp != oldest) {
+ mutex_unlock(&c->write_points_hash_lock);
+ mutex_unlock(&oldest->lock);
+ goto lock_wp;
+ }
+
+ wp = oldest;
+ hlist_del_rcu(&wp->node);
+ wp->write_point = write_point;
+ hlist_add_head_rcu(&wp->node, head);
+ mutex_unlock(&c->write_points_hash_lock);
+out:
+ wp->last_used = local_clock();
+ return wp;
+}
+
+static noinline void
+deallocate_extra_replicas(struct bch_fs *c,
+ struct open_buckets *ptrs,
+ struct open_buckets *ptrs_no_use,
+ unsigned extra_replicas)
+{
+ struct open_buckets ptrs2 = { 0 };
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, ptrs, ob, i) {
+ unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
+
+ if (d && d <= extra_replicas) {
+ extra_replicas -= d;
+ ob_push(c, ptrs_no_use, ob);
+ } else {
+ ob_push(c, &ptrs2, ob);
+ }
+ }
+
+ *ptrs = ptrs2;
+}
+
+/*
+ * Get us an open_bucket we can allocate from, return with it locked:
+ */
+int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
+ unsigned target,
+ unsigned erasure_code,
+ struct write_point_specifier write_point,
+ struct bch_devs_list *devs_have,
+ unsigned nr_replicas,
+ unsigned nr_replicas_required,
+ enum bch_watermark watermark,
+ unsigned flags,
+ struct closure *cl,
+ struct write_point **wp_ret)
+{
+ struct bch_fs *c = trans->c;
+ struct write_point *wp;
+ struct open_bucket *ob;
+ struct open_buckets ptrs;
+ unsigned nr_effective, write_points_nr;
+ bool have_cache;
+ int ret;
+ int i;
+
+ if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
+ erasure_code = false;
+
+ BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
+
+ BUG_ON(!nr_replicas || !nr_replicas_required);
+retry:
+ ptrs.nr = 0;
+ nr_effective = 0;
+ write_points_nr = c->write_points_nr;
+ have_cache = false;
+
+ *wp_ret = wp = writepoint_find(trans, write_point.v);
+
+ /* metadata may not allocate on cache devices: */
+ if (wp->data_type != BCH_DATA_user)
+ have_cache = true;
+
+ if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+ target, erasure_code,
+ nr_replicas, &nr_effective,
+ &have_cache, watermark,
+ flags, NULL);
+ if (!ret ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto alloc_done;
+
+ /* Don't retry from all devices if we're out of open buckets: */
+ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
+ int ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+ target, erasure_code,
+ nr_replicas, &nr_effective,
+ &have_cache, watermark,
+ flags, cl);
+ if (!ret ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+ goto alloc_done;
+ }
+
+ /*
+ * Only try to allocate cache (durability = 0 devices) from the
+ * specified target:
+ */
+ have_cache = true;
+
+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+ 0, erasure_code,
+ nr_replicas, &nr_effective,
+ &have_cache, watermark,
+ flags, cl);
+ } else {
+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+ target, erasure_code,
+ nr_replicas, &nr_effective,
+ &have_cache, watermark,
+ flags, cl);
+ }
+alloc_done:
+ BUG_ON(!ret && nr_effective < nr_replicas);
+
+ if (erasure_code && !ec_open_bucket(c, &ptrs))
+ pr_debug("failed to get ec bucket: ret %u", ret);
+
+ if (ret == -BCH_ERR_insufficient_devices &&
+ nr_effective >= nr_replicas_required)
+ ret = 0;
+
+ if (ret)
+ goto err;
+
+ if (nr_effective > nr_replicas)
+ deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
+
+ /* Free buckets we didn't use: */
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ open_bucket_free_unused(c, ob);
+
+ wp->ptrs = ptrs;
+
+ wp->sectors_free = UINT_MAX;
+
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
+
+ BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
+
+ return 0;
+err:
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ if (ptrs.nr < ARRAY_SIZE(ptrs.v))
+ ob_push(c, &ptrs, ob);
+ else
+ open_bucket_free_unused(c, ob);
+ wp->ptrs = ptrs;
+
+ mutex_unlock(&wp->lock);
+
+ if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
+ try_decrease_writepoints(trans, write_points_nr))
+ goto retry;
+
+ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
+ bch2_err_matches(ret, BCH_ERR_freelist_empty))
+ return cl
+ ? -BCH_ERR_bucket_alloc_blocked
+ : -BCH_ERR_ENOSPC_bucket_alloc;
+
+ return ret;
+}
+
+struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+
+ return (struct bch_extent_ptr) {
+ .type = 1 << BCH_EXTENT_ENTRY_ptr,
+ .gen = ob->gen,
+ .dev = ob->dev,
+ .offset = bucket_to_sector(ca, ob->bucket) +
+ ca->mi.bucket_size -
+ ob->sectors_free,
+ };
+}
+
+void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
+ struct bkey_i *k, unsigned sectors,
+ bool cached)
+{
+ bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
+}
+
+/*
+ * We're done allocating from @wp: unlock it, and release any open_buckets that
+ * are now full
+ */
+void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
+{
+ bch2_alloc_sectors_done_inlined(c, wp);
+}
+
+static inline void writepoint_init(struct write_point *wp,
+ enum bch_data_type type)
+{
+ mutex_init(&wp->lock);
+ wp->data_type = type;
+
+ INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
+ INIT_LIST_HEAD(&wp->writes);
+ spin_lock_init(&wp->writes_lock);
+}
+
+void bch2_fs_allocator_foreground_init(struct bch_fs *c)
+{
+ struct open_bucket *ob;
+ struct write_point *wp;
+
+ mutex_init(&c->write_points_hash_lock);
+ c->write_points_nr = ARRAY_SIZE(c->write_points);
+
+ /* open bucket 0 is a sentinel NULL: */
+ spin_lock_init(&c->open_buckets[0].lock);
+
+ for (ob = c->open_buckets + 1;
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
+ spin_lock_init(&ob->lock);
+ c->open_buckets_nr_free++;
+
+ ob->freelist = c->open_buckets_freelist;
+ c->open_buckets_freelist = ob - c->open_buckets;
+ }
+
+ writepoint_init(&c->btree_write_point, BCH_DATA_btree);
+ writepoint_init(&c->rebalance_write_point, BCH_DATA_user);
+ writepoint_init(&c->copygc_write_point, BCH_DATA_user);
+
+ for (wp = c->write_points;
+ wp < c->write_points + c->write_points_nr; wp++) {
+ writepoint_init(wp, BCH_DATA_user);
+
+ wp->last_used = local_clock();
+ wp->write_point = (unsigned long) wp;
+ hlist_add_head_rcu(&wp->node,
+ writepoint_hash(c, wp->write_point));
+ }
+}
+
+static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ unsigned data_type = ob->data_type;
+ barrier(); /* READ_ONCE() doesn't work on bitfields */
+
+ prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u",
+ ob - c->open_buckets,
+ atomic_read(&ob->pin),
+ data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type",
+ ob->dev, ob->bucket, ob->gen,
+ ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
+ if (ob->ec)
+ prt_printf(out, " ec idx %llu", ob->ec->idx);
+ if (ob->on_partial_list)
+ prt_str(out, " partial");
+ prt_newline(out);
+}
+
+void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct open_bucket *ob;
+
+ out->atomic++;
+
+ for (ob = c->open_buckets;
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+ ob++) {
+ spin_lock(&ob->lock);
+ if (ob->valid && !ob->on_partial_list)
+ bch2_open_bucket_to_text(out, c, ob);
+ spin_unlock(&ob->lock);
+ }
+
+ --out->atomic;
+}
+
+void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ unsigned i;
+
+ out->atomic++;
+ spin_lock(&c->freelist_lock);
+
+ for (i = 0; i < c->open_buckets_partial_nr; i++)
+ bch2_open_bucket_to_text(out, c,
+ c->open_buckets + c->open_buckets_partial[i]);
+
+ spin_unlock(&c->freelist_lock);
+ --out->atomic;
+}
+
+static const char * const bch2_write_point_states[] = {
+#define x(n) #n,
+ WRITE_POINT_STATES()
+#undef x
+ NULL
+};
+
+static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
+ struct write_point *wp)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ prt_printf(out, "%lu: ", wp->write_point);
+ prt_human_readable_u64(out, wp->sectors_allocated);
+
+ prt_printf(out, " last wrote: ");
+ bch2_pr_time_units(out, sched_clock() - wp->last_used);
+
+ for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
+ prt_printf(out, " %s: ", bch2_write_point_states[i]);
+ bch2_pr_time_units(out, wp->time[i]);
+ }
+
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ bch2_open_bucket_to_text(out, c, ob);
+ printbuf_indent_sub(out, 2);
+}
+
+void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct write_point *wp;
+
+ prt_str(out, "Foreground write points\n");
+ for (wp = c->write_points;
+ wp < c->write_points + ARRAY_SIZE(c->write_points);
+ wp++)
+ bch2_write_point_to_text(out, c, wp);
+
+ prt_str(out, "Copygc write point\n");
+ bch2_write_point_to_text(out, c, &c->copygc_write_point);
+
+ prt_str(out, "Rebalance write point\n");
+ bch2_write_point_to_text(out, c, &c->rebalance_write_point);
+
+ prt_str(out, "Btree write point\n");
+ bch2_write_point_to_text(out, c, &c->btree_write_point);
+}
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
new file mode 100644
index 000000000000..7aaeec44c746
--- /dev/null
+++ b/fs/bcachefs/alloc_foreground.h
@@ -0,0 +1,224 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_FOREGROUND_H
+#define _BCACHEFS_ALLOC_FOREGROUND_H
+
+#include "bcachefs.h"
+#include "alloc_types.h"
+#include "extents.h"
+#include "sb-members.h"
+
+#include <linux/hash.h>
+
+struct bkey;
+struct bch_dev;
+struct bch_fs;
+struct bch_devs_list;
+
+extern const char * const bch2_watermarks[];
+
+void bch2_reset_alloc_cursors(struct bch_fs *);
+
+struct dev_alloc_list {
+ unsigned nr;
+ u8 devs[BCH_SB_MEMBERS_MAX];
+};
+
+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
+ struct dev_stripe_state *,
+ struct bch_devs_mask *);
+void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
+
+long bch2_bucket_alloc_new_fs(struct bch_dev *);
+
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
+ enum bch_watermark, struct closure *);
+
+static inline void ob_push(struct bch_fs *c, struct open_buckets *obs,
+ struct open_bucket *ob)
+{
+ BUG_ON(obs->nr >= ARRAY_SIZE(obs->v));
+
+ obs->v[obs->nr++] = ob - c->open_buckets;
+}
+
+#define open_bucket_for_each(_c, _obs, _ob, _i) \
+ for ((_i) = 0; \
+ (_i) < (_obs)->nr && \
+ ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \
+ (_i)++)
+
+static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
+ struct open_buckets *obs)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, obs, ob, i)
+ if (ob->ec)
+ return ob;
+
+ return NULL;
+}
+
+void bch2_open_bucket_write_error(struct bch_fs *,
+ struct open_buckets *, unsigned);
+
+void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
+
+static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
+{
+ if (atomic_dec_and_test(&ob->pin))
+ __bch2_open_bucket_put(c, ob);
+}
+
+static inline void bch2_open_buckets_put(struct bch_fs *c,
+ struct open_buckets *ptrs)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, ptrs, ob, i)
+ bch2_open_bucket_put(c, ob);
+ ptrs->nr = 0;
+}
+
+static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct write_point *wp)
+{
+ struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 };
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob);
+ wp->ptrs = keep;
+
+ mutex_unlock(&wp->lock);
+
+ bch2_open_buckets_put(c, &ptrs);
+}
+
+static inline void bch2_open_bucket_get(struct bch_fs *c,
+ struct write_point *wp,
+ struct open_buckets *ptrs)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, &wp->ptrs, ob, i) {
+ ob->data_type = wp->data_type;
+ atomic_inc(&ob->pin);
+ ob_push(c, ptrs, ob);
+ }
+}
+
+static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c,
+ unsigned dev, u64 bucket)
+{
+ return c->open_buckets_hash +
+ (jhash_3words(dev, bucket, bucket >> 32, 0) &
+ (OPEN_BUCKETS_COUNT - 1));
+}
+
+static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket)
+{
+ open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket);
+
+ while (slot) {
+ struct open_bucket *ob = &c->open_buckets[slot];
+
+ if (ob->dev == dev && ob->bucket == bucket)
+ return true;
+
+ slot = ob->hash;
+ }
+
+ return false;
+}
+
+static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket)
+{
+ bool ret;
+
+ if (bch2_bucket_is_open(c, dev, bucket))
+ return true;
+
+ spin_lock(&c->freelist_lock);
+ ret = bch2_bucket_is_open(c, dev, bucket);
+ spin_unlock(&c->freelist_lock);
+
+ return ret;
+}
+
+int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *,
+ struct dev_stripe_state *, struct bch_devs_mask *,
+ unsigned, unsigned *, bool *, unsigned,
+ enum bch_data_type, enum bch_watermark,
+ struct closure *);
+
+int bch2_alloc_sectors_start_trans(struct btree_trans *,
+ unsigned, unsigned,
+ struct write_point_specifier,
+ struct bch_devs_list *,
+ unsigned, unsigned,
+ enum bch_watermark,
+ unsigned,
+ struct closure *,
+ struct write_point **);
+
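+/*
+ * Rough usage sketch of the foreground allocation path (illustrative only:
+ * error handling, transaction restarts and real key construction are elided,
+ * and the variable names below are placeholders):
+ *
+ *	struct write_point *wp;
+ *	int ret = bch2_alloc_sectors_start_trans(trans, target, false,
+ *			writepoint_hashed((unsigned long) current),
+ *			&devs_have, nr_replicas, nr_replicas,
+ *			BCH_WATERMARK_normal, 0, NULL, &wp);
+ *
+ *	if (!ret) {
+ *		unsigned sectors = min(wp->sectors_free, wanted_sectors);
+ *
+ *		bch2_alloc_sectors_append_ptrs(c, wp, &insert->k_i, sectors, false);
+ *		bch2_alloc_sectors_done(c, wp);		(this unlocks wp)
+ *	}
+ */
+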
+struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *);
+
+/*
+ * Append pointers to the space we just allocated to @k, and mark @sectors space
+ * as allocated out of @ob
+ */
+static inline void
+bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp,
+ struct bkey_i *k, unsigned sectors,
+ bool cached)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ BUG_ON(sectors > wp->sectors_free);
+ wp->sectors_free -= sectors;
+ wp->sectors_allocated += sectors;
+
+ open_bucket_for_each(c, &wp->ptrs, ob, i) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob);
+
+ ptr.cached = cached ||
+ (!ca->mi.durability &&
+ wp->data_type == BCH_DATA_user);
+
+ bch2_bkey_append_ptr(k, ptr);
+
+ BUG_ON(sectors > ob->sectors_free);
+ ob->sectors_free -= sectors;
+ }
+}
+
+void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
+ struct bkey_i *, unsigned, bool);
+void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
+
+void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool);
+
+static inline struct write_point_specifier writepoint_hashed(unsigned long v)
+{
+ return (struct write_point_specifier) { .v = v | 1 };
+}
+
+static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
+{
+ return (struct write_point_specifier) { .v = (unsigned long) wp };
+}
+
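+/*
+ * Note: writepoint_find() tells these two kinds of specifier apart by bit 0:
+ * writepoint_hashed() sets it, while writepoint_ptr() passes a (suitably
+ * aligned) write_point pointer through with bit 0 clear.
+ */
+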
+void bch2_fs_allocator_foreground_init(struct bch_fs *);
+
+void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
+void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
+
+#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
new file mode 100644
index 000000000000..b91b7a461056
--- /dev/null
+++ b/fs/bcachefs/alloc_types.h
@@ -0,0 +1,126 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_TYPES_H
+#define _BCACHEFS_ALLOC_TYPES_H
+
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+
+#include "clock_types.h"
+#include "fifo.h"
+
+struct bucket_alloc_state {
+ u64 buckets_seen;
+ u64 skipped_open;
+ u64 skipped_need_journal_commit;
+ u64 skipped_nocow;
+ u64 skipped_nouse;
+};
+
+#define BCH_WATERMARKS() \
+ x(stripe) \
+ x(normal) \
+ x(copygc) \
+ x(btree) \
+ x(btree_copygc) \
+ x(reclaim)
+
+enum bch_watermark {
+#define x(name) BCH_WATERMARK_##name,
+ BCH_WATERMARKS()
+#undef x
+ BCH_WATERMARK_NR,
+};
+
+#define BCH_WATERMARK_BITS 3
+#define BCH_WATERMARK_MASK ~(~0U << BCH_WATERMARK_BITS)
+
+#define OPEN_BUCKETS_COUNT 1024
+
+#define WRITE_POINT_HASH_NR 32
+#define WRITE_POINT_MAX 32
+
+/*
+ * 0 is never a valid open_bucket_idx_t:
+ */
+typedef u16 open_bucket_idx_t;
+
+struct open_bucket {
+ spinlock_t lock;
+ atomic_t pin;
+ open_bucket_idx_t freelist;
+ open_bucket_idx_t hash;
+
+ /*
+ * When an open bucket has an ec_stripe attached, this is the index of
+ * the block in the stripe this open_bucket corresponds to:
+ */
+ u8 ec_idx;
+ enum bch_data_type data_type:6;
+ unsigned valid:1;
+ unsigned on_partial_list:1;
+
+ u8 dev;
+ u8 gen;
+ u32 sectors_free;
+ u64 bucket;
+ struct ec_stripe_new *ec;
+};
+
+#define OPEN_BUCKET_LIST_MAX 15
+
+struct open_buckets {
+ open_bucket_idx_t nr;
+ open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX];
+};
+
+struct dev_stripe_state {
+ u64 next_alloc[BCH_SB_MEMBERS_MAX];
+};
+
+#define WRITE_POINT_STATES() \
+ x(stopped) \
+ x(waiting_io) \
+ x(waiting_work) \
+ x(running)
+
+enum write_point_state {
+#define x(n) WRITE_POINT_##n,
+ WRITE_POINT_STATES()
+#undef x
+ WRITE_POINT_STATE_NR
+};
+
+struct write_point {
+ struct {
+ struct hlist_node node;
+ struct mutex lock;
+ u64 last_used;
+ unsigned long write_point;
+ enum bch_data_type data_type;
+
+ /* calculated based on how many pointers we're actually going to use: */
+ unsigned sectors_free;
+
+ struct open_buckets ptrs;
+ struct dev_stripe_state stripe;
+
+ u64 sectors_allocated;
+ } __aligned(SMP_CACHE_BYTES);
+
+ struct {
+ struct work_struct index_update_work;
+
+ struct list_head writes;
+ spinlock_t writes_lock;
+
+ enum write_point_state state;
+ u64 last_state_change;
+ u64 time[WRITE_POINT_STATE_NR];
+ } __aligned(SMP_CACHE_BYTES);
+};
+
+struct write_point_specifier {
+ unsigned long v;
+};
+
+#endif /* _BCACHEFS_ALLOC_TYPES_H */
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
new file mode 100644
index 000000000000..23c0834a97a4
--- /dev/null
+++ b/fs/bcachefs/backpointers.c
@@ -0,0 +1,860 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bbpos.h"
+#include "alloc_background.h"
+#include "backpointers.h"
+#include "btree_cache.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
+#include "error.h"
+
+#include <linux/mm.h>
+
+static bool extent_matches_bp(struct bch_fs *c,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k,
+ struct bpos bucket,
+ struct bch_backpointer bp)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bpos bucket2;
+ struct bch_backpointer bp2;
+
+ if (p.ptr.cached)
+ continue;
+
+ bch2_extent_ptr_to_bp(c, btree_id, level, k, p,
+ &bucket2, &bp2);
+ if (bpos_eq(bucket, bucket2) &&
+ !memcmp(&bp, &bp2, sizeof(bp)))
+ return true;
+ }
+
+ return false;
+}
+
+int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+ struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
+ int ret = 0;
+
+ bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)),
+ c, err,
+ backpointer_pos_wrong,
+ "backpointer at wrong pos");
+fsck_err:
+ return ret;
+}
+
+void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp)
+{
+ prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=",
+ bch2_btree_id_str(bp->btree_id),
+ bp->level,
+ (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT),
+ (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
+ bp->bucket_len);
+ bch2_bpos_to_text(out, bp->pos);
+}
+
+void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ prt_str(out, "bucket=");
+ bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p));
+ prt_str(out, " ");
+
+ bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v);
+}
+
+void bch2_backpointer_swab(struct bkey_s k)
+{
+ struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);
+
+ bp.v->bucket_offset = swab40(bp.v->bucket_offset);
+ bp.v->bucket_len = swab32(bp.v->bucket_len);
+ bch2_bpos_swab(&bp.v->pos);
+}
+
+static noinline int backpointer_mod_err(struct btree_trans *trans,
+ struct bch_backpointer bp,
+ struct bkey_s_c bp_k,
+ struct bkey_s_c orig_k,
+ bool insert)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ if (insert) {
+ prt_printf(&buf, "existing backpointer found when inserting ");
+ bch2_backpointer_to_text(&buf, &bp);
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+
+ prt_printf(&buf, "found ");
+ bch2_bkey_val_to_text(&buf, c, bp_k);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "for ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+
+ bch_err(c, "%s", buf.buf);
+ } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
+ prt_printf(&buf, "backpointer not found when deleting");
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+
+ prt_printf(&buf, "searching for ");
+ bch2_backpointer_to_text(&buf, &bp);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "got ");
+ bch2_bkey_val_to_text(&buf, c, bp_k);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "for ");
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+
+ bch_err(c, "%s", buf.buf);
+ }
+
+ printbuf_exit(&buf);
+
+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
+ bch2_inconsistent_error(c);
+ return -EIO;
+ } else {
+ return 0;
+ }
+}
+
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
+ struct bkey_i_backpointer *bp_k,
+ struct bch_backpointer bp,
+ struct bkey_s_c orig_k,
+ bool insert)
+{
+ struct btree_iter bp_iter;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
+ bp_k->k.p,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_WITH_UPDATES);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (insert
+ ? k.k->type
+ : (k.k->type != KEY_TYPE_backpointer ||
+ memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp)))) {
+ ret = backpointer_mod_err(trans, bp, k, orig_k, insert);
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0);
+err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ return ret;
+}
+
+/*
+ * Find the next backpointer >= *bp_pos:
+ */
+int bch2_get_next_backpointer(struct btree_trans *trans,
+ struct bpos bucket, int gen,
+ struct bpos *bp_pos,
+ struct bch_backpointer *bp,
+ unsigned iter_flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bpos bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0);
+ struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL };
+ struct bkey_s_c k;
+ int ret = 0;
+
+ if (bpos_ge(*bp_pos, bp_end_pos))
+ goto done;
+
+ if (gen >= 0) {
+ k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc,
+ bucket, BTREE_ITER_CACHED|iter_flags);
+ ret = bkey_err(k);
+ if (ret)
+ goto out;
+
+ if (k.k->type != KEY_TYPE_alloc_v4 ||
+ bkey_s_c_to_alloc_v4(k).v->gen != gen)
+ goto done;
+ }
+
+ *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(c, bucket, 0));
+
+ for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers,
+ *bp_pos, iter_flags, k, ret) {
+ if (bpos_ge(k.k->p, bp_end_pos))
+ break;
+
+ *bp_pos = k.k->p;
+ *bp = *bkey_s_c_to_backpointer(k).v;
+ goto out;
+ }
+done:
+ *bp_pos = SPOS_MAX;
+out:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ return ret;
+}
+
+static void backpointer_not_found(struct btree_trans *trans,
+ struct bpos bp_pos,
+ struct bch_backpointer bp,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+
+ /*
+ * If we're using the btree write buffer, the backpointer we were
+ * looking at may have already been deleted - failure to find what it
+ * pointed to is not an error:
+ */
+ if (likely(!bch2_backpointers_no_use_write_buffer))
+ return;
+
+ prt_printf(&buf, "backpointer doesn't match %s it points to:\n ",
+ bp.level ? "btree node" : "extent");
+ prt_printf(&buf, "bucket: ");
+ bch2_bpos_to_text(&buf, bucket);
+ prt_printf(&buf, "\n ");
+
+ prt_printf(&buf, "backpointer pos: ");
+ bch2_bpos_to_text(&buf, bp_pos);
+ prt_printf(&buf, "\n ");
+
+ bch2_backpointer_to_text(&buf, &bp);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers)
+ bch_err_ratelimited(c, "%s", buf.buf);
+ else
+ bch2_trans_inconsistent(trans, "%s", buf.buf);
+
+ printbuf_exit(&buf);
+}
+
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos bp_pos,
+ struct bch_backpointer bp,
+ unsigned iter_flags)
+{
+ if (likely(!bp.level)) {
+ struct bch_fs *c = trans->c;
+ struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+ struct bkey_s_c k;
+
+ bch2_trans_node_iter_init(trans, iter,
+ bp.btree_id,
+ bp.pos,
+ 0, 0,
+ iter_flags);
+ k = bch2_btree_iter_peek_slot(iter);
+ if (bkey_err(k)) {
+ bch2_trans_iter_exit(trans, iter);
+ return k;
+ }
+
+ if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
+ return k;
+
+ bch2_trans_iter_exit(trans, iter);
+ backpointer_not_found(trans, bp_pos, bp, k);
+ return bkey_s_c_null;
+ } else {
+ struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
+
+ if (IS_ERR_OR_NULL(b)) {
+ bch2_trans_iter_exit(trans, iter);
+ return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null;
+ }
+ return bkey_i_to_s_c(&b->key);
+ }
+}
+
+struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos bp_pos,
+ struct bch_backpointer bp)
+{
+ struct bch_fs *c = trans->c;
+ struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+ struct btree *b;
+
+ BUG_ON(!bp.level);
+
+ bch2_trans_node_iter_init(trans, iter,
+ bp.btree_id,
+ bp.pos,
+ 0,
+ bp.level - 1,
+ 0);
+ b = bch2_btree_iter_peek_node(iter);
+ if (IS_ERR_OR_NULL(b))
+ goto err;
+
+ BUG_ON(b->c.level != bp.level - 1);
+
+ if (extent_matches_bp(c, bp.btree_id, bp.level,
+ bkey_i_to_s_c(&b->key),
+ bucket, bp))
+ return b;
+
+ if (btree_node_will_make_reachable(b)) {
+ b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
+ } else {
+ backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key));
+ b = NULL;
+ }
+err:
+ bch2_trans_iter_exit(trans, iter);
+ return b;
+}
+
+static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter alloc_iter = { NULL };
+ struct bkey_s_c alloc_k;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c,
+ backpointer_to_missing_device,
+ "backpointer for missing device:\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, bp_iter, 0);
+ goto out;
+ }
+
+ alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc,
+ bp_pos_to_bucket(c, k.k->p), 0);
+ ret = bkey_err(alloc_k);
+ if (ret)
+ goto out;
+
+ if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c,
+ backpointer_to_missing_alloc,
+ "backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
+ alloc_iter.pos.inode, alloc_iter.pos.offset,
+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, bp_iter, 0);
+ goto out;
+ }
+out:
+fsck_err:
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/* verify that every backpointer has a corresponding alloc key */
+int bch2_check_btree_backpointers(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_backpointers, POS_MIN, 0, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ bch2_check_btree_backpointer(trans, &iter, k)));
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+struct bpos_level {
+ unsigned level;
+ struct bpos pos;
+};
+
+static int check_bp_exists(struct btree_trans *trans,
+ struct bpos bucket,
+ struct bch_backpointer bp,
+ struct bkey_s_c orig_k,
+ struct bpos bucket_start,
+ struct bpos bucket_end,
+ struct bpos_level *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter bp_iter = { NULL };
+ struct printbuf buf = PRINTBUF;
+ struct bkey_s_c bp_k;
+ int ret;
+
+ if (bpos_lt(bucket, bucket_start) ||
+ bpos_gt(bucket, bucket_end))
+ return 0;
+
+ if (!bch2_dev_bucket_exists(c, bucket))
+ goto missing;
+
+ bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp(c, bucket, bp.bucket_offset),
+ 0);
+ ret = bkey_err(bp_k);
+ if (ret)
+ goto err;
+
+ if (bp_k.k->type != KEY_TYPE_backpointer ||
+ memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) {
+ if (last_flushed->level != bp.level ||
+ !bpos_eq(last_flushed->pos, orig_k.k->p)) {
+ last_flushed->level = bp.level;
+ last_flushed->pos = orig_k.k->p;
+
+ ret = bch2_btree_write_buffer_flush_sync(trans) ?:
+ -BCH_ERR_transaction_restart_write_buffer_flush;
+ goto out;
+ }
+ goto missing;
+ }
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ printbuf_exit(&buf);
+ return ret;
+missing:
+ prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
+ bch2_btree_id_str(bp.btree_id), bp.level);
+ bch2_bkey_val_to_text(&buf, c, orig_k);
+ prt_printf(&buf, "\nbp pos ");
+ bch2_bpos_to_text(&buf, bp_iter.pos);
+
+ if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointers ||
+ c->opts.reconstruct_alloc ||
+ fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
+ ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
+
+ goto out;
+}
+
+static int check_extent_to_backpointers(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos bucket_start,
+ struct bpos bucket_end,
+ struct bpos_level *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs;
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_btree_iter_peek_all_levels(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+ if (!k.k)
+ return 0;
+
+ ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bpos bucket_pos;
+ struct bch_backpointer bp;
+
+ if (p.ptr.cached)
+ continue;
+
+ bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level,
+ k, p, &bucket_pos, &bp);
+
+ ret = check_bp_exists(trans, bucket_pos, bp, k,
+ bucket_start, bucket_end,
+ last_flushed);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int check_btree_root_to_backpointers(struct btree_trans *trans,
+ enum btree_id btree_id,
+ struct bpos bucket_start,
+ struct bpos bucket_end,
+ struct bpos_level *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_root *r = bch2_btree_id_root(c, btree_id);
+ struct btree_iter iter;
+ struct btree *b;
+ struct bkey_s_c k;
+ struct bkey_ptrs_c ptrs;
+ struct extent_ptr_decoded p;
+ const union bch_extent_entry *entry;
+ int ret;
+
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, r->level, 0);
+ b = bch2_btree_iter_peek_node(&iter);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto err;
+
+ BUG_ON(b != btree_node_root(c, b));
+
+ k = bkey_i_to_s_c(&b->key);
+ ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ struct bpos bucket_pos;
+ struct bch_backpointer bp;
+
+ if (p.ptr.cached)
+ continue;
+
+ bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1,
+ k, p, &bucket_pos, &bp);
+
+ ret = check_bp_exists(trans, bucket_pos, bp, k,
+ bucket_start, bucket_end,
+ last_flushed);
+ if (ret)
+ goto err;
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
+{
+ return (struct bbpos) {
+ .btree = bp.btree_id,
+ .pos = bp.pos,
+ };
+}
+
+static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
+{
+ struct sysinfo i;
+ u64 mem_bytes;
+
+ si_meminfo(&i);
+ mem_bytes = i.totalram * i.mem_unit;
+ return div_u64(mem_bytes >> 1, btree_bytes(c));
+}
+
+static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
+ unsigned btree_leaf_mask,
+ unsigned btree_interior_mask,
+ struct bbpos start, struct bbpos *end)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
+ enum btree_id btree;
+ int ret = 0;
+
+ for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) {
+ unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2;
+
+ if (!((1U << btree) & btree_leaf_mask) &&
+ !((1U << btree) & btree_interior_mask))
+ continue;
+
+ bch2_trans_node_iter_init(trans, &iter, btree,
+ btree == start.btree ? start.pos : POS_MIN,
+ 0, depth, 0);
+ /*
+ * for_each_btree_key_continue() doesn't check the return value
+ * from bch2_btree_iter_advance(), which is needed when
+ * iterating over interior nodes where we'll see keys at
+ * SPOS_MAX:
+ */
+ do {
+ k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0);
+ ret = bkey_err(k);
+ if (!k.k || ret)
+ break;
+
+ --btree_nodes;
+ if (!btree_nodes) {
+ *end = BBPOS(btree, k.k->p);
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+ }
+ } while (bch2_btree_iter_advance(&iter));
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ *end = BBPOS_MAX;
+ return ret;
+}
+
+static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
+ struct bpos bucket_start,
+ struct bpos bucket_end)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ enum btree_id btree_id;
+ struct bpos_level last_flushed = { UINT_MAX, POS_MIN };
+ int ret = 0;
+
+ for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) {
+ unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
+
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
+ depth,
+ BTREE_ITER_ALL_LEVELS|
+ BTREE_ITER_PREFETCH);
+
+ do {
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_extent_to_backpointers(trans, &iter,
+ bucket_start, bucket_end,
+ &last_flushed));
+ if (ret)
+ break;
+ } while (!bch2_btree_iter_advance(&iter));
+
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ break;
+
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_btree_root_to_backpointers(trans, btree_id,
+ bucket_start, bucket_end,
+ &last_flushed));
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c,
+ struct bpos bucket)
+{
+ return bch2_dev_exists2(c, bucket.inode)
+ ? bucket_pos_to_bp(c, bucket, 0)
+ : bucket;
+}
+
+static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
+ struct bpos start, struct bpos *end)
+{
+ struct btree_iter alloc_iter;
+ struct btree_iter bp_iter;
+ struct bkey_s_c alloc_k, bp_k;
+ size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
+ bool alloc_end = false, bp_end = false;
+ int ret = 0;
+
+ bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
+ start, 0, 1, 0);
+ bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
+ bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0);
+ while (1) {
+ alloc_k = !alloc_end
+ ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0)
+ : bkey_s_c_null;
+ bp_k = !bp_end
+ ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0)
+ : bkey_s_c_null;
+
+ ret = bkey_err(alloc_k) ?: bkey_err(bp_k);
+ if ((!alloc_k.k && !bp_k.k) || ret) {
+ *end = SPOS_MAX;
+ break;
+ }
+
+ --btree_nodes;
+ if (!btree_nodes) {
+ *end = alloc_k.k ? alloc_k.k->p : SPOS_MAX;
+ break;
+ }
+
+ if (bpos_lt(alloc_iter.pos, SPOS_MAX) &&
+ bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) {
+ if (!bch2_btree_iter_advance(&alloc_iter))
+ alloc_end = true;
+ } else {
+ if (!bch2_btree_iter_advance(&bp_iter))
+ bp_end = true;
+ }
+ }
+ bch2_trans_iter_exit(trans, &bp_iter);
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ return ret;
+}
+
+int bch2_check_extents_to_backpointers(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bpos start = POS_MIN, end;
+ int ret;
+
+ while (1) {
+ ret = bch2_get_alloc_in_memory_pos(trans, start, &end);
+ if (ret)
+ break;
+
+ if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX))
+ bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
+ __func__, btree_nodes_fit_in_ram(c));
+
+ if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "check_extents_to_backpointers(): ");
+ bch2_bpos_to_text(&buf, start);
+ prt_str(&buf, "-");
+ bch2_bpos_to_text(&buf, end);
+
+ bch_verbose(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ ret = bch2_check_extents_to_backpointers_pass(trans, start, end);
+ if (ret || bpos_eq(end, SPOS_MAX))
+ break;
+
+ start = bpos_successor(end);
+ }
+ bch2_trans_put(trans);
+
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int check_one_backpointer(struct btree_trans *trans,
+ struct bbpos start,
+ struct bbpos end,
+ struct bkey_s_c_backpointer bp,
+ struct bpos *last_flushed_pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bbpos pos = bp_to_bbpos(*bp.v);
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ if (bbpos_cmp(pos, start) < 0 ||
+ bbpos_cmp(pos, end) > 0)
+ return 0;
+
+ k = bch2_backpointer_get_key(trans, &iter, bp.k->p, *bp.v, 0);
+ ret = bkey_err(k);
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+ return 0;
+ if (ret)
+ return ret;
+
+ if (!k.k && !bpos_eq(*last_flushed_pos, bp.k->p)) {
+ *last_flushed_pos = bp.k->p;
+ ret = bch2_btree_write_buffer_flush_sync(trans) ?:
+ -BCH_ERR_transaction_restart_write_buffer_flush;
+ goto out;
+ }
+
+ if (fsck_err_on(!k.k, c,
+ backpointer_to_missing_ptr,
+ "backpointer for missing %s\n %s",
+ bp.v->level ? "btree node" : "extent",
+ (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
+ ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p);
+ goto out;
+ }
+out:
+fsck_err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
+ struct bbpos start,
+ struct bbpos end)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bpos last_flushed_pos = SPOS_MAX;
+
+ return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
+ POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_one_backpointer(trans, start, end,
+ bkey_s_c_to_backpointer(k),
+ &last_flushed_pos));
+}
+
+int bch2_check_backpointers_to_extents(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end;
+ int ret;
+
+ while (1) {
+ ret = bch2_get_btree_in_memory_pos(trans,
+ (1U << BTREE_ID_extents)|
+ (1U << BTREE_ID_reflink),
+ ~0,
+ start, &end);
+ if (ret)
+ break;
+
+ if (!bbpos_cmp(start, BBPOS_MIN) &&
+ bbpos_cmp(end, BBPOS_MAX))
+ bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass",
+ __func__, btree_nodes_fit_in_ram(c));
+
+ if (bbpos_cmp(start, BBPOS_MIN) ||
+ bbpos_cmp(end, BBPOS_MAX)) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "check_backpointers_to_extents(): ");
+ bch2_bbpos_to_text(&buf, start);
+ prt_str(&buf, "-");
+ bch2_bbpos_to_text(&buf, end);
+
+ bch_verbose(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ ret = bch2_check_backpointers_to_extents_pass(trans, start, end);
+ if (ret || !bbpos_cmp(end, BBPOS_MAX))
+ break;
+
+ start = bbpos_successor(end);
+ }
+ bch2_trans_put(trans);
+
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
new file mode 100644
index 000000000000..ab866feeaf66
--- /dev/null
+++ b/fs/bcachefs/backpointers.h
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "super.h"
+
+static inline u64 swab40(u64 x)
+{
+ return (((x & 0x00000000ffULL) << 32)|
+ ((x & 0x000000ff00ULL) << 16)|
+ ((x & 0x0000ff0000ULL) >> 0)|
+ ((x & 0x00ff000000ULL) >> 16)|
+ ((x & 0xff00000000ULL) >> 32));
+}
+
+int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *);
+void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_backpointer_swab(struct bkey_s);
+
+#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \
+ .key_invalid = bch2_backpointer_invalid, \
+ .val_to_text = bch2_backpointer_k_to_text, \
+ .swab = bch2_backpointer_swab, \
+ .min_val_size = 32, \
+})
+
+#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10
+
+/*
+ * Convert from pos in backpointer btree to pos of corresponding bucket in alloc
+ * btree:
+ */
+static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c,
+ struct bpos bp_pos)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode);
+ u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
+
+ return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
+}
+
+/*
+ * Convert from pos in alloc btree + bucket offset to pos in backpointer btree:
+ */
+static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
+ struct bpos bucket,
+ u64 bucket_offset)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
+ struct bpos ret;
+
+ ret = POS(bucket.inode,
+ (bucket_to_sector(ca, bucket.offset) <<
+ MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
+
+ EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret)));
+
+ return ret;
+}
+
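+/*
+ * Worked example of the mapping above (illustrative; assumes 512-sector
+ * buckets): bucket (inode=2, offset=10) at bucket_offset=3 maps to
+ * backpointer pos (2, ((10 * 512) << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + 3)
+ * = (2, 5242883), and bp_pos_to_bucket() inverts it: 5242883 >> 10 = 5120
+ * sectors = bucket 10.
+ */
+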
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bkey_i_backpointer *,
+ struct bch_backpointer, struct bkey_s_c, bool);
+
+static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
+ struct bpos bucket,
+ struct bch_backpointer bp,
+ struct bkey_s_c orig_k,
+ bool insert)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_backpointer *bp_k;
+ int ret;
+
+ bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer));
+ ret = PTR_ERR_OR_ZERO(bp_k);
+ if (ret)
+ return ret;
+
+ bkey_backpointer_init(&bp_k->k_i);
+ bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset);
+ bp_k->v = bp;
+
+ if (!insert) {
+ bp_k->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&bp_k->k, 0);
+ }
+
+ if (unlikely(bch2_backpointers_no_use_write_buffer))
+ return bch2_bucket_backpointer_mod_nowritebuffer(trans, bp_k, bp, orig_k, insert);
+
+ return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i);
+}
+
+static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p)
+{
+ return level ? BCH_DATA_btree :
+ p.has_ec ? BCH_DATA_stripe :
+ BCH_DATA_user;
+}
+
+static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p,
+ struct bpos *bucket_pos, struct bch_backpointer *bp)
+{
+ enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
+ s64 sectors = level ? btree_sectors(c) : k.k->size;
+ u32 bucket_offset;
+
+ *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset);
+ *bp = (struct bch_backpointer) {
+ .btree_id = btree_id,
+ .level = level,
+ .data_type = data_type,
+ .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) +
+ p.crc.offset,
+ .bucket_len = ptr_disk_sectors(sectors, p),
+ .pos = k.k->p,
+ };
+}
+
+int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int,
+ struct bpos *, struct bch_backpointer *, unsigned);
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *,
+ struct bpos, struct bch_backpointer,
+ unsigned);
+struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *,
+ struct bpos, struct bch_backpointer);
+
+int bch2_check_btree_backpointers(struct bch_fs *);
+int bch2_check_extents_to_backpointers(struct bch_fs *);
+int bch2_check_backpointers_to_extents(struct bch_fs *);
+
+#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */
diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h
new file mode 100644
index 000000000000..be2edced5213
--- /dev/null
+++ b/fs/bcachefs/bbpos.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BBPOS_H
+#define _BCACHEFS_BBPOS_H
+
+#include "bbpos_types.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+
+static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
+{
+ return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos);
+}
+
+static inline struct bbpos bbpos_successor(struct bbpos pos)
+{
+ if (bpos_cmp(pos.pos, SPOS_MAX)) {
+ pos.pos = bpos_successor(pos.pos);
+ return pos;
+ }
+
+ if (pos.btree != BTREE_ID_NR) {
+ pos.btree++;
+ pos.pos = POS_MIN;
+ return pos;
+ }
+
+ BUG();
+}
+
+static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos)
+{
+ prt_str(out, bch2_btree_id_str(pos.btree));
+ prt_char(out, ':');
+ bch2_bpos_to_text(out, pos.pos);
+}
+
+#endif /* _BCACHEFS_BBPOS_H */
diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h
new file mode 100644
index 000000000000..5198e94cf3b8
--- /dev/null
+++ b/fs/bcachefs/bbpos_types.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BBPOS_TYPES_H
+#define _BCACHEFS_BBPOS_TYPES_H
+
+struct bbpos {
+ enum btree_id btree;
+ struct bpos pos;
+};
+
+static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
+{
+ return (struct bbpos) { btree, pos };
+}
+
+#define BBPOS_MIN BBPOS(0, POS_MIN)
+#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX)
+
+#endif /* _BCACHEFS_BBPOS_TYPES_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
new file mode 100644
index 000000000000..b62737fdf5ab
--- /dev/null
+++ b/fs/bcachefs/bcachefs.h
@@ -0,0 +1,1164 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_H
+#define _BCACHEFS_H
+
+/*
+ * SOME HIGH LEVEL CODE DOCUMENTATION:
+ *
+ * Bcache mostly works with cache sets, cache devices, and backing devices.
+ *
+ * Support for multiple cache devices hasn't quite been finished off yet, but
+ * it's about 95% plumbed through. A cache set and its cache devices is sort of
+ * like a md raid array and its component devices. Most of the code doesn't care
+ * about individual cache devices, the main abstraction is the cache set.
+ *
+ * Multiple cache devices is intended to give us the ability to mirror dirty
+ * cached data and metadata, without mirroring clean cached data.
+ *
+ * Backing devices are different, in that they have a lifetime independent of a
+ * cache set. When you register a newly formatted backing device it'll come up
+ * in passthrough mode, and then you can attach and detach a backing device from
+ * a cache set at runtime - while it's mounted and in use. Detaching implicitly
+ * invalidates any cached data for that backing device.
+ *
+ * A cache set can have multiple (many) backing devices attached to it.
+ *
+ * There's also flash only volumes - this is the reason for the distinction
+ * between struct cached_dev and struct bcache_device. A flash only volume
+ * works much like a bcache device that has a backing device, except the
+ * "cached" data is always dirty. The end result is that we get thin
+ * provisioning with very little additional code.
+ *
+ * Flash only volumes work but they're not production ready because the moving
+ * garbage collector needs more work. More on that later.
+ *
+ * BUCKETS/ALLOCATION:
+ *
+ * Bcache is primarily designed for caching, which means that in normal
+ * operation all of our available space will be allocated. Thus, we need an
+ * efficient way of deleting things from the cache so we can write new things to
+ * it.
+ *
+ * To do this, we first divide the cache device up into buckets. A bucket is the
+ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
+ * works efficiently.
+ *
+ * Each bucket has a 16 bit priority, and an 8 bit generation associated with
+ * it. The gens and priorities for all the buckets are stored contiguously and
+ * packed on disk (in a linked list of buckets - aside from the superblock, all
+ * of bcache's metadata is stored in buckets).
+ *
+ * The priority is used to implement an LRU. We reset a bucket's priority when
+ * we allocate it or on a cache hit, and every so often we decrement the priority
+ * of each bucket. It could be used to implement something more sophisticated,
+ * if anyone ever gets around to it.
+ *
+ * The generation is used for invalidating buckets. Each pointer also has an 8
+ * bit generation embedded in it; for a pointer to be considered valid, its gen
+ * must match the gen of the bucket it points into. Thus, to reuse a bucket all
+ * we have to do is increment its gen (and write its new gen to disk; we batch
+ * this up).
+ *
+ * Bcache is entirely COW - we never write twice to a bucket, even buckets that
+ * contain metadata (including btree nodes).
+ *
+ * THE BTREE:
+ *
+ * Bcache is in large part designed around the btree.
+ *
+ * At a high level, the btree is just an index of key -> ptr tuples.
+ *
+ * Keys represent extents, and thus have a size field. Keys also have a variable
+ * number of pointers attached to them (potentially zero, which is handy for
+ * invalidating the cache).
+ *
+ * The key itself is an inode:offset pair. The inode number corresponds to a
+ * backing device or a flash only volume. The offset is the ending offset of the
+ * extent within the inode - not the starting offset; this makes lookups
+ * slightly more convenient.
+ *
+ * Pointers contain the cache device id, the offset on that device, and an 8 bit
+ * generation number. More on the gen later.
+ *
+ * Index lookups are not fully abstracted - cache lookups in particular are
+ * still somewhat mixed in with the btree code, but things are headed in that
+ * direction.
+ *
+ * Updates are fairly well abstracted, though. There are two different ways of
+ * updating the btree; insert and replace.
+ *
+ * BTREE_INSERT will just take a list of keys and insert them into the btree -
+ * overwriting (possibly only partially) any extents they overlap with. This is
+ * used to update the index after a write.
+ *
+ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
+ * overwriting a key that matches another given key. This is used for inserting
+ * data into the cache after a cache miss, and for background writeback, and for
+ * the moving garbage collector.
+ *
+ * There is no "delete" operation; deleting things from the index is
+ * accomplished either by invalidating pointers (by incrementing a bucket's
+ * gen) or by inserting a key with 0 pointers - which will overwrite anything
+ * previously present at that location in the index.
+ *
+ * This means that there are always stale/invalid keys in the btree. They're
+ * filtered out by the code that iterates through a btree node, and removed when
+ * a btree node is rewritten.
+ *
+ * BTREE NODES:
+ *
+ * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
+ * free smaller than a bucket - so, that's how big our btree nodes are.
+ *
+ * (If buckets are really big we'll only use part of the bucket for a btree node
+ * - no less than 1/4th - but a bucket still contains no more than a single
+ * btree node. I'd actually like to change this, but for now we rely on the
+ * bucket's gen for deleting btree nodes when we rewrite/split a node.)
+ *
+ * Anyways, btree nodes are big - big enough to be inefficient with a textbook
+ * btree implementation.
+ *
+ * The way this is solved is that btree nodes are internally log structured; we
+ * can append new keys to an existing btree node without rewriting it. This
+ * means each set of keys we write is sorted, but the node is not.
+ *
+ * We maintain this log structure in memory - keeping 1Mb of keys sorted would
+ * be expensive, and we have to distinguish between the keys we have written and
+ * the keys we haven't. So to do a lookup in a btree node, we have to search
+ * each sorted set. But we do merge written sets together lazily, so the cost of
+ * these extra searches is quite low (normally most of the keys in a btree node
+ * will be in one big set, and then there'll be one or two sets that are much
+ * smaller).
+ *
+ * This log structure makes bcache's btree more of a hybrid between a
+ * conventional btree and a compacting data structure, with some of the
+ * advantages of both.
+ *
+ * GARBAGE COLLECTION:
+ *
+ * We can't just invalidate any bucket - it might contain dirty data or
+ * metadata. If it once contained dirty data, other writes might overwrite it
+ * later, leaving no valid pointers into that bucket in the index.
+ *
+ * Thus, the primary purpose of garbage collection is to find buckets to reuse.
+ * It also counts how much valid data each bucket currently contains, so that
+ * allocation can reuse buckets sooner when they've been mostly overwritten.
+ *
+ * It also does some things that are really internal to the btree
+ * implementation. If a btree node contains pointers that are stale by more than
+ * some threshold, it rewrites the btree node to avoid the bucket's generation
+ * wrapping around. It also merges adjacent btree nodes if they're empty enough.
+ *
+ * THE JOURNAL:
+ *
+ * Bcache's journal is not necessary for consistency; we always strictly
+ * order metadata writes so that the btree and everything else is consistent on
+ * disk in the event of an unclean shutdown, and in fact bcache had writeback
+ * caching (with recovery from unclean shutdown) before journalling was
+ * implemented.
+ *
+ * Rather, the journal is purely a performance optimization; we can't complete a
+ * write until we've updated the index on disk, otherwise the cache would be
+ * inconsistent in the event of an unclean shutdown. This means that without the
+ * journal, on random write workloads we constantly have to update all the leaf
+ * nodes in the btree, and those writes will be mostly empty (appending at most
+ * a few keys each) - highly inefficient in terms of amount of metadata writes,
+ * and it puts more strain on the various btree resorting/compacting code.
+ *
+ * The journal is just a log of keys we've inserted; on startup we just reinsert
+ * all the keys in the open journal entries. That means that when we're updating
+ * a node in the btree, we can wait until a 4k block of keys fills up before
+ * writing them out.
+ *
+ * For simplicity, we only journal updates to leaf nodes; updates to parent
+ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
+ * the complexity to deal with journalling them (in particular, journal replay)
+ * - updates to non leaf nodes just happen synchronously (see btree_split()).
+ */
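The generation scheme described under BUCKETS/ALLOCATION above reduces to a single comparison: a pointer is live only while the gen embedded in it matches the gen of the bucket it points into, so "deleting" everything in a bucket is just an increment. A tiny stand-alone sketch of that idea; demo_bucket/demo_ptr and the fixed-size bucket array are illustrative stand-ins, not the kernel structures:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct demo_bucket { uint8_t gen; };			/* per-bucket generation */
struct demo_ptr    { uint8_t gen; unsigned bucket; };	/* 8 bit gen embedded in the pointer */

/* A pointer is valid only while its gen matches the bucket's gen */
static bool demo_ptr_stale(const struct demo_bucket *buckets, struct demo_ptr p)
{
	return buckets[p.bucket].gen != p.gen;
}

int main(void)
{
	struct demo_bucket buckets[4] = { { 0 }, { 0 }, { 0 }, { 0 } };
	struct demo_ptr p = { .gen = 0, .bucket = 2 };

	printf("stale before invalidate: %d\n", demo_ptr_stale(buckets, p));	/* 0 */

	/* "Reusing" bucket 2 is just an increment of its gen (written out lazily) */
	buckets[2].gen++;

	printf("stale after invalidate:  %d\n", demo_ptr_stale(buckets, p));	/* 1 */
	return 0;
}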
+
+#undef pr_fmt
+#ifdef __KERNEL__
+#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
+#else
+#define pr_fmt(fmt) "%s() " fmt "\n", __func__
+#endif
+
+#include <linux/backing-dev-defs.h>
+#include <linux/bug.h>
+#include <linux/bio.h>
+#include <linux/closure.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/math64.h>
+#include <linux/mutex.h>
+#include <linux/percpu-refcount.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/rhashtable.h>
+#include <linux/rwsem.h>
+#include <linux/semaphore.h>
+#include <linux/seqlock.h>
+#include <linux/shrinker.h>
+#include <linux/srcu.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/zstd.h>
+
+#include "bcachefs_format.h"
+#include "errcode.h"
+#include "fifo.h"
+#include "nocow_locking_types.h"
+#include "opts.h"
+#include "recovery_types.h"
+#include "sb-errors_types.h"
+#include "seqmutex.h"
+#include "util.h"
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define BCH_WRITE_REF_DEBUG
+#endif
+
+#ifndef dynamic_fault
+#define dynamic_fault(...) 0
+#endif
+
+#define race_fault(...) dynamic_fault("bcachefs:race")
+
+#define trace_and_count(_c, _name, ...) \
+do { \
+ this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]); \
+ trace_##_name(__VA_ARGS__); \
+} while (0)
+
+#define bch2_fs_init_fault(name) \
+ dynamic_fault("bcachefs:bch_fs_init:" name)
+#define bch2_meta_read_fault(name) \
+ dynamic_fault("bcachefs:meta:read:" name)
+#define bch2_meta_write_fault(name) \
+ dynamic_fault("bcachefs:meta:write:" name)
+
+#ifdef __KERNEL__
+#define BCACHEFS_LOG_PREFIX
+#endif
+
+#ifdef BCACHEFS_LOG_PREFIX
+
+#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name)
+#define bch2_fmt_dev(_ca, fmt) "bcachefs (%s): " fmt "\n", ((_ca)->name)
+#define bch2_fmt_dev_offset(_ca, _offset, fmt) "bcachefs (%s sector %llu): " fmt "\n", ((_ca)->name), (_offset)
+#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
+#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \
+ "bcachefs (%s inum %llu offset %llu): " fmt "\n", ((_c)->name), (_inum), (_offset)
+
+#else
+
+#define bch2_log_msg(_c, fmt) fmt
+#define bch2_fmt_dev(_ca, fmt) "%s: " fmt "\n", ((_ca)->name)
+#define bch2_fmt_dev_offset(_ca, _offset, fmt) "%s sector %llu: " fmt "\n", ((_ca)->name), (_offset)
+#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum)
+#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \
+ "inum %llu offset %llu: " fmt "\n", (_inum), (_offset)
+
+#endif
+
+#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
+
+#define bch_info(c, fmt, ...) \
+ printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_notice(c, fmt, ...) \
+ printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_warn(c, fmt, ...) \
+ printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_warn_ratelimited(c, fmt, ...) \
+ printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+
+#define bch_err(c, fmt, ...) \
+ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err_dev(ca, fmt, ...) \
+ printk(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+#define bch_err_dev_offset(ca, _offset, fmt, ...) \
+ printk(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
+#define bch_err_inum(c, _inum, fmt, ...) \
+ printk(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
+#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \
+ printk(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+
+#define bch_err_ratelimited(c, fmt, ...) \
+ printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err_dev_ratelimited(ca, fmt, ...) \
+ printk_ratelimited(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \
+ printk_ratelimited(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
+#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
+ printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
+#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \
+ printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+
+#define bch_err_fn(_c, _ret) \
+do { \
+ if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+ bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
+} while (0)
+
+#define bch_err_msg(_c, _ret, _msg, ...) \
+do { \
+ if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+ bch_err(_c, "%s(): error " _msg " %s", __func__, \
+ ##__VA_ARGS__, bch2_err_str(_ret)); \
+} while (0)
+
+#define bch_verbose(c, fmt, ...) \
+do { \
+ if ((c)->opts.verbose) \
+ bch_info(c, fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define pr_verbose_init(opts, fmt, ...) \
+do { \
+ if (opt_get(opts, verbose)) \
+ pr_info(fmt, ##__VA_ARGS__); \
+} while (0)
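The printk wrappers above are all used the same way; as a rough usage sketch, a caller might report progress and failure like this. The function and the "bringing device online" message are hypothetical and assume kernel context, but the macro signatures match the definitions above:

/* Hypothetical caller, assuming a struct bch_fs *c and an int ret from some helper: */
static void demo_report(struct bch_fs *c, int ret, unsigned dev_idx)
{
	bch_verbose(c, "bringing device %u online", dev_idx);	/* printed only if opts.verbose */

	if (ret) {
		/* Prints "demo_report(): error bringing device %u online <err string>" */
		bch_err_msg(c, ret, "bringing device %u online", dev_idx);
		return;
	}

	bch_info(c, "device %u online", dev_idx);
}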
+
+/* Parameters that are useful for debugging, but should always be compiled in: */
+#define BCH_DEBUG_PARAMS_ALWAYS() \
+ BCH_DEBUG_PARAM(key_merging_disabled, \
+ "Disables merging of extents") \
+ BCH_DEBUG_PARAM(btree_gc_always_rewrite, \
+ "Causes mark and sweep to compact and rewrite every " \
+ "btree node it traverses") \
+ BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \
+ "Disables rewriting of btree nodes during mark and sweep")\
+ BCH_DEBUG_PARAM(btree_shrinker_disabled, \
+ "Disables the shrinker callback for the btree node cache")\
+ BCH_DEBUG_PARAM(verify_btree_ondisk, \
+ "Reread btree nodes at various points to verify the " \
+ "mergesort in the read path against modifications " \
+ "done in memory") \
+ BCH_DEBUG_PARAM(verify_all_btree_replicas, \
+ "When reading btree nodes, read all replicas and " \
+ "compare them") \
+ BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \
+ "Don't use the write buffer for backpointers, enabling "\
+ "extra runtime checks")
+
+/* Parameters that should only be compiled in debug mode: */
+#define BCH_DEBUG_PARAMS_DEBUG() \
+ BCH_DEBUG_PARAM(expensive_debug_checks, \
+ "Enables various runtime debugging checks that " \
+ "significantly affect performance") \
+ BCH_DEBUG_PARAM(debug_check_iterators, \
+ "Enables extra verification for btree iterators") \
+ BCH_DEBUG_PARAM(debug_check_btree_accounting, \
+ "Verify btree accounting for keys within a node") \
+ BCH_DEBUG_PARAM(journal_seq_verify, \
+ "Store the journal sequence number in the version " \
+ "number of every btree key, and verify that btree " \
+ "update ordering is preserved during recovery") \
+ BCH_DEBUG_PARAM(inject_invalid_keys, \
+ "Store the journal sequence number in the version " \
+ "number of every btree key, and verify that btree " \
+ "update ordering is preserved during recovery") \
+ BCH_DEBUG_PARAM(test_alloc_startup, \
+ "Force allocator startup to use the slowpath where it" \
+ "can't find enough free buckets without invalidating" \
+ "cached data") \
+ BCH_DEBUG_PARAM(force_reconstruct_read, \
+ "Force reads to use the reconstruct path, when reading" \
+ "from erasure coded extents") \
+ BCH_DEBUG_PARAM(test_restart_gc, \
+ "Test restarting mark and sweep gc when bucket gens change")
+
+#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL()
+#else
+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
+#endif
+
+#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
+BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+#ifndef CONFIG_BCACHEFS_DEBUG
+#define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name;
+BCH_DEBUG_PARAMS_DEBUG()
+#undef BCH_DEBUG_PARAM
+#endif
+
+#define BCH_TIME_STATS() \
+ x(btree_node_mem_alloc) \
+ x(btree_node_split) \
+ x(btree_node_compact) \
+ x(btree_node_merge) \
+ x(btree_node_sort) \
+ x(btree_node_read) \
+ x(btree_interior_update_foreground) \
+ x(btree_interior_update_total) \
+ x(btree_gc) \
+ x(data_write) \
+ x(data_read) \
+ x(data_promote) \
+ x(journal_flush_write) \
+ x(journal_noflush_write) \
+ x(journal_flush_seq) \
+ x(blocked_journal) \
+ x(blocked_allocate) \
+ x(blocked_allocate_open_bucket) \
+ x(nocow_lock_contended)
+
+enum bch_time_stats {
+#define x(name) BCH_TIME_##name,
+ BCH_TIME_STATS()
+#undef x
+ BCH_TIME_STAT_NR
+};
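BCH_TIME_STATS() above is a typical x-macro list: callers define x() to stamp out one construct per entry, expand the list, then undefine x() - the same pattern used later for BCH_WRITE_REFS(), BCH_BKEY_TYPES() and the inode field lists. A minimal self-contained sketch of the pattern outside the kernel (the DEMO_* names are invented for the example):

#include <stdio.h>

#define DEMO_STATS()	\
	x(node_split)	\
	x(node_merge)	\
	x(data_read)

/* Pass 1: generate enum constants, exactly like enum bch_time_stats above */
enum demo_stat {
#define x(name) DEMO_STAT_##name,
	DEMO_STATS()
#undef x
	DEMO_STAT_NR
};

/* Pass 2: reuse the same list to generate a matching name table */
static const char * const demo_stat_names[] = {
#define x(name) #name,
	DEMO_STATS()
#undef x
};

int main(void)
{
	for (int i = 0; i < DEMO_STAT_NR; i++)
		printf("%d: %s\n", i, demo_stat_names[i]);
	return 0;
}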
+
+#include "alloc_types.h"
+#include "btree_types.h"
+#include "btree_write_buffer_types.h"
+#include "buckets_types.h"
+#include "buckets_waiting_for_journal_types.h"
+#include "clock_types.h"
+#include "disk_groups_types.h"
+#include "ec_types.h"
+#include "journal_types.h"
+#include "keylist_types.h"
+#include "quota_types.h"
+#include "rebalance_types.h"
+#include "replicas_types.h"
+#include "subvolume_types.h"
+#include "super_types.h"
+
+/* Number of nodes btree coalesce will try to coalesce at once */
+#define GC_MERGE_NODES 4U
+
+/* Maximum number of nodes we might need to allocate atomically: */
+#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1))
+
+/* Size of the freelist we allocate btree nodes from: */
+#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4)
+
+#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
+
+struct btree;
+
+enum gc_phase {
+ GC_PHASE_NOT_RUNNING,
+ GC_PHASE_START,
+ GC_PHASE_SB,
+
+ GC_PHASE_BTREE_stripes,
+ GC_PHASE_BTREE_extents,
+ GC_PHASE_BTREE_inodes,
+ GC_PHASE_BTREE_dirents,
+ GC_PHASE_BTREE_xattrs,
+ GC_PHASE_BTREE_alloc,
+ GC_PHASE_BTREE_quotas,
+ GC_PHASE_BTREE_reflink,
+ GC_PHASE_BTREE_subvolumes,
+ GC_PHASE_BTREE_snapshots,
+ GC_PHASE_BTREE_lru,
+ GC_PHASE_BTREE_freespace,
+ GC_PHASE_BTREE_need_discard,
+ GC_PHASE_BTREE_backpointers,
+ GC_PHASE_BTREE_bucket_gens,
+ GC_PHASE_BTREE_snapshot_trees,
+ GC_PHASE_BTREE_deleted_inodes,
+ GC_PHASE_BTREE_logged_ops,
+ GC_PHASE_BTREE_rebalance_work,
+
+ GC_PHASE_PENDING_DELETE,
+};
+
+struct gc_pos {
+ enum gc_phase phase;
+ struct bpos pos;
+ unsigned level;
+};
+
+struct reflink_gc {
+ u64 offset;
+ u32 size;
+ u32 refcount;
+};
+
+typedef GENRADIX(struct reflink_gc) reflink_gc_table;
+
+struct io_count {
+ u64 sectors[2][BCH_DATA_NR];
+};
+
+struct bch_dev {
+ struct kobject kobj;
+ struct percpu_ref ref;
+ struct completion ref_completion;
+ struct percpu_ref io_ref;
+ struct completion io_ref_completion;
+
+ struct bch_fs *fs;
+
+ u8 dev_idx;
+ /*
+ * Cached version of this device's member info from superblock
+ * Committed by bch2_write_super() -> bch_fs_mi_update()
+ */
+ struct bch_member_cpu mi;
+ atomic64_t errors[BCH_MEMBER_ERROR_NR];
+
+ __uuid_t uuid;
+ char name[BDEVNAME_SIZE];
+
+ struct bch_sb_handle disk_sb;
+ struct bch_sb *sb_read_scratch;
+ int sb_write_error;
+ dev_t dev;
+ atomic_t flush_seq;
+
+ struct bch_devs_mask self;
+
+ /* biosets used in cloned bios for writing multiple replicas */
+ struct bio_set replica_set;
+
+ /*
+ * Buckets:
+ * Per-bucket arrays are protected by c->mark_lock, bucket_lock and
+ * gc_lock, for device resize - holding any is sufficient for access:
+ * Or rcu_read_lock(), but only for ptr_stale():
+ */
+ struct bucket_array __rcu *buckets_gc;
+ struct bucket_gens __rcu *bucket_gens;
+ u8 *oldest_gen;
+ unsigned long *buckets_nouse;
+ struct rw_semaphore bucket_lock;
+
+ struct bch_dev_usage *usage_base;
+ struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR];
+ struct bch_dev_usage __percpu *usage_gc;
+
+ /* Allocator: */
+ u64 new_fs_bucket_idx;
+ u64 alloc_cursor;
+
+ unsigned nr_open_buckets;
+ unsigned nr_btree_reserve;
+
+ size_t inc_gen_needs_gc;
+ size_t inc_gen_really_needs_gc;
+ size_t buckets_waiting_on_journal;
+
+ atomic64_t rebalance_work;
+
+ struct journal_device journal;
+ u64 prev_journal_sector;
+
+ struct work_struct io_error_work;
+
+ /* The rest of this all shows up in sysfs */
+ atomic64_t cur_latency[2];
+ struct bch2_time_stats io_latency[2];
+
+#define CONGESTED_MAX 1024
+ atomic_t congested;
+ u64 congested_last;
+
+ struct io_count __percpu *io_done;
+};
+
+enum {
+ /* startup: */
+ BCH_FS_STARTED,
+ BCH_FS_MAY_GO_RW,
+ BCH_FS_RW,
+ BCH_FS_WAS_RW,
+
+ /* shutdown: */
+ BCH_FS_STOPPING,
+ BCH_FS_EMERGENCY_RO,
+ BCH_FS_GOING_RO,
+ BCH_FS_WRITE_DISABLE_COMPLETE,
+ BCH_FS_CLEAN_SHUTDOWN,
+
+ /* fsck passes: */
+ BCH_FS_FSCK_DONE,
+ BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */
+ BCH_FS_NEED_ANOTHER_GC,
+
+ BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS,
+
+ /* errors: */
+ BCH_FS_ERROR,
+ BCH_FS_TOPOLOGY_ERROR,
+ BCH_FS_ERRORS_FIXED,
+ BCH_FS_ERRORS_NOT_FIXED,
+};
+
+struct btree_debug {
+ unsigned id;
+};
+
+#define BCH_TRANSACTIONS_NR 128
+
+struct btree_transaction_stats {
+ struct bch2_time_stats lock_hold_times;
+ struct mutex lock;
+ unsigned nr_max_paths;
+ unsigned wb_updates_size;
+ unsigned max_mem;
+ char *max_paths_text;
+};
+
+struct bch_fs_pcpu {
+ u64 sectors_available;
+};
+
+struct journal_seq_blacklist_table {
+ size_t nr;
+ struct journal_seq_blacklist_table_entry {
+ u64 start;
+ u64 end;
+ bool dirty;
+ } entries[];
+};
+
+struct journal_keys {
+ struct journal_key {
+ u64 journal_seq;
+ u32 journal_offset;
+ enum btree_id btree_id:8;
+ unsigned level:8;
+ bool allocated;
+ bool overwritten;
+ struct bkey_i *k;
+ } *d;
+ /*
+ * Gap buffer: instead of all the empty space in the array being at the
+ * end of the buffer - from @nr to @size - the empty space is at @gap.
+ * This means that sequential insertions are O(n) instead of O(n^2).
+ */
+ size_t gap;
+ size_t nr;
+ size_t size;
+ atomic_t ref;
+ bool initial_ref_held;
+};
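The gap-buffer comment above is the whole trick: keeping the empty space at the last insertion point makes a run of nearby insertions cheap, since only the gap has to move. A small self-contained sketch of gap-buffer insertion over plain integers (a demo of the data structure only, not the journal_key machinery):

#include <assert.h>
#include <stdio.h>
#include <string.h>

#define CAP 8

struct gapbuf {
	int	d[CAP];
	size_t	gap;	/* logical index where the empty space sits */
	size_t	nr;	/* number of live elements */
};

/* Logical index -> physical slot: everything at or past the gap lives at the end */
static size_t gapbuf_idx(const struct gapbuf *b, size_t i)
{
	return i < b->gap ? i : i + (CAP - b->nr);
}

/* Move the empty region so it starts at logical index idx */
static void gapbuf_move_gap(struct gapbuf *b, size_t idx)
{
	size_t gap_size = CAP - b->nr;

	if (idx < b->gap)		/* slide elements [idx, gap) to the right */
		memmove(&b->d[idx + gap_size], &b->d[idx],
			(b->gap - idx) * sizeof(int));
	else if (idx > b->gap)		/* slide elements [gap, idx) to the left */
		memmove(&b->d[b->gap], &b->d[b->gap + gap_size],
			(idx - b->gap) * sizeof(int));
	b->gap = idx;
}

/* Insert v at logical index idx; sequential inserts barely move any data */
static void gapbuf_insert(struct gapbuf *b, size_t idx, int v)
{
	assert(b->nr < CAP && idx <= b->nr);
	gapbuf_move_gap(b, idx);
	b->d[b->gap++] = v;
	b->nr++;
}

int main(void)
{
	struct gapbuf b = { .gap = 0, .nr = 0 };

	for (int i = 0; i < 5; i++)
		gapbuf_insert(&b, i, i * 10);	/* sequential: no memmove at all */
	gapbuf_insert(&b, 2, 99);		/* one small memmove to relocate the gap */

	for (size_t i = 0; i < b.nr; i++)
		printf("%d ", b.d[gapbuf_idx(&b, i)]);
	printf("\n");
	return 0;
}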
+
+struct btree_trans_buf {
+ struct btree_trans *trans;
+};
+
+#define REPLICAS_DELTA_LIST_MAX (1U << 16)
+
+#define BCACHEFS_ROOT_SUBVOL_INUM \
+ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
+
+#define BCH_WRITE_REFS() \
+ x(trans) \
+ x(write) \
+ x(promote) \
+ x(node_rewrite) \
+ x(stripe_create) \
+ x(stripe_delete) \
+ x(reflink) \
+ x(fallocate) \
+ x(discard) \
+ x(invalidate) \
+ x(delete_dead_snapshots) \
+ x(snapshot_delete_pagecache) \
+ x(sysfs)
+
+enum bch_write_ref {
+#define x(n) BCH_WRITE_REF_##n,
+ BCH_WRITE_REFS()
+#undef x
+ BCH_WRITE_REF_NR,
+};
+
+struct bch_fs {
+ struct closure cl;
+
+ struct list_head list;
+ struct kobject kobj;
+ struct kobject counters_kobj;
+ struct kobject internal;
+ struct kobject opts_dir;
+ struct kobject time_stats;
+ unsigned long flags;
+
+ int minor;
+ struct device *chardev;
+ struct super_block *vfs_sb;
+ dev_t dev;
+ char name[40];
+
+ /* ro/rw, add/remove/resize devices: */
+ struct rw_semaphore state_lock;
+
+ /* Counts outstanding writes, for clean transition to read-only */
+#ifdef BCH_WRITE_REF_DEBUG
+ atomic_long_t writes[BCH_WRITE_REF_NR];
+#else
+ struct percpu_ref writes;
+#endif
+ struct work_struct read_only_work;
+
+ struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
+
+ struct bch_replicas_cpu replicas;
+ struct bch_replicas_cpu replicas_gc;
+ struct mutex replicas_gc_lock;
+ mempool_t replicas_delta_pool;
+
+ struct journal_entry_res btree_root_journal_res;
+ struct journal_entry_res replicas_journal_res;
+ struct journal_entry_res clock_journal_res;
+ struct journal_entry_res dev_usage_journal_res;
+
+ struct bch_disk_groups_cpu __rcu *disk_groups;
+
+ struct bch_opts opts;
+
+ /* Updated by bch2_sb_update():*/
+ struct {
+ __uuid_t uuid;
+ __uuid_t user_uuid;
+
+ u16 version;
+ u16 version_min;
+ u16 version_upgrade_complete;
+
+ u8 nr_devices;
+ u8 clean;
+
+ u8 encryption_type;
+
+ u64 time_base_lo;
+ u32 time_base_hi;
+ unsigned time_units_per_sec;
+ unsigned nsec_per_time_unit;
+ u64 features;
+ u64 compat;
+ unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)];
+ } sb;
+
+
+ struct bch_sb_handle disk_sb;
+
+ unsigned short block_bits; /* ilog2(block_size) */
+
+ u16 btree_foreground_merge_threshold;
+
+ struct closure sb_write;
+ struct mutex sb_lock;
+
+ /* snapshot.c: */
+ struct snapshot_table __rcu *snapshots;
+ size_t snapshot_table_size;
+ struct mutex snapshot_table_lock;
+ struct rw_semaphore snapshot_create_lock;
+
+ struct work_struct snapshot_delete_work;
+ struct work_struct snapshot_wait_for_pagecache_and_delete_work;
+ snapshot_id_list snapshots_unlinked;
+ struct mutex snapshots_unlinked_lock;
+
+ /* BTREE CACHE */
+ struct bio_set btree_bio;
+ struct workqueue_struct *io_complete_wq;
+
+ struct btree_root btree_roots_known[BTREE_ID_NR];
+ DARRAY(struct btree_root) btree_roots_extra;
+ struct mutex btree_root_lock;
+
+ struct btree_cache btree_cache;
+
+ /*
+ * Cache of allocated btree nodes - if we allocate a btree node and
+ * don't use it, freeing it means that space can't be reused until going
+ * _all_ the way through the allocator (which exposes us to a livelock
+ * when allocating btree reserves fails halfway through) - instead, we
+ * can stick them here:
+ */
+ struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2];
+ unsigned btree_reserve_cache_nr;
+ struct mutex btree_reserve_cache_lock;
+
+ mempool_t btree_interior_update_pool;
+ struct list_head btree_interior_update_list;
+ struct list_head btree_interior_updates_unwritten;
+ struct mutex btree_interior_update_lock;
+ struct closure_waitlist btree_interior_update_wait;
+
+ struct workqueue_struct *btree_interior_update_worker;
+ struct work_struct btree_interior_update_work;
+
+ struct list_head pending_node_rewrites;
+ struct mutex pending_node_rewrites_lock;
+
+ /* btree_io.c: */
+ spinlock_t btree_write_error_lock;
+ struct btree_write_stats {
+ atomic64_t nr;
+ atomic64_t bytes;
+ } btree_write_stats[BTREE_WRITE_TYPE_NR];
+
+ /* btree_iter.c: */
+ struct seqmutex btree_trans_lock;
+ struct list_head btree_trans_list;
+ mempool_t btree_trans_pool;
+ mempool_t btree_trans_mem_pool;
+ struct btree_trans_buf __percpu *btree_trans_bufs;
+
+ struct srcu_struct btree_trans_barrier;
+ bool btree_trans_barrier_initialized;
+
+ struct btree_key_cache btree_key_cache;
+ unsigned btree_key_cache_btrees;
+
+ struct btree_write_buffer btree_write_buffer;
+
+ struct workqueue_struct *btree_update_wq;
+ struct workqueue_struct *btree_io_complete_wq;
+ /* copygc needs its own workqueue for index updates */
+ struct workqueue_struct *copygc_wq;
+ /*
+ * Use a dedicated wq for write ref holder tasks. Required to avoid
+ * dependency problems with other wq tasks that can block on ref
+ * draining, such as read-only transition.
+ */
+ struct workqueue_struct *write_ref_wq;
+
+ /* ALLOCATION */
+ struct bch_devs_mask rw_devs[BCH_DATA_NR];
+
+ u64 capacity; /* sectors */
+
+ /*
+ * When capacity _decreases_ (due to a disk being removed), we
+ * increment capacity_gen - this invalidates outstanding reservations
+ * and forces them to be revalidated
+ */
+ u32 capacity_gen;
+ unsigned bucket_size_max;
+
+ atomic64_t sectors_available;
+ struct mutex sectors_available_lock;
+
+ struct bch_fs_pcpu __percpu *pcpu;
+
+ struct percpu_rw_semaphore mark_lock;
+
+ seqcount_t usage_lock;
+ struct bch_fs_usage *usage_base;
+ struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR];
+ struct bch_fs_usage __percpu *usage_gc;
+ u64 __percpu *online_reserved;
+
+ /* single element mempool: */
+ struct mutex usage_scratch_lock;
+ struct bch_fs_usage_online *usage_scratch;
+
+ struct io_clock io_clock[2];
+
+ /* JOURNAL SEQ BLACKLIST */
+ struct journal_seq_blacklist_table *
+ journal_seq_blacklist_table;
+ struct work_struct journal_seq_blacklist_gc_work;
+
+ /* ALLOCATOR */
+ spinlock_t freelist_lock;
+ struct closure_waitlist freelist_wait;
+ u64 blocked_allocate;
+ u64 blocked_allocate_open_bucket;
+
+ open_bucket_idx_t open_buckets_freelist;
+ open_bucket_idx_t open_buckets_nr_free;
+ struct closure_waitlist open_buckets_wait;
+ struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
+ open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT];
+
+ open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT];
+ open_bucket_idx_t open_buckets_partial_nr;
+
+ struct write_point btree_write_point;
+ struct write_point rebalance_write_point;
+
+ struct write_point write_points[WRITE_POINT_MAX];
+ struct hlist_head write_points_hash[WRITE_POINT_HASH_NR];
+ struct mutex write_points_hash_lock;
+ unsigned write_points_nr;
+
+ struct buckets_waiting_for_journal buckets_waiting_for_journal;
+ struct work_struct discard_work;
+ struct work_struct invalidate_work;
+
+ /* GARBAGE COLLECTION */
+ struct task_struct *gc_thread;
+ atomic_t kick_gc;
+ unsigned long gc_count;
+
+ enum btree_id gc_gens_btree;
+ struct bpos gc_gens_pos;
+
+ /*
+ * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
+ * has been marked by GC.
+ *
+ * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.)
+ *
+ * Protected by gc_pos_lock. Only written to by GC thread, so GC thread
+ * can read without a lock.
+ */
+ seqcount_t gc_pos_lock;
+ struct gc_pos gc_pos;
+
+ /*
+ * The allocation code needs gc_mark in struct bucket to be correct, but
+ * it's not while a gc is in progress.
+ */
+ struct rw_semaphore gc_lock;
+ struct mutex gc_gens_lock;
+
+ /* IO PATH */
+ struct semaphore io_in_flight;
+ struct bio_set bio_read;
+ struct bio_set bio_read_split;
+ struct bio_set bio_write;
+ struct mutex bio_bounce_pages_lock;
+ mempool_t bio_bounce_pages;
+ struct bucket_nocow_lock_table
+ nocow_locks;
+ struct rhashtable promote_table;
+
+ mempool_t compression_bounce[2];
+ mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR];
+ mempool_t decompress_workspace;
+ size_t zstd_workspace_size;
+
+ struct crypto_shash *sha256;
+ struct crypto_sync_skcipher *chacha20;
+ struct crypto_shash *poly1305;
+
+ atomic64_t key_version;
+
+ mempool_t large_bkey_pool;
+
+ /* MOVE.C */
+ struct list_head moving_context_list;
+ struct mutex moving_context_lock;
+
+ /* REBALANCE */
+ struct bch_fs_rebalance rebalance;
+
+ /* COPYGC */
+ struct task_struct *copygc_thread;
+ struct write_point copygc_write_point;
+ s64 copygc_wait_at;
+ s64 copygc_wait;
+ bool copygc_running;
+ wait_queue_head_t copygc_running_wq;
+
+ /* STRIPES: */
+ GENRADIX(struct stripe) stripes;
+ GENRADIX(struct gc_stripe) gc_stripes;
+
+ struct hlist_head ec_stripes_new[32];
+ spinlock_t ec_stripes_new_lock;
+
+ ec_stripes_heap ec_stripes_heap;
+ struct mutex ec_stripes_heap_lock;
+
+ /* ERASURE CODING */
+ struct list_head ec_stripe_head_list;
+ struct mutex ec_stripe_head_lock;
+
+ struct list_head ec_stripe_new_list;
+ struct mutex ec_stripe_new_lock;
+ wait_queue_head_t ec_stripe_new_wait;
+
+ struct work_struct ec_stripe_create_work;
+ u64 ec_stripe_hint;
+
+ struct work_struct ec_stripe_delete_work;
+
+ struct bio_set ec_bioset;
+
+ /* REFLINK */
+ reflink_gc_table reflink_gc_table;
+ size_t reflink_gc_nr;
+
+ /* fs.c */
+ struct list_head vfs_inodes_list;
+ struct mutex vfs_inodes_lock;
+
+ /* VFS IO PATH - fs-io.c */
+ struct bio_set writepage_bioset;
+ struct bio_set dio_write_bioset;
+ struct bio_set dio_read_bioset;
+ struct bio_set nocow_flush_bioset;
+
+ /* QUOTAS */
+ struct bch_memquota_type quotas[QTYP_NR];
+
+ /* RECOVERY */
+ u64 journal_replay_seq_start;
+ u64 journal_replay_seq_end;
+ enum bch_recovery_pass curr_recovery_pass;
+ /* bitmap of explicitly enabled recovery passes: */
+ u64 recovery_passes_explicit;
+ u64 recovery_passes_complete;
+
+ /* DEBUG JUNK */
+ struct dentry *fs_debug_dir;
+ struct dentry *btree_debug_dir;
+ struct btree_debug btree_debug[BTREE_ID_NR];
+ struct btree *verify_data;
+ struct btree_node *verify_ondisk;
+ struct mutex verify_lock;
+
+ u64 *unused_inode_hints;
+ unsigned inode_shard_bits;
+
+ /*
+ * A btree node on disk could have too many bsets for an iterator to fit
+ * on the stack - have to dynamically allocate them
+ */
+ mempool_t fill_iter;
+
+ mempool_t btree_bounce_pool;
+
+ struct journal journal;
+ GENRADIX(struct journal_replay *) journal_entries;
+ u64 journal_entries_base_seq;
+ struct journal_keys journal_keys;
+ struct list_head journal_iters;
+
+ u64 last_bucket_seq_cleanup;
+
+ u64 counters_on_mount[BCH_COUNTER_NR];
+ u64 __percpu *counters;
+
+ unsigned btree_gc_periodic:1;
+ unsigned copy_gc_enabled:1;
+ bool promote_whole_extents;
+
+ struct bch2_time_stats times[BCH_TIME_STAT_NR];
+
+ struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
+
+ /* ERRORS */
+ struct list_head fsck_error_msgs;
+ struct mutex fsck_error_msgs_lock;
+ bool fsck_alloc_msgs_err;
+
+ bch_sb_errors_cpu fsck_error_counts;
+ struct mutex fsck_error_counts_lock;
+};
+
+extern struct wait_queue_head bch2_read_only_wait;
+
+static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+ atomic_long_inc(&c->writes[ref]);
+#else
+ percpu_ref_get(&c->writes);
+#endif
+}
+
+static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+ return !test_bit(BCH_FS_GOING_RO, &c->flags) &&
+ atomic_long_inc_not_zero(&c->writes[ref]);
+#else
+ return percpu_ref_tryget_live(&c->writes);
+#endif
+}
+
+static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+ long v = atomic_long_dec_return(&c->writes[ref]);
+
+ BUG_ON(v < 0);
+ if (v)
+ return;
+ for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
+ if (atomic_long_read(&c->writes[i]))
+ return;
+
+ set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+ wake_up(&bch2_read_only_wait);
+#else
+ percpu_ref_put(&c->writes);
+#endif
+}
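The write refs above follow the usual tryget/put discipline: a task that needs the filesystem to stay writable takes a named ref, and the read-only transition can only complete once every ref is dropped. A rough usage sketch; the function body is hypothetical and assumes kernel context, but BCH_WRITE_REF_discard and the helpers are the ones defined above:

/* Hypothetical caller, assuming a struct bch_fs *c in kernel context: */
static int demo_do_discards(struct bch_fs *c)
{
	/* Fails once the filesystem has started going read-only */
	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
		return -EROFS;

	/* ... issue discards while the ref pins the fs writable ... */

	/* Dropping the last ref lets the read-only transition complete */
	bch2_write_ref_put(c, BCH_WRITE_REF_discard);
	return 0;
}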
+
+static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
+{
+#ifndef NO_BCACHEFS_FS
+ if (c->vfs_sb)
+ c->vfs_sb->s_bdi->ra_pages = ra_pages;
+#endif
+}
+
+static inline unsigned bucket_bytes(const struct bch_dev *ca)
+{
+ return ca->mi.bucket_size << 9;
+}
+
+static inline unsigned block_bytes(const struct bch_fs *c)
+{
+ return c->opts.block_size;
+}
+
+static inline unsigned block_sectors(const struct bch_fs *c)
+{
+ return c->opts.block_size >> 9;
+}
+
+static inline size_t btree_sectors(const struct bch_fs *c)
+{
+ return c->opts.btree_node_size >> 9;
+}
+
+static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
+{
+ return c->btree_key_cache_btrees & (1U << btree);
+}
+
+static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
+{
+ struct timespec64 t;
+ s32 rem;
+
+ time += c->sb.time_base_lo;
+
+ t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
+ t.tv_nsec = rem * c->sb.nsec_per_time_unit;
+ return t;
+}
+
+static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts)
+{
+ return (ts.tv_sec * c->sb.time_units_per_sec +
+ (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo;
+}
+
+static inline s64 bch2_current_time(const struct bch_fs *c)
+{
+ struct timespec64 now;
+
+ ktime_get_coarse_real_ts64(&now);
+ return timespec_to_bch2_time(c, now);
+}
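The two conversions above are inverses given the superblock time base. As a rough worked example in plain user-space C, with hypothetical parameters (nanosecond units, zero time base - real filesystems carry these in c->sb):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical superblock time parameters for the demo */
#define DEMO_TIME_BASE_LO	0LL
#define DEMO_UNITS_PER_SEC	1000000000LL
#define DEMO_NSEC_PER_UNIT	1LL

int main(void)
{
	int64_t bch2_time = 1700000000123456789LL;

	/* Mirrors bch2_time_to_timespec() */
	int64_t t    = bch2_time + DEMO_TIME_BASE_LO;
	int64_t sec  = t / DEMO_UNITS_PER_SEC;
	int64_t nsec = (t % DEMO_UNITS_PER_SEC) * DEMO_NSEC_PER_UNIT;

	/* Mirrors timespec_to_bch2_time() */
	int64_t back = sec * DEMO_UNITS_PER_SEC + nsec / DEMO_NSEC_PER_UNIT
		     - DEMO_TIME_BASE_LO;

	printf("sec=%lld nsec=%lld round_trip_ok=%d\n",
	       (long long) sec, (long long) nsec, back == bch2_time);
	return 0;
}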
+
+static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev)
+{
+ return dev < c->sb.nr_devices && c->devs[dev];
+}
+
+#define BKEY_PADDED_ONSTACK(key, pad) \
+ struct { struct bkey_i key; __u64 key ## _pad[pad]; }
+
+#endif /* _BCACHEFS_H */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
new file mode 100644
index 000000000000..fe78e87603fc
--- /dev/null
+++ b/fs/bcachefs/bcachefs_format.h
@@ -0,0 +1,2454 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FORMAT_H
+#define _BCACHEFS_FORMAT_H
+
+/*
+ * bcachefs on disk data structures
+ *
+ * OVERVIEW:
+ *
+ * There are three main types of on disk data structures in bcachefs (this is
+ * reduced from 5 in bcache)
+ *
+ * - superblock
+ * - journal
+ * - btree
+ *
+ * The btree is the primary structure; most metadata exists as keys in the
+ * various btrees. There are only a small number of btrees, they're not
+ * sharded - we have one btree for extents, another for inodes, et cetera.
+ *
+ * SUPERBLOCK:
+ *
+ * The superblock contains the location of the journal, the list of devices in
+ * the filesystem, and in general any metadata we need in order to decide
+ * whether we can start a filesystem or prior to reading the journal/btree
+ * roots.
+ *
+ * The superblock is extensible, and most of the contents of the superblock are
+ * in variable length, type tagged fields; see struct bch_sb_field.
+ *
+ * Backup superblocks do not reside in a fixed location; also, superblocks do
+ * not have a fixed size. To locate backup superblocks we have struct
+ * bch_sb_layout; we store a copy of this inside every superblock, and also
+ * before the first superblock.
+ *
+ * JOURNAL:
+ *
+ * The journal primarily records btree updates in the order they occurred;
+ * journal replay consists of just iterating over all the keys in the open
+ * journal entries and re-inserting them into the btrees.
+ *
+ * The journal also contains entry types for the btree roots, and blacklisted
+ * journal sequence numbers (see journal_seq_blacklist.c).
+ *
+ * BTREE:
+ *
+ * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
+ * 128k-256k) and log structured. We use struct btree_node for writing the first
+ * entry in a given node (offset 0), and struct btree_node_entry for all
+ * subsequent writes.
+ *
+ * After the header, btree node entries contain a list of keys in sorted order.
+ * Values are stored inline with the keys; since values are variable length (and
+ * keys effectively are variable length too, due to packing) we can't do random
+ * access without building up additional in memory tables in the btree node read
+ * path.
+ *
+ * BTREE KEYS (struct bkey):
+ *
+ * The various btrees share a common format for the key - so as to avoid
+ * switching in fastpath lookup/comparison code - but define their own
+ * structures for the key values.
+ *
+ * The size of a key/value pair is stored as a u8 in units of u64s, so the max
+ * size is just under 2k. The common part also contains a type tag for the
+ * value, and a format field indicating whether the key is packed or not (and
+ * also meant to allow adding new key fields in the future, if desired).
+ *
+ * bkeys, when stored within a btree node, may also be packed. In that case, the
+ * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
+ * be generous with field sizes in the common part of the key format (64 bit
+ * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
+ */
+
+#include <asm/types.h>
+#include <asm/byteorder.h>
+#include <linux/kernel.h>
+#include <linux/uuid.h>
+#include "vstructs.h"
+
+#ifdef __KERNEL__
+typedef uuid_t __uuid_t;
+#endif
+
+#define BITMASK(name, type, field, offset, end) \
+static const __maybe_unused unsigned name##_OFFSET = offset; \
+static const __maybe_unused unsigned name##_BITS = (end - offset); \
+ \
+static inline __u64 name(const type *k) \
+{ \
+ return (k->field >> offset) & ~(~0ULL << (end - offset)); \
+} \
+ \
+static inline void SET_##name(type *k, __u64 v) \
+{ \
+ k->field &= ~(~(~0ULL << (end - offset)) << offset); \
+ k->field |= (v & ~(~0ULL << (end - offset))) << offset; \
+}
+
+#define LE_BITMASK(_bits, name, type, field, offset, end) \
+static const __maybe_unused unsigned name##_OFFSET = offset; \
+static const __maybe_unused unsigned name##_BITS = (end - offset); \
+static const __maybe_unused __u##_bits name##_MAX = (1ULL << (end - offset)) - 1;\
+ \
+static inline __u64 name(const type *k) \
+{ \
+ return (__le##_bits##_to_cpu(k->field) >> offset) & \
+ ~(~0ULL << (end - offset)); \
+} \
+ \
+static inline void SET_##name(type *k, __u64 v) \
+{ \
+ __u##_bits new = __le##_bits##_to_cpu(k->field); \
+ \
+ new &= ~(~(~0ULL << (end - offset)) << offset); \
+ new |= (v & ~(~0ULL << (end - offset))) << offset; \
+ k->field = __cpu_to_le##_bits(new); \
+}
+
+#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e)
+#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e)
+#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e)
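The BITMASK()/LE_BITMASK() generators above turn one line into a getter and a setter over a range of bits in a field. For instance, BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) further down in this header expands to roughly the following (hand-expanded here for illustration; __maybe_unused and __u64 come from kernel headers):

static const __maybe_unused unsigned BCH_ALLOC_V4_NEED_DISCARD_OFFSET = 0;
static const __maybe_unused unsigned BCH_ALLOC_V4_NEED_DISCARD_BITS = (1 - 0);

static inline __u64 BCH_ALLOC_V4_NEED_DISCARD(const struct bch_alloc_v4 *k)
{
	return (k->flags >> 0) & ~(~0ULL << (1 - 0));	/* i.e. flags & 1 */
}

static inline void SET_BCH_ALLOC_V4_NEED_DISCARD(struct bch_alloc_v4 *k, __u64 v)
{
	k->flags &= ~(~(~0ULL << (1 - 0)) << 0);	/* clear bit 0 */
	k->flags |= (v & ~(~0ULL << (1 - 0))) << 0;	/* then set it from v */
}

The LE_BITMASK() variant does the same but byte-swaps through __le{16,32,64}_to_cpu()/__cpu_to_le{16,32,64}() so the on-disk field stays little endian.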
+
+struct bkey_format {
+ __u8 key_u64s;
+ __u8 nr_fields;
+ /* One unused slot for now: */
+ __u8 bits_per_field[6];
+ __le64 field_offset[6];
+};
+
+/* Btree keys - all units are in sectors */
+
+struct bpos {
+ /*
+ * Word order matches machine byte order - btree code treats a bpos as a
+ * single large integer, for search/comparison purposes
+ *
+ * Note that wherever a bpos is embedded in another on disk data
+ * structure, it has to be byte swabbed when reading in metadata that
+ * wasn't written in native endian order:
+ */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ __u32 snapshot;
+ __u64 offset;
+ __u64 inode;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ __u64 inode;
+ __u64 offset; /* Points to end of extent - sectors */
+ __u32 snapshot;
+#else
+#error edit for your odd byteorder.
+#endif
+} __packed
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+__aligned(4)
+#endif
+;
+
+#define KEY_INODE_MAX ((__u64)~0ULL)
+#define KEY_OFFSET_MAX ((__u64)~0ULL)
+#define KEY_SNAPSHOT_MAX ((__u32)~0U)
+#define KEY_SIZE_MAX ((__u32)~0U)
+
+static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
+{
+ return (struct bpos) {
+ .inode = inode,
+ .offset = offset,
+ .snapshot = snapshot,
+ };
+}
+
+#define POS_MIN SPOS(0, 0, 0)
+#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0)
+#define SPOS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
+#define POS(_inode, _offset) SPOS(_inode, _offset, 0)
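As the comment on struct bpos says, the btree code compares positions as one big integer: inode first, then offset, then snapshot. A stand-alone sketch of that ordering (demo_bpos_cmp is illustrative; the kernel's comparison helpers live elsewhere in the series):

#include <stdint.h>
#include <stdio.h>

struct demo_bpos { uint64_t inode, offset; uint32_t snapshot; };

/* Compare inode, then offset, then snapshot - i.e. one big integer */
static int demo_bpos_cmp(struct demo_bpos l, struct demo_bpos r)
{
	if (l.inode != r.inode)		return l.inode < r.inode ? -1 : 1;
	if (l.offset != r.offset)	return l.offset < r.offset ? -1 : 1;
	if (l.snapshot != r.snapshot)	return l.snapshot < r.snapshot ? -1 : 1;
	return 0;
}

int main(void)
{
	struct demo_bpos a = { .inode = 1, .offset = 4096, .snapshot = 0 };
	struct demo_bpos b = { .inode = 2, .offset = 0,    .snapshot = 0 };

	/* a < b: the inode field dominates the offset, just like bpos above */
	printf("%d\n", demo_bpos_cmp(a, b));	/* -1 */
	return 0;
}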
+
+/* Empty placeholder struct, for container_of() */
+struct bch_val {
+ __u64 __nothing[0];
+};
+
+struct bversion {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ __u64 lo;
+ __u32 hi;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ __u32 hi;
+ __u64 lo;
+#endif
+} __packed __aligned(4);
+
+struct bkey {
+ /* Size of combined key and value, in u64s */
+ __u8 u64s;
+
+ /* Format of key (0 for format local to btree node) */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u8 format:7,
+ needs_whiteout:1;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u8 needs_whiteout:1,
+ format:7;
+#else
+#error edit for your odd byteorder.
+#endif
+
+ /* Type of the value */
+ __u8 type;
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ __u8 pad[1];
+
+ struct bversion version;
+ __u32 size; /* extent size, in sectors */
+ struct bpos p;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ struct bpos p;
+ __u32 size; /* extent size, in sectors */
+ struct bversion version;
+
+ __u8 pad[1];
+#endif
+} __packed __aligned(8);
+
+struct bkey_packed {
+ __u64 _data[0];
+
+ /* Size of combined key and value, in u64s */
+ __u8 u64s;
+
+ /* Format of key (0 for format local to btree node) */
+
+ /*
+ * XXX: next incompat on disk format change, switch format and
+ * needs_whiteout - bkey_packed() will be cheaper if format is the high
+ * bits of the bitfield
+ */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u8 format:7,
+ needs_whiteout:1;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u8 needs_whiteout:1,
+ format:7;
+#endif
+
+ /* Type of the value */
+ __u8 type;
+ __u8 key_start[0];
+
+ /*
+ * We copy bkeys with struct assignment in various places, and while
+ * that shouldn't be done with packed bkeys we can't disallow it in C,
+ * and it's legal to cast a bkey to a bkey_packed - so padding it out
+ * to the same size as struct bkey should hopefully be safest.
+ */
+ __u8 pad[sizeof(struct bkey) - 3];
+} __packed __aligned(8);
+
+typedef struct {
+ __le64 lo;
+ __le64 hi;
+} bch_le128;
+
+#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64))
+#define BKEY_U64s_MAX U8_MAX
+#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s)
+
+#define KEY_PACKED_BITS_START 24
+
+#define KEY_FORMAT_LOCAL_BTREE 0
+#define KEY_FORMAT_CURRENT 1
+
+enum bch_bkey_fields {
+ BKEY_FIELD_INODE,
+ BKEY_FIELD_OFFSET,
+ BKEY_FIELD_SNAPSHOT,
+ BKEY_FIELD_SIZE,
+ BKEY_FIELD_VERSION_HI,
+ BKEY_FIELD_VERSION_LO,
+ BKEY_NR_FIELDS,
+};
+
+#define bkey_format_field(name, field) \
+ [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8)
+
+#define BKEY_FORMAT_CURRENT \
+((struct bkey_format) { \
+ .key_u64s = BKEY_U64s, \
+ .nr_fields = BKEY_NR_FIELDS, \
+ .bits_per_field = { \
+ bkey_format_field(INODE, p.inode), \
+ bkey_format_field(OFFSET, p.offset), \
+ bkey_format_field(SNAPSHOT, p.snapshot), \
+ bkey_format_field(SIZE, size), \
+ bkey_format_field(VERSION_HI, version.hi), \
+ bkey_format_field(VERSION_LO, version.lo), \
+ }, \
+})
+
+/* bkey with inline value */
+struct bkey_i {
+ __u64 _data[0];
+
+ struct bkey k;
+ struct bch_val v;
+};
+
+#define KEY(_inode, _offset, _size) \
+((struct bkey) { \
+ .u64s = BKEY_U64s, \
+ .format = KEY_FORMAT_CURRENT, \
+ .p = POS(_inode, _offset), \
+ .size = _size, \
+})
+
+static inline void bkey_init(struct bkey *k)
+{
+ *k = KEY(0, 0, 0);
+}
+
+#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64))
+
+#define __BKEY_PADDED(key, pad) \
+ struct bkey_i key; __u64 key ## _pad[pad]
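A key with an inline value is typically built on the stack with some padding after it for the value. A rough usage sketch combining BKEY_PADDED_ONSTACK() (from bcachefs.h above), bkey_init() and POS(); the function, the pad size of 8 and the inode number are made up for the example, and kernel context is assumed:

/* Hypothetical snippet, assuming kernel context: */
static void demo_build_key(void)
{
	/* Declares struct { struct bkey_i k; __u64 k_pad[8]; } - room for a small value */
	BKEY_PADDED_ONSTACK(k, 8) tmp;

	bkey_init(&tmp.k.k);		/* *k = KEY(0, 0, 0): u64s = BKEY_U64s, format = KEY_FORMAT_CURRENT */
	tmp.k.k.p = POS(4096, 0);	/* inode 4096, offset 0, snapshot 0 */

	/* A real caller would also set k.type and grow k.u64s to cover its value */
}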
+
+/*
+ * - DELETED keys are used internally to mark keys that should be ignored but
+ * override keys in composition order. Their version number is ignored.
+ *
+ * - DISCARDED keys indicate that the data is all 0s because it has been
+ * discarded. DISCARDs may have a version; if the version is nonzero the key
+ * will be persistent, otherwise the key will be dropped whenever the btree
+ * node is rewritten (like DELETED keys).
+ *
+ * - ERROR: any read of the data returns a read error, as the data was lost due
+ * to a failing device. Like DISCARDED keys, they can be removed (overridden)
+ * by new writes or cluster-wide GC. Node repair can also overwrite them with
+ * the same or a more recent version number, but not with an older version
+ * number.
+ *
+ * - WHITEOUT: for hash table btrees
+ */
+#define BCH_BKEY_TYPES() \
+ x(deleted, 0) \
+ x(whiteout, 1) \
+ x(error, 2) \
+ x(cookie, 3) \
+ x(hash_whiteout, 4) \
+ x(btree_ptr, 5) \
+ x(extent, 6) \
+ x(reservation, 7) \
+ x(inode, 8) \
+ x(inode_generation, 9) \
+ x(dirent, 10) \
+ x(xattr, 11) \
+ x(alloc, 12) \
+ x(quota, 13) \
+ x(stripe, 14) \
+ x(reflink_p, 15) \
+ x(reflink_v, 16) \
+ x(inline_data, 17) \
+ x(btree_ptr_v2, 18) \
+ x(indirect_inline_data, 19) \
+ x(alloc_v2, 20) \
+ x(subvolume, 21) \
+ x(snapshot, 22) \
+ x(inode_v2, 23) \
+ x(alloc_v3, 24) \
+ x(set, 25) \
+ x(lru, 26) \
+ x(alloc_v4, 27) \
+ x(backpointer, 28) \
+ x(inode_v3, 29) \
+ x(bucket_gens, 30) \
+ x(snapshot_tree, 31) \
+ x(logged_op_truncate, 32) \
+ x(logged_op_finsert, 33)
+
+enum bch_bkey_type {
+#define x(name, nr) KEY_TYPE_##name = nr,
+ BCH_BKEY_TYPES()
+#undef x
+ KEY_TYPE_MAX,
+};
+
+struct bch_deleted {
+ struct bch_val v;
+};
+
+struct bch_whiteout {
+ struct bch_val v;
+};
+
+struct bch_error {
+ struct bch_val v;
+};
+
+struct bch_cookie {
+ struct bch_val v;
+ __le64 cookie;
+};
+
+struct bch_hash_whiteout {
+ struct bch_val v;
+};
+
+struct bch_set {
+ struct bch_val v;
+};
+
+/* Extents */
+
+/*
+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
+ * preceded by checksum/compression information (bch_extent_crc32 or
+ * bch_extent_crc64).
+ *
+ * One major determining factor in the format of extents is how we handle and
+ * represent extents that have been partially overwritten and thus trimmed:
+ *
+ * If an extent is not checksummed or compressed, when the extent is trimmed we
+ * don't have to remember the extent we originally allocated and wrote: we can
+ * merely adjust ptr->offset to point to the start of the data that is currently
+ * live. The size field in struct bkey records the current (live) size of the
+ * extent, and is also used to mean "size of region on disk that we point to" in
+ * this case.
+ *
+ * Thus an extent that is not checksummed or compressed will consist only of a
+ * list of bch_extent_ptrs, with none of the fields in
+ * bch_extent_crc32/bch_extent_crc64.
+ *
+ * When an extent is checksummed or compressed, it's not possible to read only
+ * the data that is currently live: we have to read the entire extent that was
+ * originally written, and then return only the part of the extent that is
+ * currently live.
+ *
+ * Thus, in addition to the current size of the extent in struct bkey, we need
+ * to store the size of the originally allocated space - this is the
+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
+ * when the extent is trimmed, instead of modifying the offset field of the
+ * pointer, we keep a second smaller offset field - "offset into the original
+ * extent of the currently live region".
+ *
+ * The other major determining factor is replication and data migration:
+ *
+ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
+ * write, we will initially write all the replicas in the same format, with the
+ * same checksum type and compression format - however, when copygc runs later (or
+ * tiering/cache promotion, anything that moves data), it is not in general
+ * going to rewrite all the pointers at once - one of the replicas may be in a
+ * bucket on one device that has very little fragmentation while another lives
+ * in a bucket that has become heavily fragmented, and thus is being rewritten
+ * sooner than the rest.
+ *
+ * Thus it will only move a subset of the pointers (or in the case of
+ * tiering/cache promotion perhaps add a single pointer without dropping any
+ * current pointers), and if the extent has been partially overwritten it must
+ * write only the currently live portion (or copygc would not be able to reduce
+ * fragmentation!) - which necessitates a different bch_extent_crc format for
+ * the new pointer.
+ *
+ * But in the interests of space efficiency, we don't want to store one
+ * bch_extent_crc for each pointer if we don't have to.
+ *
+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the
+ * type of a given entry with a scheme similar to utf8 (except we're encoding a
+ * type, not a size), encoding the type in the position of the first set bit:
+ *
+ * bch_extent_crc32 - 0b1
+ * bch_extent_ptr - 0b10
+ * bch_extent_crc64 - 0b100
+ *
+ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
+ * bch_extent_crc64 is the least constrained).
+ *
+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
+ * until the next bch_extent_crc32/64.
+ *
+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
+ * is neither checksummed nor compressed.
+ */
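The comment above describes a UTF-8-style scheme: an entry type is marked by which bit is set first, so decoding an entry is just "find the lowest set bit". A small stand-alone illustration of that encode/decode step, using the GCC/Clang __builtin_ctzll() and demo type names that are not tied to the real ptr/crc numbering:

#include <stdint.h>
#include <stdio.h>

/* Demo types only - not the real extent entry numbering */
enum demo_entry_type { DEMO_TYPE_A = 0, DEMO_TYPE_B = 1, DEMO_TYPE_C = 2 };

/* Encode: entry type n is marked by setting bit n in the entry's leading bits */
static uint64_t demo_encode(enum demo_entry_type t)
{
	return 1ULL << t;
}

/* Decode: the position of the lowest set bit recovers the type (word must be nonzero) */
static enum demo_entry_type demo_decode(uint64_t word)
{
	return (enum demo_entry_type) __builtin_ctzll(word);
}

int main(void)
{
	uint64_t w = demo_encode(DEMO_TYPE_C);		/* 0b100 */

	printf("word=%#llx type=%d\n", (unsigned long long) w, (int) demo_decode(w));
	return 0;
}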
+
+/* 128 bits, sufficient for cryptographic MACs: */
+struct bch_csum {
+ __le64 lo;
+ __le64 hi;
+} __packed __aligned(8);
+
+#define BCH_EXTENT_ENTRY_TYPES() \
+ x(ptr, 0) \
+ x(crc32, 1) \
+ x(crc64, 2) \
+ x(crc128, 3) \
+ x(stripe_ptr, 4) \
+ x(rebalance, 5)
+#define BCH_EXTENT_ENTRY_MAX 6
+
+enum bch_extent_entry_type {
+#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
+ BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+/* Compressed/uncompressed sizes are stored biased by 1: */
+struct bch_extent_crc32 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u32 type:2,
+ _compressed_size:7,
+ _uncompressed_size:7,
+ offset:7,
+ _unused:1,
+ csum_type:4,
+ compression_type:4;
+ __u32 csum;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u32 csum;
+ __u32 compression_type:4,
+ csum_type:4,
+ _unused:1,
+ offset:7,
+ _uncompressed_size:7,
+ _compressed_size:7,
+ type:2;
+#endif
+} __packed __aligned(8);
+
+#define CRC32_SIZE_MAX (1U << 7)
+#define CRC32_NONCE_MAX 0
+
+struct bch_extent_crc64 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:3,
+ _compressed_size:9,
+ _uncompressed_size:9,
+ offset:9,
+ nonce:10,
+ csum_type:4,
+ compression_type:4,
+ csum_hi:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 csum_hi:16,
+ compression_type:4,
+ csum_type:4,
+ nonce:10,
+ offset:9,
+ _uncompressed_size:9,
+ _compressed_size:9,
+ type:3;
+#endif
+ __u64 csum_lo;
+} __packed __aligned(8);
+
+#define CRC64_SIZE_MAX (1U << 9)
+#define CRC64_NONCE_MAX ((1U << 10) - 1)
+
+struct bch_extent_crc128 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:4,
+ _compressed_size:13,
+ _uncompressed_size:13,
+ offset:13,
+ nonce:13,
+ csum_type:4,
+ compression_type:4;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 compression_type:4,
+ csum_type:4,
+ nonce:13,
+ offset:13,
+ _uncompressed_size:13,
+ _compressed_size:13,
+ type:4;
+#endif
+ struct bch_csum csum;
+} __packed __aligned(8);
+
+#define CRC128_SIZE_MAX (1U << 13)
+#define CRC128_NONCE_MAX ((1U << 13) - 1)
+
+/*
+ * @reservation - pointer hasn't been written to, just reserved
+ */
+struct bch_extent_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:1,
+ cached:1,
+ unused:1,
+ unwritten:1,
+ offset:44, /* 8 petabytes */
+ dev:8,
+ gen:8;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 gen:8,
+ dev:8,
+ offset:44,
+ unwritten:1,
+ unused:1,
+ cached:1,
+ type:1;
+#endif
+} __packed __aligned(8);
+
+struct bch_extent_stripe_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:5,
+ block:8,
+ redundancy:4,
+ idx:47;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 idx:47,
+ redundancy:4,
+ block:8,
+ type:5;
+#endif
+};
+
+struct bch_extent_rebalance {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:6,
+ unused:34,
+ compression:8, /* enum bch_compression_opt */
+ target:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 target:16,
+ compression:8,
+ unused:34,
+ type:6;
+#endif
+};
+
+union bch_extent_entry {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
+ unsigned long type;
+#elif __BITS_PER_LONG == 32
+ struct {
+ unsigned long pad;
+ unsigned long type;
+ };
+#else
+#error edit for your odd byteorder.
+#endif
+
+#define x(f, n) struct bch_extent_##f f;
+ BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+struct bch_btree_ptr {
+ struct bch_val v;
+
+ __u64 _data[0];
+ struct bch_extent_ptr start[];
+} __packed __aligned(8);
+
+struct bch_btree_ptr_v2 {
+ struct bch_val v;
+
+ __u64 mem_ptr;
+ __le64 seq;
+ __le16 sectors_written;
+ __le16 flags;
+ struct bpos min_key;
+ __u64 _data[0];
+ struct bch_extent_ptr start[];
+} __packed __aligned(8);
+
+LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
+
+struct bch_extent {
+ struct bch_val v;
+
+ __u64 _data[0];
+ union bch_extent_entry start[];
+} __packed __aligned(8);
+
+struct bch_reservation {
+ struct bch_val v;
+
+ __le32 generation;
+ __u8 nr_replicas;
+ __u8 pad[3];
+} __packed __aligned(8);
+
+/* Maximum size (in u64s) a single pointer could be: */
+#define BKEY_EXTENT_PTR_U64s_MAX\
+ ((sizeof(struct bch_extent_crc128) + \
+ sizeof(struct bch_extent_ptr)) / sizeof(__u64))
+
+/* Maximum possible size of an entire extent value: */
+#define BKEY_EXTENT_VAL_U64s_MAX \
+ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
+
+/* Maximum possible size of an entire extent, key + value: */
+#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
+
+/* Btree pointers don't carry around checksums: */
+#define BKEY_BTREE_PTR_VAL_U64s_MAX \
+ ((sizeof(struct bch_btree_ptr_v2) + \
+ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
+#define BKEY_BTREE_PTR_U64s_MAX \
+ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
+
+/* Inodes */
+
+#define BLOCKDEV_INODE_MAX 4096
+
+#define BCACHEFS_ROOT_INO 4096
+
+struct bch_inode {
+ struct bch_val v;
+
+ __le64 bi_hash_seed;
+ __le32 bi_flags;
+ __le16 bi_mode;
+ __u8 fields[];
+} __packed __aligned(8);
+
+struct bch_inode_v2 {
+ struct bch_val v;
+
+ __le64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ __le64 bi_flags;
+ __le16 bi_mode;
+ __u8 fields[];
+} __packed __aligned(8);
+
+struct bch_inode_v3 {
+ struct bch_val v;
+
+ __le64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ __le64 bi_flags;
+ __le64 bi_sectors;
+ __le64 bi_size;
+ __le64 bi_version;
+ __u8 fields[];
+} __packed __aligned(8);
+
+#define INODEv3_FIELDS_START_INITIAL 6
+#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
+
+struct bch_inode_generation {
+ struct bch_val v;
+
+ __le32 bi_generation;
+ __le32 pad;
+} __packed __aligned(8);
+
+/*
+ * bi_subvol and bi_parent_subvol are only set for subvolume roots:
+ */
+
+#define BCH_INODE_FIELDS_v2() \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
+ x(bi_size, 64) \
+ x(bi_sectors, 64) \
+ x(bi_uid, 32) \
+ x(bi_gid, 32) \
+ x(bi_nlink, 32) \
+ x(bi_generation, 32) \
+ x(bi_dev, 32) \
+ x(bi_data_checksum, 8) \
+ x(bi_compression, 8) \
+ x(bi_project, 32) \
+ x(bi_background_compression, 8) \
+ x(bi_data_replicas, 8) \
+ x(bi_promote_target, 16) \
+ x(bi_foreground_target, 16) \
+ x(bi_background_target, 16) \
+ x(bi_erasure_code, 16) \
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32)
+
+#define BCH_INODE_FIELDS_v3() \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
+ x(bi_uid, 32) \
+ x(bi_gid, 32) \
+ x(bi_nlink, 32) \
+ x(bi_generation, 32) \
+ x(bi_dev, 32) \
+ x(bi_data_checksum, 8) \
+ x(bi_compression, 8) \
+ x(bi_project, 32) \
+ x(bi_background_compression, 8) \
+ x(bi_data_replicas, 8) \
+ x(bi_promote_target, 16) \
+ x(bi_foreground_target, 16) \
+ x(bi_background_target, 16) \
+ x(bi_erasure_code, 16) \
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32) \
+ x(bi_nocow, 8)
+
+/* subset of BCH_INODE_FIELDS */
+#define BCH_INODE_OPTS() \
+ x(data_checksum, 8) \
+ x(compression, 8) \
+ x(project, 32) \
+ x(background_compression, 8) \
+ x(data_replicas, 8) \
+ x(promote_target, 16) \
+ x(foreground_target, 16) \
+ x(background_target, 16) \
+ x(erasure_code, 16) \
+ x(nocow, 8)
+
+enum inode_opt_id {
+#define x(name, ...) \
+ Inode_opt_##name,
+ BCH_INODE_OPTS()
+#undef x
+ Inode_opt_nr,
+};
+
+#define BCH_INODE_FLAGS() \
+ x(sync, 0) \
+ x(immutable, 1) \
+ x(append, 2) \
+ x(nodump, 3) \
+ x(noatime, 4) \
+ x(i_size_dirty, 5) \
+ x(i_sectors_dirty, 6) \
+ x(unlinked, 7) \
+ x(backptr_untrusted, 8)
+
+/* bits 20+ reserved for packed fields below: */
+
+enum bch_inode_flags {
+#define x(t, n) BCH_INODE_##t = 1U << n,
+ BCH_INODE_FLAGS()
+#undef x
+};
+
+enum __bch_inode_flags {
+#define x(t, n) __BCH_INODE_##t = n,
+ BCH_INODE_FLAGS()
+#undef x
+};
+
+LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
+LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
+LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
+
+LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
+LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
+LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_FIELDS_START,
+ struct bch_inode_v3, bi_flags, 31, 36);
+LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
+
+/* Dirents */
+
+/*
+ * Dirents (and xattrs) have to implement string lookups; since our b-tree
+ * doesn't support arbitrary length strings for the key, we instead index by a
+ * 64 bit hash (currently truncated sha1) of the string, stored in the offset
+ * field of the key - using linear probing to resolve hash collisions. This also
+ * provides us with the readdir cookie POSIX requires.
+ *
+ * Linear probing requires us to use whiteouts for deletions, in the event of a
+ * collision:
+ */
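+
+/*
+ * Rough illustration of the lookup scheme above (informal; the hash helper
+ * name here is made up - the real helpers live elsewhere in the tree):
+ * looking up "foo" in directory inode 42 searches at approximately
+ *
+ *	POS(42, dirent_hash("foo"))
+ *
+ * and, on a hash collision, scans forward through successive offsets until it
+ * finds a dirent whose name matches, an empty slot, or a whiteout left by a
+ * deletion.
+ */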
+
+struct bch_dirent {
+ struct bch_val v;
+
+ /* Target inode number: */
+ union {
+ __le64 d_inum;
+ struct { /* DT_SUBVOL */
+ __le32 d_child_subvol;
+ __le32 d_parent_subvol;
+ };
+ };
+
+ /*
+ * Copy of mode bits 12-15 from the target inode - so userspace can get
+ * the filetype without having to do a stat()
+ */
+ __u8 d_type;
+
+ __u8 d_name[];
+} __packed __aligned(8);
+
+#define DT_SUBVOL 16
+#define BCH_DT_MAX 17
+
+#define BCH_NAME_MAX 512
+
+/* Xattrs */
+
+#define KEY_TYPE_XATTR_INDEX_USER 0
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
+#define KEY_TYPE_XATTR_INDEX_TRUSTED 3
+#define KEY_TYPE_XATTR_INDEX_SECURITY 4
+
+struct bch_xattr {
+ struct bch_val v;
+ __u8 x_type;
+ __u8 x_name_len;
+ __le16 x_val_len;
+ __u8 x_name[];
+} __packed __aligned(8);
+
+/* Bucket/allocation information: */
+
+struct bch_alloc {
+ struct bch_val v;
+ __u8 fields;
+ __u8 gen;
+ __u8 data[];
+} __packed __aligned(8);
+
+#define BCH_ALLOC_FIELDS_V1() \
+ x(read_time, 16) \
+ x(write_time, 16) \
+ x(data_type, 8) \
+ x(dirty_sectors, 16) \
+ x(cached_sectors, 16) \
+ x(oldest_gen, 8) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
+
+enum {
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+ BCH_ALLOC_FIELDS_V1()
+#undef x
+};
+
+struct bch_alloc_v2 {
+ struct bch_val v;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __packed __aligned(8);
+
+#define BCH_ALLOC_FIELDS_V2() \
+ x(read_time, 64) \
+ x(write_time, 64) \
+ x(dirty_sectors, 32) \
+ x(cached_sectors, 32) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
+
+struct bch_alloc_v3 {
+ struct bch_val v;
+ __le64 journal_seq;
+ __le32 flags;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __packed __aligned(8);
+
+LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
+LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
+
+struct bch_alloc_v4 {
+ struct bch_val v;
+ __u64 journal_seq;
+ __u32 flags;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 stripe_redundancy;
+ __u32 dirty_sectors;
+ __u32 cached_sectors;
+ __u64 io_time[2];
+ __u32 stripe;
+ __u32 nr_external_backpointers;
+ __u64 fragmentation_lru;
+} __packed __aligned(8);
+
+#define BCH_ALLOC_V4_U64s_V0 6
+#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64))
+
+BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
+BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
+BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
+BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
+
+#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40
+
+struct bch_backpointer {
+ struct bch_val v;
+ __u8 btree_id;
+ __u8 level;
+ __u8 data_type;
+ __u64 bucket_offset:40;
+ __u32 bucket_len;
+ struct bpos pos;
+} __packed __aligned(8);
+
+#define KEY_TYPE_BUCKET_GENS_BITS 8
+#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS)
+#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1)
+
+struct bch_bucket_gens {
+ struct bch_val v;
+ u8 gens[KEY_TYPE_BUCKET_GENS_NR];
+} __packed __aligned(8);
+
+/* Quotas: */
+
+enum quota_types {
+ QTYP_USR = 0,
+ QTYP_GRP = 1,
+ QTYP_PRJ = 2,
+ QTYP_NR = 3,
+};
+
+enum quota_counters {
+ Q_SPC = 0,
+ Q_INO = 1,
+ Q_COUNTERS = 2,
+};
+
+struct bch_quota_counter {
+ __le64 hardlimit;
+ __le64 softlimit;
+};
+
+struct bch_quota {
+ struct bch_val v;
+ struct bch_quota_counter c[Q_COUNTERS];
+} __packed __aligned(8);
+
+/* Erasure coding */
+
+struct bch_stripe {
+ struct bch_val v;
+ __le16 sectors;
+ __u8 algorithm;
+ __u8 nr_blocks;
+ __u8 nr_redundant;
+
+ __u8 csum_granularity_bits;
+ __u8 csum_type;
+ __u8 pad;
+
+ struct bch_extent_ptr ptrs[];
+} __packed __aligned(8);
+
+/* Reflink: */
+
+struct bch_reflink_p {
+ struct bch_val v;
+ __le64 idx;
+ /*
+ * A reflink pointer might point to an indirect extent which is then
+ * later split (by copygc or rebalance). If we only pointed to part of
+ * the original indirect extent, and then one of the fragments is
+ * outside the range we point to, we'd leak a refcount: so when creating
+ * reflink pointers, we need to store pad values to remember the full
+ * range we were taking a reference on.
+ */
+ __le32 front_pad;
+ __le32 back_pad;
+} __packed __aligned(8);
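+
+/*
+ * Rough worked example of the padding described above: if we took a reference
+ * on indirect extent sectors [100, 200) but the reflink pointer itself only
+ * covers [120, 180), then front_pad = 20 and back_pad = 20, so dropping the
+ * pointer can release the refcount on the full [100, 200) range even after
+ * the indirect extent has been split.
+ */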
+
+struct bch_reflink_v {
+ struct bch_val v;
+ __le64 refcount;
+ union bch_extent_entry start[0];
+ __u64 _data[];
+} __packed __aligned(8);
+
+struct bch_indirect_inline_data {
+ struct bch_val v;
+ __le64 refcount;
+ u8 data[];
+};
+
+/* Inline data */
+
+struct bch_inline_data {
+ struct bch_val v;
+ u8 data[];
+};
+
+/* Subvolumes: */
+
+#define SUBVOL_POS_MIN POS(0, 1)
+#define SUBVOL_POS_MAX POS(0, S32_MAX)
+#define BCACHEFS_ROOT_SUBVOL 1
+
+struct bch_subvolume {
+ struct bch_val v;
+ __le32 flags;
+ __le32 snapshot;
+ __le64 inode;
+ /*
+ * Snapshot subvolumes form a tree, separate from the snapshot nodes
+ * tree - if this subvolume is a snapshot, this is the ID of the
+ * subvolume it was created from:
+ */
+ __le32 parent;
+ __le32 pad;
+ bch_le128 otime;
+};
+
+LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
+/*
+ * We need to know whether a subvolume is a snapshot so we can know whether we
+ * can delete it (or whether it should just be rm -rf'd)
+ */
+LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
+LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3)
+
+/* Snapshots */
+
+struct bch_snapshot {
+ struct bch_val v;
+ __le32 flags;
+ __le32 parent;
+ __le32 children[2];
+ __le32 subvol;
+ /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
+ __le32 tree;
+ __le32 depth;
+ __le32 skip[3];
+};
+
+LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
+
+/* True if a subvolume points to this snapshot node: */
+LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
+
+/*
+ * Snapshot trees:
+ *
+ * The snapshot_trees btree gives us a persistent identifier for each tree of
+ * bch_snapshot nodes, and allows us to record and easily find the root/master
+ * subvolume that other snapshots were created from:
+ */
+struct bch_snapshot_tree {
+ struct bch_val v;
+ __le32 master_subvol;
+ __le32 root_snapshot;
+};
+
+/* LRU btree: */
+
+struct bch_lru {
+ struct bch_val v;
+ __le64 idx;
+} __packed __aligned(8);
+
+#define LRU_ID_STRIPES (1U << 16)
+
+/* Logged operations btree: */
+
+struct bch_logged_op_truncate {
+ struct bch_val v;
+ __le32 subvol;
+ __le32 pad;
+ __le64 inum;
+ __le64 new_i_size;
+};
+
+enum logged_op_finsert_state {
+ LOGGED_OP_FINSERT_start,
+ LOGGED_OP_FINSERT_shift_extents,
+ LOGGED_OP_FINSERT_finish,
+};
+
+struct bch_logged_op_finsert {
+ struct bch_val v;
+ __u8 state;
+ __u8 pad[3];
+ __le32 subvol;
+ __le64 inum;
+ __le64 dst_offset;
+ __le64 src_offset;
+ __le64 pos;
+};
+
+/* Optional/variable size superblock sections: */
+
+struct bch_sb_field {
+ __u64 _data[0];
+ __le32 u64s;
+ __le32 type;
+};
+
+#define BCH_SB_FIELDS() \
+ x(journal, 0) \
+ x(members_v1, 1) \
+ x(crypt, 2) \
+ x(replicas_v0, 3) \
+ x(quota, 4) \
+ x(disk_groups, 5) \
+ x(clean, 6) \
+ x(replicas, 7) \
+ x(journal_seq_blacklist, 8) \
+ x(journal_v2, 9) \
+ x(counters, 10) \
+ x(members_v2, 11) \
+ x(errors, 12) \
+ x(ext, 13) \
+ x(downgrade, 14)
+
+enum bch_sb_field_type {
+#define x(f, nr) BCH_SB_FIELD_##f = nr,
+ BCH_SB_FIELDS()
+#undef x
+ BCH_SB_FIELD_NR
+};
+
+/*
+ * Most superblock fields are replicated in all devices' superblocks - a few are
+ * not:
+ */
+#define BCH_SINGLE_DEVICE_SB_FIELDS \
+ ((1U << BCH_SB_FIELD_journal)| \
+ (1U << BCH_SB_FIELD_journal_v2))
+
+/* BCH_SB_FIELD_journal: */
+
+struct bch_sb_field_journal {
+ struct bch_sb_field field;
+ __le64 buckets[];
+};
+
+struct bch_sb_field_journal_v2 {
+ struct bch_sb_field field;
+
+ struct bch_sb_field_journal_v2_entry {
+ __le64 start;
+ __le64 nr;
+ } d[];
+};
+
+/* BCH_SB_FIELD_members_v1: */
+
+#define BCH_MIN_NR_NBUCKETS (1 << 6)
+
+#define BCH_IOPS_MEASUREMENTS() \
+ x(seqread, 0) \
+ x(seqwrite, 1) \
+ x(randread, 2) \
+ x(randwrite, 3)
+
+enum bch_iops_measurement {
+#define x(t, n) BCH_IOPS_##t = n,
+ BCH_IOPS_MEASUREMENTS()
+#undef x
+ BCH_IOPS_NR
+};
+
+#define BCH_MEMBER_ERROR_TYPES() \
+ x(read, 0) \
+ x(write, 1) \
+ x(checksum, 2)
+
+enum bch_member_error_type {
+#define x(t, n) BCH_MEMBER_ERROR_##t = n,
+ BCH_MEMBER_ERROR_TYPES()
+#undef x
+ BCH_MEMBER_ERROR_NR
+};
+
+struct bch_member {
+ __uuid_t uuid;
+ __le64 nbuckets; /* device size */
+ __le16 first_bucket; /* index of first bucket used */
+ __le16 bucket_size; /* sectors */
+ __le32 pad;
+ __le64 last_mount; /* time_t */
+
+ __le64 flags;
+ __le32 iops[4];
+ __le64 errors[BCH_MEMBER_ERROR_NR];
+ __le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
+ __le64 errors_reset_time;
+};
+
+#define BCH_MEMBER_V1_BYTES 56
+
+LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
+/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
+LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15)
+LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20)
+LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28)
+LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30)
+LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
+ struct bch_member, flags, 30, 31)
+
+#if 0
+LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
+LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
+#endif
+
+#define BCH_MEMBER_STATES() \
+ x(rw, 0) \
+ x(ro, 1) \
+ x(failed, 2) \
+ x(spare, 3)
+
+enum bch_member_state {
+#define x(t, n) BCH_MEMBER_STATE_##t = n,
+ BCH_MEMBER_STATES()
+#undef x
+ BCH_MEMBER_STATE_NR
+};
+
+struct bch_sb_field_members_v1 {
+ struct bch_sb_field field;
+ struct bch_member _members[]; //Members are now variable size
+};
+
+struct bch_sb_field_members_v2 {
+ struct bch_sb_field field;
+ __le16 member_bytes; //size of single member entry
+ u8 pad[6];
+ struct bch_member _members[];
+};
+
+/* BCH_SB_FIELD_crypt: */
+
+struct nonce {
+ __le32 d[4];
+};
+
+struct bch_key {
+ __le64 key[4];
+};
+
+#define BCH_KEY_MAGIC \
+ (((__u64) 'b' << 0)|((__u64) 'c' << 8)| \
+ ((__u64) 'h' << 16)|((__u64) '*' << 24)| \
+ ((__u64) '*' << 32)|((__u64) 'k' << 40)| \
+ ((__u64) 'e' << 48)|((__u64) 'y' << 56))
+
+struct bch_encrypted_key {
+ __le64 magic;
+ struct bch_key key;
+};
+
+/*
+ * If this field is present in the superblock, it stores an encryption key which
+ * is used to encrypt all other data/metadata. The key will normally be encrypted
+ * with the key userspace provides, but if encryption has been turned off we'll
+ * just store the master key unencrypted in the superblock so we can access the
+ * previously encrypted data.
+ */
+struct bch_sb_field_crypt {
+ struct bch_sb_field field;
+
+ __le64 flags;
+ __le64 kdf_flags;
+ struct bch_encrypted_key key;
+};
+
+LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4);
+
+enum bch_kdf_types {
+ BCH_KDF_SCRYPT = 0,
+ BCH_KDF_NR = 1,
+};
+
+/* stored as base 2 log of scrypt params: */
+LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
+LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
+LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
+
+/* BCH_SB_FIELD_replicas: */
+
+#define BCH_DATA_TYPES() \
+ x(free, 0) \
+ x(sb, 1) \
+ x(journal, 2) \
+ x(btree, 3) \
+ x(user, 4) \
+ x(cached, 5) \
+ x(parity, 6) \
+ x(stripe, 7) \
+ x(need_gc_gens, 8) \
+ x(need_discard, 9)
+
+enum bch_data_type {
+#define x(t, n) BCH_DATA_##t,
+ BCH_DATA_TYPES()
+#undef x
+ BCH_DATA_NR
+};
+
+static inline bool data_type_is_empty(enum bch_data_type type)
+{
+ switch (type) {
+ case BCH_DATA_free:
+ case BCH_DATA_need_gc_gens:
+ case BCH_DATA_need_discard:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool data_type_is_hidden(enum bch_data_type type)
+{
+ switch (type) {
+ case BCH_DATA_sb:
+ case BCH_DATA_journal:
+ return true;
+ default:
+ return false;
+ }
+}
+
+struct bch_replicas_entry_v0 {
+ __u8 data_type;
+ __u8 nr_devs;
+ __u8 devs[];
+} __packed;
+
+struct bch_sb_field_replicas_v0 {
+ struct bch_sb_field field;
+ struct bch_replicas_entry_v0 entries[];
+} __packed __aligned(8);
+
+struct bch_replicas_entry {
+ __u8 data_type;
+ __u8 nr_devs;
+ __u8 nr_required;
+ __u8 devs[];
+} __packed;
+
+#define replicas_entry_bytes(_i) \
+ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
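+
+/*
+ * e.g. an entry with nr_devs == 3 occupies offsetof(..., devs) + 3 ==
+ * 3 + 3 == 6 bytes, since struct bch_replicas_entry is __packed.
+ */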
+
+struct bch_sb_field_replicas {
+ struct bch_sb_field field;
+ struct bch_replicas_entry entries[];
+} __packed __aligned(8);
+
+/* BCH_SB_FIELD_quota: */
+
+struct bch_sb_quota_counter {
+ __le32 timelimit;
+ __le32 warnlimit;
+};
+
+struct bch_sb_quota_type {
+ __le64 flags;
+ struct bch_sb_quota_counter c[Q_COUNTERS];
+};
+
+struct bch_sb_field_quota {
+ struct bch_sb_field field;
+ struct bch_sb_quota_type q[QTYP_NR];
+} __packed __aligned(8);
+
+/* BCH_SB_FIELD_disk_groups: */
+
+#define BCH_SB_LABEL_SIZE 32
+
+struct bch_disk_group {
+ __u8 label[BCH_SB_LABEL_SIZE];
+ __le64 flags[2];
+} __packed __aligned(8);
+
+LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
+LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
+LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
+
+struct bch_sb_field_disk_groups {
+ struct bch_sb_field field;
+ struct bch_disk_group entries[];
+} __packed __aligned(8);
+
+/* BCH_SB_FIELD_counters */
+
+#define BCH_PERSISTENT_COUNTERS() \
+ x(io_read, 0) \
+ x(io_write, 1) \
+ x(io_move, 2) \
+ x(bucket_invalidate, 3) \
+ x(bucket_discard, 4) \
+ x(bucket_alloc, 5) \
+ x(bucket_alloc_fail, 6) \
+ x(btree_cache_scan, 7) \
+ x(btree_cache_reap, 8) \
+ x(btree_cache_cannibalize, 9) \
+ x(btree_cache_cannibalize_lock, 10) \
+ x(btree_cache_cannibalize_lock_fail, 11) \
+ x(btree_cache_cannibalize_unlock, 12) \
+ x(btree_node_write, 13) \
+ x(btree_node_read, 14) \
+ x(btree_node_compact, 15) \
+ x(btree_node_merge, 16) \
+ x(btree_node_split, 17) \
+ x(btree_node_rewrite, 18) \
+ x(btree_node_alloc, 19) \
+ x(btree_node_free, 20) \
+ x(btree_node_set_root, 21) \
+ x(btree_path_relock_fail, 22) \
+ x(btree_path_upgrade_fail, 23) \
+ x(btree_reserve_get_fail, 24) \
+ x(journal_entry_full, 25) \
+ x(journal_full, 26) \
+ x(journal_reclaim_finish, 27) \
+ x(journal_reclaim_start, 28) \
+ x(journal_write, 29) \
+ x(read_promote, 30) \
+ x(read_bounce, 31) \
+ x(read_split, 33) \
+ x(read_retry, 32) \
+ x(read_reuse_race, 34) \
+ x(move_extent_read, 35) \
+ x(move_extent_write, 36) \
+ x(move_extent_finish, 37) \
+ x(move_extent_fail, 38) \
+ x(move_extent_start_fail, 39) \
+ x(copygc, 40) \
+ x(copygc_wait, 41) \
+ x(gc_gens_end, 42) \
+ x(gc_gens_start, 43) \
+ x(trans_blocked_journal_reclaim, 44) \
+ x(trans_restart_btree_node_reused, 45) \
+ x(trans_restart_btree_node_split, 46) \
+ x(trans_restart_fault_inject, 47) \
+ x(trans_restart_iter_upgrade, 48) \
+ x(trans_restart_journal_preres_get, 49) \
+ x(trans_restart_journal_reclaim, 50) \
+ x(trans_restart_journal_res_get, 51) \
+ x(trans_restart_key_cache_key_realloced, 52) \
+ x(trans_restart_key_cache_raced, 53) \
+ x(trans_restart_mark_replicas, 54) \
+ x(trans_restart_mem_realloced, 55) \
+ x(trans_restart_memory_allocation_failure, 56) \
+ x(trans_restart_relock, 57) \
+ x(trans_restart_relock_after_fill, 58) \
+ x(trans_restart_relock_key_cache_fill, 59) \
+ x(trans_restart_relock_next_node, 60) \
+ x(trans_restart_relock_parent_for_fill, 61) \
+ x(trans_restart_relock_path, 62) \
+ x(trans_restart_relock_path_intent, 63) \
+ x(trans_restart_too_many_iters, 64) \
+ x(trans_restart_traverse, 65) \
+ x(trans_restart_upgrade, 66) \
+ x(trans_restart_would_deadlock, 67) \
+ x(trans_restart_would_deadlock_write, 68) \
+ x(trans_restart_injected, 69) \
+ x(trans_restart_key_cache_upgrade, 70) \
+ x(trans_traverse_all, 71) \
+ x(transaction_commit, 72) \
+ x(write_super, 73) \
+ x(trans_restart_would_deadlock_recursion_limit, 74) \
+ x(trans_restart_write_buffer_flush, 75) \
+ x(trans_restart_split_race, 76)
+
+enum bch_persistent_counters {
+#define x(t, n, ...) BCH_COUNTER_##t,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ BCH_COUNTER_NR
+};
+
+struct bch_sb_field_counters {
+ struct bch_sb_field field;
+ __le64 d[];
+};
+
+/*
+ * On clean shutdown, store btree roots and current journal sequence number in
+ * the superblock:
+ */
+struct jset_entry {
+ __le16 u64s;
+ __u8 btree_id;
+ __u8 level;
+ __u8 type; /* designates what this jset holds */
+ __u8 pad[3];
+
+ struct bkey_i start[0];
+ __u64 _data[];
+};
+
+struct bch_sb_field_clean {
+ struct bch_sb_field field;
+
+ __le32 flags;
+ __le16 _read_clock; /* no longer used */
+ __le16 _write_clock;
+ __le64 journal_seq;
+
+ struct jset_entry start[0];
+ __u64 _data[];
+};
+
+struct journal_seq_blacklist_entry {
+ __le64 start;
+ __le64 end;
+};
+
+struct bch_sb_field_journal_seq_blacklist {
+ struct bch_sb_field field;
+ struct journal_seq_blacklist_entry start[];
+};
+
+struct bch_sb_field_errors {
+ struct bch_sb_field field;
+ struct bch_sb_field_error_entry {
+ __le64 v;
+ __le64 last_error_time;
+ } entries[];
+};
+
+LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16);
+LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64);
+
+struct bch_sb_field_ext {
+ struct bch_sb_field field;
+ __le64 recovery_passes_required[2];
+ __le64 errors_silent[8];
+};
+
+struct bch_sb_field_downgrade_entry {
+ __le16 version;
+ __le64 recovery_passes[2];
+ __le16 nr_errors;
+ __le16 errors[] __counted_by(nr_errors);
+} __packed __aligned(2);
+
+struct bch_sb_field_downgrade {
+ struct bch_sb_field field;
+ struct bch_sb_field_downgrade_entry entries[];
+};
+
+/* Superblock: */
+
+/*
+ * New versioning scheme:
+ * One common version number for all on disk data structures - superblock, btree
+ * nodes, journal entries
+ */
+#define BCH_VERSION_MAJOR(_v) ((__u16) ((_v) >> 10))
+#define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10)))
+#define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0)
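+
+/*
+ * e.g. BCH_VERSION(1, 3) == (1 << 10)|3 == 1027, and conversely
+ * BCH_VERSION_MAJOR(1027) == 1, BCH_VERSION_MINOR(1027) == 3.
+ */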
+
+#define RECOVERY_PASS_ALL_FSCK (1ULL << 63)
+
+/*
+ * field 1: version name
+ * field 2: BCH_VERSION(major, minor)
+ * field 3: recovery passes required on upgrade
+ */
+#define BCH_METADATA_VERSIONS() \
+ x(bkey_renumber, BCH_VERSION(0, 10), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(inode_btree_change, BCH_VERSION(0, 11), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(snapshot, BCH_VERSION(0, 12), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(inode_backpointers, BCH_VERSION(0, 13), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(btree_ptr_sectors_written, BCH_VERSION(0, 14), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(snapshot_2, BCH_VERSION(0, 15), \
+ BIT_ULL(BCH_RECOVERY_PASS_fs_upgrade_for_subvolumes)| \
+ BIT_ULL(BCH_RECOVERY_PASS_initialize_subvolumes)| \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(reflink_p_fix, BCH_VERSION(0, 16), \
+ BIT_ULL(BCH_RECOVERY_PASS_fix_reflink_p)) \
+ x(subvol_dirent, BCH_VERSION(0, 17), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(inode_v2, BCH_VERSION(0, 18), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(freespace, BCH_VERSION(0, 19), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(alloc_v4, BCH_VERSION(0, 20), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(new_data_types, BCH_VERSION(0, 21), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(backpointers, BCH_VERSION(0, 22), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(inode_v3, BCH_VERSION(0, 23), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(unwritten_extents, BCH_VERSION(0, 24), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(bucket_gens, BCH_VERSION(0, 25), \
+ BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(lru_v2, BCH_VERSION(0, 26), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(fragmentation_lru, BCH_VERSION(0, 27), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(no_bps_in_alloc_keys, BCH_VERSION(0, 28), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(snapshot_trees, BCH_VERSION(0, 29), \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(major_minor, BCH_VERSION(1, 0), \
+ 0) \
+ x(snapshot_skiplists, BCH_VERSION(1, 1), \
+ BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \
+ x(deleted_inodes, BCH_VERSION(1, 2), \
+ BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \
+ x(rebalance_work, BCH_VERSION(1, 3), \
+ BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
+
+enum bcachefs_metadata_version {
+ bcachefs_metadata_version_min = 9,
+#define x(t, n, upgrade_passes) bcachefs_metadata_version_##t = n,
+ BCH_METADATA_VERSIONS()
+#undef x
+ bcachefs_metadata_version_max
+};
+
+static const __maybe_unused
+unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;
+
+#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
+
+#define BCH_SB_SECTOR 8
+#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */
+
+struct bch_sb_layout {
+ __uuid_t magic; /* bcachefs superblock UUID */
+ __u8 layout_type;
+ __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */
+ __u8 nr_superblocks;
+ __u8 pad[5];
+ __le64 sb_offset[61];
+} __packed __aligned(8);
+
+#define BCH_SB_LAYOUT_SECTOR 7
+
+/*
+ * @offset - sector where this sb was written
+ * @version - on disk format version
+ * @version_min - Oldest metadata version this filesystem contains; so we can
+ * safely drop compatibility code and refuse to mount filesystems
+ * we'd need it for
+ * @magic - identifies as a bcachefs superblock (BCHFS_MAGIC)
+ * @uuid - used for generating various magic numbers and identifying
+ * member devices, never changes
+ * @user_uuid - user visible UUID, may be changed
+ * @label - filesystem label
+ * @seq - identifies most recent superblock, incremented each time
+ * superblock is written
+ * @features - enabled incompatible features
+ */
+struct bch_sb {
+ struct bch_csum csum;
+ __le16 version;
+ __le16 version_min;
+ __le16 pad[2];
+ __uuid_t magic;
+ __uuid_t uuid;
+ __uuid_t user_uuid;
+ __u8 label[BCH_SB_LABEL_SIZE];
+ __le64 offset;
+ __le64 seq;
+
+ __le16 block_size;
+ __u8 dev_idx;
+ __u8 nr_devices;
+ __le32 u64s;
+
+ __le64 time_base_lo;
+ __le32 time_base_hi;
+ __le32 time_precision;
+
+ __le64 flags[8];
+ __le64 features[2];
+ __le64 compat[2];
+
+ struct bch_sb_layout layout;
+
+ struct bch_sb_field start[0];
+ __le64 _data[];
+} __packed __aligned(8);
+
+/*
+ * Flags:
+ * BCH_SB_INITIALIZED - set on first mount
+ * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect
+ * behaviour of mount/recovery path:
+ * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits
+ * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80
+ * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides
+ * DATA/META_CSUM_TYPE. Also indicates encryption
+ * algorithm in use, if/when we get more than one
+ */
+
+LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16);
+
+LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1);
+LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2);
+LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8);
+LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12);
+
+LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28);
+
+LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33);
+LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40);
+
+LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44);
+LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48);
+
+LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52);
+LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56);
+
+LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57);
+LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58);
+LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59);
+LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60);
+
+LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61);
+LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);
+
+LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63);
+
+LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4);
+LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8);
+LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9);
+
+LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10);
+LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14);
+
+/*
+ * Max size of an extent that may require bouncing to read or write
+ * (checksummed, compressed): 64k
+ */
+LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
+ struct bch_sb, flags[1], 14, 20);
+
+LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
+LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28);
+
+LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40);
+LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52);
+LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64);
+
+LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO,
+ struct bch_sb, flags[2], 0, 4);
+LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64);
+
+LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
+LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
+LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);
+LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
+LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
+LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
+LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
+LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
+LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34);
+LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54);
+LE64_BITMASK(BCH_SB_VERSION_UPGRADE, struct bch_sb, flags[4], 54, 56);
+
+LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60);
+LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI,
+ struct bch_sb, flags[4], 60, 64);
+
+LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE,
+ struct bch_sb, flags[5], 0, 16);
+
+static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
+{
+ return BCH_SB_COMPRESSION_TYPE_LO(sb) | (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4);
+}
+
+static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
+{
+ SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v);
+ SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4);
+}
+
+static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb)
+{
+ return BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb) |
+ (BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4);
+}
+
+static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
+{
+ SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v);
+ SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4);
+}
+
+/*
+ * Features:
+ *
+ * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist
+ * reflink: gates KEY_TYPE_reflink
+ * inline_data: gates KEY_TYPE_inline_data
+ * new_siphash: gates BCH_STR_HASH_siphash
+ * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE
+ */
+#define BCH_SB_FEATURES() \
+ x(lz4, 0) \
+ x(gzip, 1) \
+ x(zstd, 2) \
+ x(atomic_nlink, 3) \
+ x(ec, 4) \
+ x(journal_seq_blacklist_v3, 5) \
+ x(reflink, 6) \
+ x(new_siphash, 7) \
+ x(inline_data, 8) \
+ x(new_extent_overwrite, 9) \
+ x(incompressible, 10) \
+ x(btree_ptr_v2, 11) \
+ x(extents_above_btree_updates, 12) \
+ x(btree_updates_journalled, 13) \
+ x(reflink_inline_data, 14) \
+ x(new_varint, 15) \
+ x(journal_no_flush, 16) \
+ x(alloc_v2, 17) \
+ x(extents_across_btree_nodes, 18)
+
+#define BCH_SB_FEATURES_ALWAYS \
+ ((1ULL << BCH_FEATURE_new_extent_overwrite)| \
+ (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
+ (1ULL << BCH_FEATURE_btree_updates_journalled)|\
+ (1ULL << BCH_FEATURE_alloc_v2)|\
+ (1ULL << BCH_FEATURE_extents_across_btree_nodes))
+
+#define BCH_SB_FEATURES_ALL \
+ (BCH_SB_FEATURES_ALWAYS| \
+ (1ULL << BCH_FEATURE_new_siphash)| \
+ (1ULL << BCH_FEATURE_btree_ptr_v2)| \
+ (1ULL << BCH_FEATURE_new_varint)| \
+ (1ULL << BCH_FEATURE_journal_no_flush))
+
+enum bch_sb_feature {
+#define x(f, n) BCH_FEATURE_##f,
+ BCH_SB_FEATURES()
+#undef x
+ BCH_FEATURE_NR,
+};
+
+#define BCH_SB_COMPAT() \
+ x(alloc_info, 0) \
+ x(alloc_metadata, 1) \
+ x(extents_above_btree_updates_done, 2) \
+ x(bformat_overflow_done, 3)
+
+enum bch_sb_compat {
+#define x(f, n) BCH_COMPAT_##f,
+ BCH_SB_COMPAT()
+#undef x
+ BCH_COMPAT_NR,
+};
+
+/* options: */
+
+#define BCH_VERSION_UPGRADE_OPTS() \
+ x(compatible, 0) \
+ x(incompatible, 1) \
+ x(none, 2)
+
+enum bch_version_upgrade_opts {
+#define x(t, n) BCH_VERSION_UPGRADE_##t = n,
+ BCH_VERSION_UPGRADE_OPTS()
+#undef x
+};
+
+#define BCH_REPLICAS_MAX 4U
+
+#define BCH_BKEY_PTRS_MAX 16U
+
+#define BCH_ERROR_ACTIONS() \
+ x(continue, 0) \
+ x(ro, 1) \
+ x(panic, 2)
+
+enum bch_error_actions {
+#define x(t, n) BCH_ON_ERROR_##t = n,
+ BCH_ERROR_ACTIONS()
+#undef x
+ BCH_ON_ERROR_NR
+};
+
+#define BCH_STR_HASH_TYPES() \
+ x(crc32c, 0) \
+ x(crc64, 1) \
+ x(siphash_old, 2) \
+ x(siphash, 3)
+
+enum bch_str_hash_type {
+#define x(t, n) BCH_STR_HASH_##t = n,
+ BCH_STR_HASH_TYPES()
+#undef x
+ BCH_STR_HASH_NR
+};
+
+#define BCH_STR_HASH_OPTS() \
+ x(crc32c, 0) \
+ x(crc64, 1) \
+ x(siphash, 2)
+
+enum bch_str_hash_opts {
+#define x(t, n) BCH_STR_HASH_OPT_##t = n,
+ BCH_STR_HASH_OPTS()
+#undef x
+ BCH_STR_HASH_OPT_NR
+};
+
+#define BCH_CSUM_TYPES() \
+ x(none, 0) \
+ x(crc32c_nonzero, 1) \
+ x(crc64_nonzero, 2) \
+ x(chacha20_poly1305_80, 3) \
+ x(chacha20_poly1305_128, 4) \
+ x(crc32c, 5) \
+ x(crc64, 6) \
+ x(xxhash, 7)
+
+enum bch_csum_type {
+#define x(t, n) BCH_CSUM_##t = n,
+ BCH_CSUM_TYPES()
+#undef x
+ BCH_CSUM_NR
+};
+
+static const __maybe_unused unsigned bch_crc_bytes[] = {
+ [BCH_CSUM_none] = 0,
+ [BCH_CSUM_crc32c_nonzero] = 4,
+ [BCH_CSUM_crc32c] = 4,
+ [BCH_CSUM_crc64_nonzero] = 8,
+ [BCH_CSUM_crc64] = 8,
+ [BCH_CSUM_xxhash] = 8,
+ [BCH_CSUM_chacha20_poly1305_80] = 10,
+ [BCH_CSUM_chacha20_poly1305_128] = 16,
+};
+
+static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
+{
+ switch (type) {
+ case BCH_CSUM_chacha20_poly1305_80:
+ case BCH_CSUM_chacha20_poly1305_128:
+ return true;
+ default:
+ return false;
+ }
+}
+
+#define BCH_CSUM_OPTS() \
+ x(none, 0) \
+ x(crc32c, 1) \
+ x(crc64, 2) \
+ x(xxhash, 3)
+
+enum bch_csum_opts {
+#define x(t, n) BCH_CSUM_OPT_##t = n,
+ BCH_CSUM_OPTS()
+#undef x
+ BCH_CSUM_OPT_NR
+};
+
+#define BCH_COMPRESSION_TYPES() \
+ x(none, 0) \
+ x(lz4_old, 1) \
+ x(gzip, 2) \
+ x(lz4, 3) \
+ x(zstd, 4) \
+ x(incompressible, 5)
+
+enum bch_compression_type {
+#define x(t, n) BCH_COMPRESSION_TYPE_##t = n,
+ BCH_COMPRESSION_TYPES()
+#undef x
+ BCH_COMPRESSION_TYPE_NR
+};
+
+#define BCH_COMPRESSION_OPTS() \
+ x(none, 0) \
+ x(lz4, 1) \
+ x(gzip, 2) \
+ x(zstd, 3)
+
+enum bch_compression_opts {
+#define x(t, n) BCH_COMPRESSION_OPT_##t = n,
+ BCH_COMPRESSION_OPTS()
+#undef x
+ BCH_COMPRESSION_OPT_NR
+};
+
+/*
+ * Magic numbers
+ *
+ * The various other data structures have their own magic numbers, which are
+ * xored with the first part of the cache set's UUID
+ */
+
+#define BCACHE_MAGIC \
+ UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca, \
+ 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
+#define BCHFS_MAGIC \
+ UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \
+ 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
+
+#define BCACHEFS_STATFS_MAGIC 0xca451a4e
+
+#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL)
+#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL)
+
+static inline __le64 __bch2_sb_magic(struct bch_sb *sb)
+{
+ __le64 ret;
+
+ memcpy(&ret, &sb->uuid, sizeof(ret));
+ return ret;
+}
+
+static inline __u64 __jset_magic(struct bch_sb *sb)
+{
+ return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC);
+}
+
+static inline __u64 __bset_magic(struct bch_sb *sb)
+{
+ return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC);
+}
+
+/* Journal */
+
+#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
+
+#define BCH_JSET_ENTRY_TYPES() \
+ x(btree_keys, 0) \
+ x(btree_root, 1) \
+ x(prio_ptrs, 2) \
+ x(blacklist, 3) \
+ x(blacklist_v2, 4) \
+ x(usage, 5) \
+ x(data_usage, 6) \
+ x(clock, 7) \
+ x(dev_usage, 8) \
+ x(log, 9) \
+ x(overwrite, 10)
+
+enum {
+#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
+ BCH_JSET_ENTRY_TYPES()
+#undef x
+ BCH_JSET_ENTRY_NR
+};
+
+/*
+ * Journal sequence numbers can be blacklisted: bsets record the max sequence
+ * number of all the journal entries they contain updates for, so that on
+ * recovery we can ignore those bsets that contain index updates newer than what
+ * made it into the journal.
+ *
+ * This means that we can't reuse that journal_seq - we have to skip it, and
+ * then record that we skipped it so that the next time we crash and recover we
+ * don't think there was a missing journal entry.
+ */
+struct jset_entry_blacklist {
+ struct jset_entry entry;
+ __le64 seq;
+};
+
+struct jset_entry_blacklist_v2 {
+ struct jset_entry entry;
+ __le64 start;
+ __le64 end;
+};
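+
+/*
+ * Worked example of the above: if a bset recorded journal_seq 100 but journal
+ * entry 100 itself never reached the disk before a crash, recovery must not
+ * reuse seq 100 for a new journal entry; instead it skips ahead and records a
+ * blacklist entry covering 100, so later recoveries know the gap was
+ * intentional and ignore any bsets claiming updates from that seq.
+ */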
+
+#define BCH_FS_USAGE_TYPES() \
+ x(reserved, 0) \
+ x(inodes, 1) \
+ x(key_version, 2)
+
+enum {
+#define x(f, nr) BCH_FS_USAGE_##f = nr,
+ BCH_FS_USAGE_TYPES()
+#undef x
+ BCH_FS_USAGE_NR
+};
+
+struct jset_entry_usage {
+ struct jset_entry entry;
+ __le64 v;
+} __packed;
+
+struct jset_entry_data_usage {
+ struct jset_entry entry;
+ __le64 v;
+ struct bch_replicas_entry r;
+} __packed;
+
+struct jset_entry_clock {
+ struct jset_entry entry;
+ __u8 rw;
+ __u8 pad[7];
+ __le64 time;
+} __packed;
+
+struct jset_entry_dev_usage_type {
+ __le64 buckets;
+ __le64 sectors;
+ __le64 fragmented;
+} __packed;
+
+struct jset_entry_dev_usage {
+ struct jset_entry entry;
+ __le32 dev;
+ __u32 pad;
+
+ __le64 buckets_ec;
+ __le64 _buckets_unavailable; /* No longer used */
+
+ struct jset_entry_dev_usage_type d[];
+};
+
+static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u)
+{
+ return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) /
+ sizeof(struct jset_entry_dev_usage_type);
+}
+
+struct jset_entry_log {
+ struct jset_entry entry;
+ u8 d[];
+} __packed;
+
+/*
+ * On disk format for a journal entry:
+ * seq is monotonically increasing; every journal entry has its own unique
+ * sequence number.
+ *
+ * last_seq is the oldest journal entry that still has keys the btree hasn't
+ * flushed to disk yet.
+ *
+ * version is for on disk format changes.
+ */
+struct jset {
+ struct bch_csum csum;
+
+ __le64 magic;
+ __le64 seq;
+ __le32 version;
+ __le32 flags;
+
+ __le32 u64s; /* size of d[] in u64s */
+
+ __u8 encrypted_start[0];
+
+ __le16 _read_clock; /* no longer used */
+ __le16 _write_clock;
+
+ /* Sequence number of oldest dirty journal entry */
+ __le64 last_seq;
+
+
+ struct jset_entry start[0];
+ __u64 _data[];
+} __packed __aligned(8);
+
+LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4);
+LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
+LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
+
+#define BCH_JOURNAL_BUCKETS_MIN 8
+
+/* Btree: */
+
+enum btree_id_flags {
+ BTREE_ID_EXTENTS = BIT(0),
+ BTREE_ID_SNAPSHOTS = BIT(1),
+ BTREE_ID_SNAPSHOT_FIELD = BIT(2),
+ BTREE_ID_DATA = BIT(3),
+};
+
+#define BCH_BTREE_IDS() \
+ x(extents, 0, BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\
+ BIT_ULL(KEY_TYPE_whiteout)| \
+ BIT_ULL(KEY_TYPE_error)| \
+ BIT_ULL(KEY_TYPE_cookie)| \
+ BIT_ULL(KEY_TYPE_extent)| \
+ BIT_ULL(KEY_TYPE_reservation)| \
+ BIT_ULL(KEY_TYPE_reflink_p)| \
+ BIT_ULL(KEY_TYPE_inline_data)) \
+ x(inodes, 1, BTREE_ID_SNAPSHOTS, \
+ BIT_ULL(KEY_TYPE_whiteout)| \
+ BIT_ULL(KEY_TYPE_inode)| \
+ BIT_ULL(KEY_TYPE_inode_v2)| \
+ BIT_ULL(KEY_TYPE_inode_v3)| \
+ BIT_ULL(KEY_TYPE_inode_generation)) \
+ x(dirents, 2, BTREE_ID_SNAPSHOTS, \
+ BIT_ULL(KEY_TYPE_whiteout)| \
+ BIT_ULL(KEY_TYPE_hash_whiteout)| \
+ BIT_ULL(KEY_TYPE_dirent)) \
+ x(xattrs, 3, BTREE_ID_SNAPSHOTS, \
+ BIT_ULL(KEY_TYPE_whiteout)| \
+ BIT_ULL(KEY_TYPE_cookie)| \
+ BIT_ULL(KEY_TYPE_hash_whiteout)| \
+ BIT_ULL(KEY_TYPE_xattr)) \
+ x(alloc, 4, 0, \
+ BIT_ULL(KEY_TYPE_alloc)| \
+ BIT_ULL(KEY_TYPE_alloc_v2)| \
+ BIT_ULL(KEY_TYPE_alloc_v3)| \
+ BIT_ULL(KEY_TYPE_alloc_v4)) \
+ x(quotas, 5, 0, \
+ BIT_ULL(KEY_TYPE_quota)) \
+ x(stripes, 6, 0, \
+ BIT_ULL(KEY_TYPE_stripe)) \
+ x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \
+ BIT_ULL(KEY_TYPE_reflink_v)| \
+ BIT_ULL(KEY_TYPE_indirect_inline_data)) \
+ x(subvolumes, 8, 0, \
+ BIT_ULL(KEY_TYPE_subvolume)) \
+ x(snapshots, 9, 0, \
+ BIT_ULL(KEY_TYPE_snapshot)) \
+ x(lru, 10, 0, \
+ BIT_ULL(KEY_TYPE_set)) \
+ x(freespace, 11, BTREE_ID_EXTENTS, \
+ BIT_ULL(KEY_TYPE_set)) \
+ x(need_discard, 12, 0, \
+ BIT_ULL(KEY_TYPE_set)) \
+ x(backpointers, 13, 0, \
+ BIT_ULL(KEY_TYPE_backpointer)) \
+ x(bucket_gens, 14, 0, \
+ BIT_ULL(KEY_TYPE_bucket_gens)) \
+ x(snapshot_trees, 15, 0, \
+ BIT_ULL(KEY_TYPE_snapshot_tree)) \
+ x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \
+ BIT_ULL(KEY_TYPE_set)) \
+ x(logged_ops, 17, 0, \
+ BIT_ULL(KEY_TYPE_logged_op_truncate)| \
+ BIT_ULL(KEY_TYPE_logged_op_finsert)) \
+ x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
+ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))
+
+enum btree_id {
+#define x(name, nr, ...) BTREE_ID_##name = nr,
+ BCH_BTREE_IDS()
+#undef x
+ BTREE_ID_NR
+};
+
+#define BTREE_MAX_DEPTH 4U
+
+/* Btree nodes */
+
+/*
+ * Btree nodes
+ *
+ * On disk a btree node is a list/log of these; within each set the keys are
+ * sorted
+ */
+struct bset {
+ __le64 seq;
+
+ /*
+ * Highest journal entry this bset contains keys for.
+ * If on recovery we don't see that journal entry, this bset is ignored:
+ * this allows us to preserve the order of all index updates after a
+ * crash, since the journal records a total order of all index updates
+ * and anything that didn't make it to the journal doesn't get used.
+ */
+ __le64 journal_seq;
+
+ __le32 flags;
+ __le16 version;
+ __le16 u64s; /* count of d[] in u64s */
+
+ struct bkey_packed start[0];
+ __u64 _data[];
+} __packed __aligned(8);
+
+LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4);
+
+LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5);
+LE32_BITMASK(BSET_SEPARATE_WHITEOUTS,
+ struct bset, flags, 5, 6);
+
+/* Sector offset within the btree node: */
+LE32_BITMASK(BSET_OFFSET, struct bset, flags, 16, 32);
+
+struct btree_node {
+ struct bch_csum csum;
+ __le64 magic;
+
+ /* this flags field is encrypted, unlike bset->flags: */
+ __le64 flags;
+
+ /* Closed interval: */
+ struct bpos min_key;
+ struct bpos max_key;
+ struct bch_extent_ptr _ptr; /* not used anymore */
+ struct bkey_format format;
+
+ union {
+ struct bset keys;
+ struct {
+ __u8 pad[22];
+ __le16 u64s;
+ __u64 _data[0];
+
+ };
+ };
+} __packed __aligned(8);
+
+LE64_BITMASK(BTREE_NODE_ID_LO, struct btree_node, flags, 0, 4);
+LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8);
+LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE,
+ struct btree_node, flags, 8, 9);
+LE64_BITMASK(BTREE_NODE_ID_HI, struct btree_node, flags, 9, 25);
+/* 25-32 unused */
+LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64);
+
+static inline __u64 BTREE_NODE_ID(struct btree_node *n)
+{
+ return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4);
+}
+
+static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v)
+{
+ SET_BTREE_NODE_ID_LO(n, v);
+ SET_BTREE_NODE_ID_HI(n, v >> 4);
+}
+
+struct btree_node_entry {
+ struct bch_csum csum;
+
+ union {
+ struct bset keys;
+ struct {
+ __u8 pad[22];
+ __le16 u64s;
+ __u64 _data[0];
+ };
+ };
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_FORMAT_H */
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
new file mode 100644
index 000000000000..f05881f7e113
--- /dev/null
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -0,0 +1,368 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IOCTL_H
+#define _BCACHEFS_IOCTL_H
+
+#include <linux/uuid.h>
+#include <asm/ioctl.h>
+#include "bcachefs_format.h"
+
+/*
+ * Flags common to multiple ioctls:
+ */
+#define BCH_FORCE_IF_DATA_LOST (1 << 0)
+#define BCH_FORCE_IF_METADATA_LOST (1 << 1)
+#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
+#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3)
+
+#define BCH_FORCE_IF_LOST \
+ (BCH_FORCE_IF_DATA_LOST| \
+ BCH_FORCE_IF_METADATA_LOST)
+#define BCH_FORCE_IF_DEGRADED \
+ (BCH_FORCE_IF_DATA_DEGRADED| \
+ BCH_FORCE_IF_METADATA_DEGRADED)
+
+/*
+ * If cleared, ioctls that refer to a device pass it as a pointer to a pathname
+ * (e.g. /dev/sda1); if set, the dev field is the device's index within the
+ * filesystem:
+ */
+#define BCH_BY_INDEX (1 << 4)
+
+/*
+ * For BCH_IOCTL_READ_SUPER: get the superblock of a specific device, not the
+ * filesystem-wide superblock:
+ */
+#define BCH_READ_DEV (1 << 5)
+
+/* global control dev: */
+
+/* These are currently broken, and probably unnecessary: */
+#if 0
+#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble)
+#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental)
+
+struct bch_ioctl_assemble {
+ __u32 flags;
+ __u32 nr_devs;
+ __u64 pad;
+ __u64 devs[];
+};
+
+struct bch_ioctl_incremental {
+ __u32 flags;
+ __u64 pad;
+ __u64 dev;
+};
+#endif
+
+/* filesystem ioctls: */
+
+#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid)
+
+/* These only make sense when we also have incremental assembly */
+#if 0
+#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start)
+#define BCH_IOCTL_STOP _IO(0xbc, 3)
+#endif
+
+#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state)
+#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data)
+#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage)
+#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage)
+#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super)
+#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
+#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
+#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc, 15, struct bch_ioctl_disk_resize_journal)
+
+#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)
+#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume)
+
+/* ioctls below act on a particular file, not the filesystem as a whole: */
+
+#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *)
+
+/*
+ * BCH_IOCTL_QUERY_UUID: get filesystem UUID
+ *
+ * Returns user visible UUID, not internal UUID (which may not ever be changed);
+ * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
+ * this UUID.
+ */
+struct bch_ioctl_query_uuid {
+ __uuid_t uuid;
+};
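+
+/*
+ * Minimal userspace sketch (informal; error handling omitted, and obtaining
+ * a file descriptor for the filesystem is left to the caller):
+ *
+ *	struct bch_ioctl_query_uuid u;
+ *
+ *	if (ioctl(fs_fd, BCH_IOCTL_QUERY_UUID, &u) == 0)
+ *		... u.uuid now holds the user visible filesystem UUID ...
+ */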
+
+#if 0
+struct bch_ioctl_start {
+ __u32 flags;
+ __u32 pad;
+};
+#endif
+
+/*
+ * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
+ *
+ * The specified device must not be open or in use. On success, the new device
+ * will be an online member of the filesystem just like any other member.
+ *
+ * The device must first be prepared by userspace by formatting with a bcachefs
+ * superblock, which is only used for passing in superblock options/parameters
+ * for that device (in struct bch_member). The new device's superblock should
+ * not claim to be a member of any existing filesystem - UUIDs on it will be
+ * ignored.
+ */
+
+/*
+ * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
+ *
+ * Any data present on @dev will be permanently deleted, and @dev will be
+ * removed from its slot in the filesystem's list of member devices. The device
+ * may be either online or offline.
+ *
+ * Will fail if removing @dev would leave us with insufficient read/write
+ * devices or degraded/unavailable data, unless the appropriate
+ * BCH_FORCE_IF_* flags are set.
+ */
+
+/*
+ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
+ * but is not open (e.g. because we started in degraded mode), bring it online
+ *
+ * All existing data on @dev will be available once the device is online,
+ * exactly as if @dev had been present when the filesystem was first mounted.
+ */
+
+/*
+ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
+ * block device, without removing it from the filesystem (so it can be brought
+ * back online later)
+ *
+ * Data present on @dev will be unavailable while @dev is offline (unless
+ * replicated), but will still be intact and untouched if @dev is brought back
+ * online
+ *
+ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
+ * leave us with insufficient read/write devices or degraded/unavailable data,
+ * unless the appropriate BCH_FORCE_IF_* flags are set.
+ */
+
+struct bch_ioctl_disk {
+ __u32 flags;
+ __u32 pad;
+ __u64 dev;
+};
+
+/*
+ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
+ *
+ * @new_state - one of the bch_member_state states (rw, ro, failed,
+ * spare)
+ *
+ * Will refuse to change member state if we would then have insufficient devices
+ * to write to, or if it would result in degraded data (when @new_state is
+ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
+ */
+struct bch_ioctl_disk_set_state {
+ __u32 flags;
+ __u8 new_state;
+ __u8 pad[3];
+ __u64 dev;
+};
+
+enum bch_data_ops {
+ BCH_DATA_OP_SCRUB = 0,
+ BCH_DATA_OP_REREPLICATE = 1,
+ BCH_DATA_OP_MIGRATE = 2,
+ BCH_DATA_OP_REWRITE_OLD_NODES = 3,
+ BCH_DATA_OP_NR = 4,
+};
+
+/*
+ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
+ * scrub, rereplicate, migrate).
+ *
+ * This ioctl kicks off a job in the background, and returns a file descriptor.
+ * Reading from the file descriptor returns a struct bch_ioctl_data_event,
+ * indicating current progress, and closing the file descriptor will stop the
+ * job. The file descriptor is O_CLOEXEC.
+ */
+struct bch_ioctl_data {
+ __u16 op;
+ __u8 start_btree;
+ __u8 end_btree;
+ __u32 flags;
+
+ struct bpos start_pos;
+ struct bpos end_pos;
+
+ union {
+ struct {
+ __u32 dev;
+ __u32 pad;
+ } migrate;
+ struct {
+ __u64 pad[8];
+ };
+ };
+} __packed __aligned(8);
+
+enum bch_data_event {
+ BCH_DATA_EVENT_PROGRESS = 0,
+ /* XXX: add an event for reporting errors */
+ BCH_DATA_EVENT_NR = 1,
+};
+
+struct bch_ioctl_data_progress {
+ __u8 data_type;
+ __u8 btree_id;
+ __u8 pad[2];
+ struct bpos pos;
+
+ __u64 sectors_done;
+ __u64 sectors_total;
+} __packed __aligned(8);
+
+struct bch_ioctl_data_event {
+ __u8 type;
+ __u8 pad[7];
+ union {
+ struct bch_ioctl_data_progress p;
+ __u64 pad2[15];
+ };
+} __packed __aligned(8);
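+
+/*
+ * Rough usage sketch for BCH_IOCTL_DATA (informal; error handling omitted):
+ *
+ *	struct bch_ioctl_data arg = {
+ *		.op		= BCH_DATA_OP_REREPLICATE,
+ *		.end_btree	= BTREE_ID_NR,
+ *		.start_pos	= POS_MIN,
+ *		.end_pos	= POS_MAX,
+ *	};
+ *	int progress_fd = ioctl(fs_fd, BCH_IOCTL_DATA, &arg);
+ *	struct bch_ioctl_data_event e;
+ *
+ *	while (read(progress_fd, &e, sizeof(e)) == sizeof(e))
+ *		... report e.p.sectors_done out of e.p.sectors_total ...
+ *	close(progress_fd);
+ *
+ * (closing the returned fd stops the background job, per the comment above)
+ */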
+
+struct bch_replicas_usage {
+ __u64 sectors;
+ struct bch_replicas_entry r;
+} __packed;
+
+static inline struct bch_replicas_usage *
+replicas_usage_next(struct bch_replicas_usage *u)
+{
+ return (void *) u + replicas_entry_bytes(&u->r) + 8;
+}
+
+/*
+ * BCH_IOCTL_FS_USAGE: query filesystem disk space usage
+ *
+ * Returns disk space usage broken out by data type, number of replicas, and
+ * by component device
+ *
+ * @replica_entries_bytes - size, in bytes, allocated for replica usage entries
+ *
+ * On success, @replica_entries_bytes will be changed to indicate the number of
+ * bytes actually used.
+ *
+ * Returns -ERANGE if @replica_entries_bytes was too small
+ */
+struct bch_ioctl_fs_usage {
+ __u64 capacity;
+ __u64 used;
+ __u64 online_reserved;
+ __u64 persistent_reserved[BCH_REPLICAS_MAX];
+
+ __u32 replica_entries_bytes;
+ __u32 pad;
+
+ struct bch_replicas_usage replicas[0];
+};
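+
+/*
+ * Rough usage sketch (informal; error handling omitted): size the buffer for
+ * the variable length replicas[] tail and walk it with replicas_usage_next():
+ *
+ *	unsigned bytes = 4096;
+ *	struct bch_ioctl_fs_usage *u = calloc(1, sizeof(*u) + bytes);
+ *
+ *	u->replica_entries_bytes = bytes;
+ *	if (!ioctl(fs_fd, BCH_IOCTL_FS_USAGE, u))
+ *		for (struct bch_replicas_usage *r = u->replicas;
+ *		     (void *) r < (void *) u->replicas + u->replica_entries_bytes;
+ *		     r = replicas_usage_next(r))
+ *			... r->sectors of r->r.data_type data ...
+ *
+ * (retry with a larger buffer if the ioctl fails with ERANGE)
+ */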
+
+/*
+ * BCH_IOCTL_DEV_USAGE: query device disk space usage
+ *
+ * Returns disk space usage broken out by data type - both by buckets and
+ * sectors.
+ */
+struct bch_ioctl_dev_usage {
+ __u64 dev;
+ __u32 flags;
+ __u8 state;
+ __u8 pad[7];
+
+ __u32 bucket_size;
+ __u64 nr_buckets;
+
+ __u64 buckets_ec;
+
+ struct bch_ioctl_dev_usage_type {
+ __u64 buckets;
+ __u64 sectors;
+ __u64 fragmented;
+ } d[BCH_DATA_NR];
+};
+
+/*
+ * BCH_IOCTL_READ_SUPER: read filesystem superblock
+ *
+ * Equivalent to reading the superblock directly from the block device, except
+ * avoids racing with the kernel writing the superblock or having to figure out
+ * which block device to read
+ *
+ * @sb - buffer to read into
+ * @size - size of userspace allocated buffer
+ * @dev - device to read superblock for, if BCH_READ_DEV flag is
+ * specified
+ *
+ * Returns -ERANGE if buffer provided is too small
+ */
+struct bch_ioctl_read_super {
+ __u32 flags;
+ __u32 pad;
+ __u64 dev;
+ __u64 size;
+ __u64 sb;
+};
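+
+/*
+ * Rough usage sketch (informal): @sb is a userspace buffer passed as an
+ * integer:
+ *
+ *	struct bch_ioctl_read_super s = {
+ *		.size	= buf_size,
+ *		.sb	= (__u64) (unsigned long) buf,
+ *	};
+ *
+ *	ioctl(fs_fd, BCH_IOCTL_READ_SUPER, &s);
+ */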
+
+/*
+ * BCH_IOCTL_DISK_GET_IDX: given a path to a block device, query the
+ * filesystem to determine whether the disk is an (online) member - if so,
+ * returns the device's index
+ *
+ * Returns -ENOENT if not found
+ */
+struct bch_ioctl_disk_get_idx {
+ __u64 dev;
+};
+
+/*
+ * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
+ *
+ * @dev - member to resize
+ * @nbuckets - new number of buckets
+ */
+struct bch_ioctl_disk_resize {
+ __u32 flags;
+ __u32 pad;
+ __u64 dev;
+ __u64 nbuckets;
+};
+
+/*
+ * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device
+ *
+ * @dev - member to resize
+ * @nbuckets - new number of buckets
+ */
+struct bch_ioctl_disk_resize_journal {
+ __u32 flags;
+ __u32 pad;
+ __u64 dev;
+ __u64 nbuckets;
+};
+
+struct bch_ioctl_subvolume {
+ __u32 flags;
+ __u32 dirfd;
+ __u16 mode;
+ __u16 pad[3];
+ __u64 dst_ptr;
+ __u64 src_ptr;
+};
+
+#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0)
+#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1)
+
+#endif /* _BCACHEFS_IOCTL_H */
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
new file mode 100644
index 000000000000..abdb05507d16
--- /dev/null
+++ b/fs/bcachefs/bkey.c
@@ -0,0 +1,1120 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey.h"
+#include "bkey_cmp.h"
+#include "bkey_methods.h"
+#include "bset.h"
+#include "util.h"
+
+const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
+
+void bch2_bkey_packed_to_binary_text(struct printbuf *out,
+ const struct bkey_format *f,
+ const struct bkey_packed *k)
+{
+ const u64 *p = high_word(f, k);
+ unsigned word_bits = 64 - high_bit_offset;
+ unsigned nr_key_bits = bkey_format_key_bits(f) + high_bit_offset;
+ u64 v = *p & (~0ULL >> high_bit_offset);
+
+ if (!nr_key_bits) {
+ prt_str(out, "(empty)");
+ return;
+ }
+
+ while (1) {
+ unsigned next_key_bits = nr_key_bits;
+
+ if (nr_key_bits < 64) {
+ v >>= 64 - nr_key_bits;
+ next_key_bits = 0;
+ } else {
+ next_key_bits -= 64;
+ }
+
+ bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits));
+
+ if (!next_key_bits)
+ break;
+
+ prt_char(out, ' ');
+
+ p = next_word(p);
+ v = *p;
+ word_bits = 64;
+ nr_key_bits = next_key_bits;
+ }
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
+ const struct bkey *unpacked,
+ const struct bkey_format *format)
+{
+ struct bkey tmp;
+
+ BUG_ON(bkeyp_val_u64s(format, packed) !=
+ bkey_val_u64s(unpacked));
+
+ BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed));
+
+ tmp = __bch2_bkey_unpack_key(format, packed);
+
+ if (memcmp(&tmp, unpacked, sizeof(struct bkey))) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "keys differ: format u64s %u fields %u %u %u %u %u\n",
+ format->key_u64s,
+ format->bits_per_field[0],
+ format->bits_per_field[1],
+ format->bits_per_field[2],
+ format->bits_per_field[3],
+ format->bits_per_field[4]);
+
+ prt_printf(&buf, "compiled unpack: ");
+ bch2_bkey_to_text(&buf, unpacked);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "c unpack: ");
+ bch2_bkey_to_text(&buf, &tmp);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "compiled unpack: ");
+ bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
+ (struct bkey_packed *) unpacked);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "c unpack: ");
+ bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current,
+ (struct bkey_packed *) &tmp);
+ prt_newline(&buf);
+
+ panic("%s", buf.buf);
+ }
+}
+
+#else
+static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
+ const struct bkey *unpacked,
+ const struct bkey_format *format) {}
+#endif
+
+struct pack_state {
+ const struct bkey_format *format;
+ unsigned bits; /* bits remaining in current word */
+ u64 w; /* current word */
+ u64 *p; /* pointer to next word */
+};
+
+__always_inline
+static struct pack_state pack_state_init(const struct bkey_format *format,
+ struct bkey_packed *k)
+{
+ u64 *p = high_word(format, k);
+
+ return (struct pack_state) {
+ .format = format,
+ .bits = 64 - high_bit_offset,
+ .w = 0,
+ .p = p,
+ };
+}
+
+__always_inline
+static void pack_state_finish(struct pack_state *state,
+ struct bkey_packed *k)
+{
+ EBUG_ON(state->p < k->_data);
+ EBUG_ON(state->p >= (u64 *) k->_data + state->format->key_u64s);
+
+ *state->p = state->w;
+}
+
+struct unpack_state {
+ const struct bkey_format *format;
+ unsigned bits; /* bits remaining in current word */
+ u64 w; /* current word */
+ const u64 *p; /* pointer to next word */
+};
+
+__always_inline
+static struct unpack_state unpack_state_init(const struct bkey_format *format,
+ const struct bkey_packed *k)
+{
+ const u64 *p = high_word(format, k);
+
+ return (struct unpack_state) {
+ .format = format,
+ .bits = 64 - high_bit_offset,
+ .w = *p << high_bit_offset,
+ .p = p,
+ };
+}
+
+__always_inline
+static u64 get_inc_field(struct unpack_state *state, unsigned field)
+{
+ unsigned bits = state->format->bits_per_field[field];
+ u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]);
+
+ if (bits >= state->bits) {
+ v = state->w >> (64 - bits);
+ bits -= state->bits;
+
+ state->p = next_word(state->p);
+ state->w = *state->p;
+ state->bits = 64;
+ }
+
+ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */
+ v |= (state->w >> 1) >> (63 - bits);
+ state->w <<= bits;
+ state->bits -= bits;
+
+ return v + offset;
+}
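+
+/*
+ * Worked example of the field_offset handling above: with
+ * bits_per_field[BKEY_FIELD_INODE] == 20 and field_offset == 4096, packing
+ * inode 5000 stores 5000 - 4096 = 904 in 20 bits (see set_inc_field() below),
+ * and unpacking returns 904 + 4096 = 5000 here.
+ */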
+
+__always_inline
+static void __set_inc_field(struct pack_state *state, unsigned field, u64 v)
+{
+ unsigned bits = state->format->bits_per_field[field];
+
+ if (bits) {
+ if (bits > state->bits) {
+ bits -= state->bits;
+ /* avoid shift by 64 if bits is 64 - bits is never 0 here: */
+ state->w |= (v >> 1) >> (bits - 1);
+
+ *state->p = state->w;
+ state->p = next_word(state->p);
+ state->w = 0;
+ state->bits = 64;
+ }
+
+ state->bits -= bits;
+ state->w |= v << state->bits;
+ }
+}
+
+__always_inline
+static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
+{
+ unsigned bits = state->format->bits_per_field[field];
+ u64 offset = le64_to_cpu(state->format->field_offset[field]);
+
+ if (v < offset)
+ return false;
+
+ v -= offset;
+
+ if (fls64(v) > bits)
+ return false;
+
+ __set_inc_field(state, field, v);
+ return true;
+}
+
+/*
+ * Note: does NOT set out->format (we don't know what it should be here!)
+ *
+ * Also: doesn't work on extents - it doesn't preserve the invariant that
+ * if k is packed bkey_start_pos(k) will successfully pack
+ */
+static bool bch2_bkey_transform_key(const struct bkey_format *out_f,
+ struct bkey_packed *out,
+ const struct bkey_format *in_f,
+ const struct bkey_packed *in)
+{
+ struct pack_state out_s = pack_state_init(out_f, out);
+ struct unpack_state in_s = unpack_state_init(in_f, in);
+ u64 *w = out->_data;
+ unsigned i;
+
+ *w = 0;
+
+ for (i = 0; i < BKEY_NR_FIELDS; i++)
+ if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i)))
+ return false;
+
+ /* Can't happen because the val would be too big to unpack: */
+ EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX);
+
+ pack_state_finish(&out_s, out);
+ out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s;
+ out->needs_whiteout = in->needs_whiteout;
+ out->type = in->type;
+
+ return true;
+}
+
+bool bch2_bkey_transform(const struct bkey_format *out_f,
+ struct bkey_packed *out,
+ const struct bkey_format *in_f,
+ const struct bkey_packed *in)
+{
+ if (!bch2_bkey_transform_key(out_f, out, in_f, in))
+ return false;
+
+ memcpy_u64s((u64 *) out + out_f->key_u64s,
+ (u64 *) in + in_f->key_u64s,
+ (in->u64s - in_f->key_u64s));
+ return true;
+}
+
+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format,
+ const struct bkey_packed *in)
+{
+ struct unpack_state state = unpack_state_init(format, in);
+ struct bkey out;
+
+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
+ EBUG_ON(in->u64s < format->key_u64s);
+ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
+ EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX);
+
+ out.u64s = BKEY_U64s + in->u64s - format->key_u64s;
+ out.format = KEY_FORMAT_CURRENT;
+ out.needs_whiteout = in->needs_whiteout;
+ out.type = in->type;
+ out.pad[0] = 0;
+
+#define x(id, field) out.field = get_inc_field(&state, id);
+ bkey_fields()
+#undef x
+
+ return out;
+}
+
+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
+struct bpos __bkey_unpack_pos(const struct bkey_format *format,
+ const struct bkey_packed *in)
+{
+ struct unpack_state state = unpack_state_init(format, in);
+ struct bpos out;
+
+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
+ EBUG_ON(in->u64s < format->key_u64s);
+ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
+
+ out.inode = get_inc_field(&state, BKEY_FIELD_INODE);
+ out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET);
+ out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
+
+ return out;
+}
+#endif
+
+/**
+ * bch2_bkey_pack_key -- pack just the key, not the value
+ * @out: packed result
+ * @in: key to pack
+ * @format: format of packed result
+ *
+ * Returns: true on success, false on failure
+ */
+bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
+ const struct bkey_format *format)
+{
+ struct pack_state state = pack_state_init(format, out);
+ u64 *w = out->_data;
+
+ EBUG_ON((void *) in == (void *) out);
+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
+ EBUG_ON(in->format != KEY_FORMAT_CURRENT);
+
+ *w = 0;
+
+#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false;
+ bkey_fields()
+#undef x
+ pack_state_finish(&state, out);
+ out->u64s = format->key_u64s + in->u64s - BKEY_U64s;
+ out->format = KEY_FORMAT_LOCAL_BTREE;
+ out->needs_whiteout = in->needs_whiteout;
+ out->type = in->type;
+
+ bch2_bkey_pack_verify(out, in, format);
+ return true;
+}
+
+/**
+ * bch2_bkey_unpack -- unpack the key and the value
+ * @b: btree node of @src key (for packed format)
+ * @dst: unpacked result
+ * @src: packed input
+ */
+void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
+ const struct bkey_packed *src)
+{
+ __bkey_unpack_key(b, &dst->k, src);
+
+ memcpy_u64s(&dst->v,
+ bkeyp_val(&b->format, src),
+ bkeyp_val_u64s(&b->format, src));
+}
+
+/**
+ * bch2_bkey_pack -- pack the key and the value
+ * @dst: packed result
+ * @src: unpacked input
+ * @format: format of packed result
+ *
+ * Returns: true on success, false on failure
+ */
+bool bch2_bkey_pack(struct bkey_packed *dst, const struct bkey_i *src,
+ const struct bkey_format *format)
+{
+ struct bkey_packed tmp;
+
+ if (!bch2_bkey_pack_key(&tmp, &src->k, format))
+ return false;
+
+ memmove_u64s((u64 *) dst + format->key_u64s,
+ &src->v,
+ bkey_val_u64s(&src->k));
+ memcpy_u64s_small(dst, &tmp, format->key_u64s);
+
+ return true;
+}
+
+__always_inline
+static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
+{
+ unsigned bits = state->format->bits_per_field[field];
+ u64 offset = le64_to_cpu(state->format->field_offset[field]);
+ bool ret = true;
+
+ EBUG_ON(v < offset);
+ v -= offset;
+
+ if (fls64(v) > bits) {
+ v = ~(~0ULL << bits);
+ ret = false;
+ }
+
+ __set_inc_field(state, field, v);
+ return ret;
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+static bool bkey_packed_successor(struct bkey_packed *out,
+ const struct btree *b,
+ struct bkey_packed k)
+{
+ const struct bkey_format *f = &b->format;
+ unsigned nr_key_bits = b->nr_key_bits;
+ unsigned first_bit, offset;
+ u64 *p;
+
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
+
+ if (!nr_key_bits)
+ return false;
+
+ *out = k;
+
+ first_bit = high_bit_offset + nr_key_bits - 1;
+ p = nth_word(high_word(f, out), first_bit >> 6);
+ offset = 63 - (first_bit & 63);
+
+ while (nr_key_bits) {
+ unsigned bits = min(64 - offset, nr_key_bits);
+ u64 mask = (~0ULL >> (64 - bits)) << offset;
+
+ if ((*p & mask) != mask) {
+ *p += 1ULL << offset;
+ EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0);
+ return true;
+ }
+
+ *p &= ~mask;
+ p = prev_word(p);
+ nr_key_bits -= bits;
+ offset = 0;
+ }
+
+ return false;
+}
+
+static bool bkey_format_has_too_big_fields(const struct bkey_format *f)
+{
+ for (unsigned i = 0; i < f->nr_fields; i++) {
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+ u64 packed_max = f->bits_per_field[i]
+ ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
+ : 0;
+ u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+ if (packed_max + field_offset < packed_max ||
+ packed_max + field_offset > unpacked_max)
+ return true;
+ }
+
+ return false;
+}
+#endif
+
+/*
+ * Returns a packed key that compares <= in
+ *
+ * This is used in bset_search_tree(), where we need a packed pos in order to be
+ * able to compare against the keys in the auxiliary search tree - and it's
+ * legal to use a packed pos that isn't equivalent to the original pos,
+ * _provided_ it compares <= to the original pos.
+ */
+enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
+ struct bpos in,
+ const struct btree *b)
+{
+ const struct bkey_format *f = &b->format;
+ struct pack_state state = pack_state_init(f, out);
+ u64 *w = out->_data;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct bpos orig = in;
+#endif
+ bool exact = true;
+ unsigned i;
+
+	/*
+	 * bch2_bkey_pack_key() writes to all of f->key_u64s (minus the 3 byte
+	 * header), but this function only packs the pos fields - if the format
+	 * allocates bits to the size/version fields, those bits would be left
+	 * uninitialized, so zero the whole key first:
+	 */
+ for (i = 0; i < f->key_u64s; i++)
+ w[i] = 0;
+
+ if (unlikely(in.snapshot <
+ le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) {
+ if (!in.offset-- &&
+ !in.inode--)
+ return BKEY_PACK_POS_FAIL;
+ in.snapshot = KEY_SNAPSHOT_MAX;
+ exact = false;
+ }
+
+ if (unlikely(in.offset <
+ le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) {
+ if (!in.inode--)
+ return BKEY_PACK_POS_FAIL;
+ in.offset = KEY_OFFSET_MAX;
+ in.snapshot = KEY_SNAPSHOT_MAX;
+ exact = false;
+ }
+
+ if (unlikely(in.inode <
+ le64_to_cpu(f->field_offset[BKEY_FIELD_INODE])))
+ return BKEY_PACK_POS_FAIL;
+
+ if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode))) {
+ in.offset = KEY_OFFSET_MAX;
+ in.snapshot = KEY_SNAPSHOT_MAX;
+ exact = false;
+ }
+
+ if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset))) {
+ in.snapshot = KEY_SNAPSHOT_MAX;
+ exact = false;
+ }
+
+ if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)))
+ exact = false;
+
+ pack_state_finish(&state, out);
+ out->u64s = f->key_u64s;
+ out->format = KEY_FORMAT_LOCAL_BTREE;
+ out->type = KEY_TYPE_deleted;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ if (exact) {
+ BUG_ON(bkey_cmp_left_packed(b, out, &orig));
+ } else {
+ struct bkey_packed successor;
+
+ BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
+ BUG_ON(bkey_packed_successor(&successor, b, *out) &&
+ bkey_cmp_left_packed(b, &successor, &orig) < 0 &&
+ !bkey_format_has_too_big_fields(f));
+ }
+#endif
+
+ return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER;
+}
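+
+/*
+ * Informal illustration of the lossy path above: if a field's value does not
+ * fit in the bits the format allocates to it, set_inc_field_lossy() clamps
+ * that field to its packed maximum and the remaining lower-priority fields
+ * are forced to their maxima before packing, so the result still compares
+ * <= the original position and BKEY_PACK_POS_SMALLER is returned.
+ */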
+
+void bch2_bkey_format_init(struct bkey_format_state *s)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(s->field_min); i++)
+ s->field_min[i] = U64_MAX;
+
+ for (i = 0; i < ARRAY_SIZE(s->field_max); i++)
+ s->field_max[i] = 0;
+
+ /* Make sure we can store a size of 0: */
+ s->field_min[BKEY_FIELD_SIZE] = 0;
+}
+
+void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
+{
+ unsigned field = 0;
+
+ __bkey_format_add(s, field++, p.inode);
+ __bkey_format_add(s, field++, p.offset);
+ __bkey_format_add(s, field++, p.snapshot);
+}
+
+/*
+ * We don't want it to be possible for the packed format to represent fields
+ * bigger than a u64... that will cause confusion and issues (like with
+ * bkey_packed_successor())
+ */
+static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i,
+ unsigned bits, u64 offset)
+{
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+
+ bits = min(bits, unpacked_bits);
+
+ offset = bits == unpacked_bits ? 0 : min(offset, unpacked_max - ((1ULL << bits) - 1));
+
+ f->bits_per_field[i] = bits;
+ f->field_offset[i] = cpu_to_le64(offset);
+}
+
+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
+{
+ unsigned i, bits = KEY_PACKED_BITS_START;
+ struct bkey_format ret = {
+ .nr_fields = BKEY_NR_FIELDS,
+ };
+
+ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) {
+ s->field_min[i] = min(s->field_min[i], s->field_max[i]);
+
+ set_format_field(&ret, i,
+ fls64(s->field_max[i] - s->field_min[i]),
+ s->field_min[i]);
+
+ bits += ret.bits_per_field[i];
+ }
+
+ /* allow for extent merging: */
+ if (ret.bits_per_field[BKEY_FIELD_SIZE]) {
+ unsigned b = min(4U, 32U - ret.bits_per_field[BKEY_FIELD_SIZE]);
+
+ ret.bits_per_field[BKEY_FIELD_SIZE] += b;
+ bits += b;
+ }
+
+ ret.key_u64s = DIV_ROUND_UP(bits, 64);
+
+ /* if we have enough spare bits, round fields up to nearest byte */
+ bits = ret.key_u64s * 64 - bits;
+
+ for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) {
+ unsigned r = round_up(ret.bits_per_field[i], 8) -
+ ret.bits_per_field[i];
+
+ if (r <= bits) {
+ set_format_field(&ret, i,
+ ret.bits_per_field[i] + r,
+ le64_to_cpu(ret.field_offset[i]));
+ bits -= r;
+ }
+ }
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ {
+ struct printbuf buf = PRINTBUF;
+
+ BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf));
+ printbuf_exit(&buf);
+ }
+#endif
+ return ret;
+}
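+
+/*
+ * Informal example (made-up numbers): if every key added to @s had an inode
+ * field in [4096, 4223], the resulting format gives the inode field a
+ * field_offset of 4096 and 7 bits_per_field (possibly rounded up to 8 by the
+ * byte-rounding pass above), so packed keys store inode - 4096 in a handful
+ * of bits instead of a full u64.
+ */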
+
+int bch2_bkey_format_invalid(struct bch_fs *c,
+ struct bkey_format *f,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ unsigned i, bits = KEY_PACKED_BITS_START;
+
+ if (f->nr_fields != BKEY_NR_FIELDS) {
+ prt_printf(err, "incorrect number of fields: got %u, should be %u",
+ f->nr_fields, BKEY_NR_FIELDS);
+ return -BCH_ERR_invalid;
+ }
+
+ /*
+ * Verify that the packed format can't represent fields larger than the
+ * unpacked format:
+ */
+ for (i = 0; i < f->nr_fields; i++) {
+ if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) {
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+ u64 packed_max = f->bits_per_field[i]
+ ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
+ : 0;
+ u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+ if (packed_max + field_offset < packed_max ||
+ packed_max + field_offset > unpacked_max) {
+ prt_printf(err, "field %u too large: %llu + %llu > %llu",
+ i, packed_max, field_offset, unpacked_max);
+ return -BCH_ERR_invalid;
+ }
+ }
+
+ bits += f->bits_per_field[i];
+ }
+
+ if (f->key_u64s != DIV_ROUND_UP(bits, 64)) {
+ prt_printf(err, "incorrect key_u64s: got %u, should be %u",
+ f->key_u64s, DIV_ROUND_UP(bits, 64));
+ return -BCH_ERR_invalid;
+ }
+
+ return 0;
+}
+
+void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f)
+{
+ prt_printf(out, "u64s %u fields ", f->key_u64s);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) {
+ if (i)
+ prt_str(out, ", ");
+ prt_printf(out, "%u:%llu",
+ f->bits_per_field[i],
+ le64_to_cpu(f->field_offset[i]));
+ }
+}
+
+/*
+ * Most significant differing bit
+ * Bits are indexed from 0 - return is [0, nr_key_bits)
+ */
+__pure
+unsigned bch2_bkey_greatest_differing_bit(const struct btree *b,
+ const struct bkey_packed *l_k,
+ const struct bkey_packed *r_k)
+{
+ const u64 *l = high_word(&b->format, l_k);
+ const u64 *r = high_word(&b->format, r_k);
+ unsigned nr_key_bits = b->nr_key_bits;
+ unsigned word_bits = 64 - high_bit_offset;
+ u64 l_v, r_v;
+
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
+
+ /* for big endian, skip past header */
+ l_v = *l & (~0ULL >> high_bit_offset);
+ r_v = *r & (~0ULL >> high_bit_offset);
+
+ while (nr_key_bits) {
+ if (nr_key_bits < word_bits) {
+ l_v >>= word_bits - nr_key_bits;
+ r_v >>= word_bits - nr_key_bits;
+ nr_key_bits = 0;
+ } else {
+ nr_key_bits -= word_bits;
+ }
+
+ if (l_v != r_v)
+ return fls64(l_v ^ r_v) - 1 + nr_key_bits;
+
+ l = next_word(l);
+ r = next_word(r);
+
+ l_v = *l;
+ r_v = *r;
+ word_bits = 64;
+ }
+
+ return 0;
+}
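+
+/*
+ * For example: keys that differ only in their lowest key bit return 0, keys
+ * that differ in their highest key bit return nr_key_bits - 1, and identical
+ * keys also return 0.
+ */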
+
+/*
+ * First set bit
+ * Bits are indexed from 0 - return is [0, nr_key_bits)
+ */
+__pure
+unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k)
+{
+ const u64 *p = high_word(&b->format, k);
+ unsigned nr_key_bits = b->nr_key_bits;
+ unsigned ret = 0, offset;
+
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
+
+ offset = nr_key_bits;
+ while (offset > 64) {
+ p = next_word(p);
+ offset -= 64;
+ }
+
+ offset = 64 - offset;
+
+ while (nr_key_bits) {
+ unsigned bits = nr_key_bits + offset < 64
+ ? nr_key_bits
+ : 64 - offset;
+
+ u64 mask = (~0ULL >> (64 - bits)) << offset;
+
+ if (*p & mask)
+ return ret + __ffs64(*p & mask) - offset;
+
+ p = prev_word(p);
+ nr_key_bits -= bits;
+ ret += bits;
+ offset = 0;
+ }
+
+ return 0;
+}
+
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
+
+#define I(_x) (*(out)++ = (_x))
+#define I1(i0) I(i0)
+#define I2(i0, i1) (I1(i0), I(i1))
+#define I3(i0, i1, i2) (I2(i0, i1), I(i2))
+#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3))
+#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4))
+
+static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
+ enum bch_bkey_fields field,
+ unsigned dst_offset, unsigned dst_size,
+ bool *eax_zeroed)
+{
+ unsigned bits = format->bits_per_field[field];
+ u64 offset = le64_to_cpu(format->field_offset[field]);
+ unsigned i, byte, bit_offset, align, shl, shr;
+
+ if (!bits && !offset) {
+ if (!*eax_zeroed) {
+ /* xor eax, eax */
+ I2(0x31, 0xc0);
+ }
+
+ *eax_zeroed = true;
+ goto set_field;
+ }
+
+ if (!bits) {
+ /* just return offset: */
+
+ switch (dst_size) {
+ case 8:
+ if (offset > S32_MAX) {
+ /* mov [rdi + dst_offset], offset */
+ I3(0xc7, 0x47, dst_offset);
+ memcpy(out, &offset, 4);
+ out += 4;
+
+ I3(0xc7, 0x47, dst_offset + 4);
+ memcpy(out, (void *) &offset + 4, 4);
+ out += 4;
+ } else {
+ /* mov [rdi + dst_offset], offset */
+ /* sign extended */
+ I4(0x48, 0xc7, 0x47, dst_offset);
+ memcpy(out, &offset, 4);
+ out += 4;
+ }
+ break;
+ case 4:
+ /* mov [rdi + dst_offset], offset */
+ I3(0xc7, 0x47, dst_offset);
+ memcpy(out, &offset, 4);
+ out += 4;
+ break;
+ default:
+ BUG();
+ }
+
+ return out;
+ }
+
+ bit_offset = format->key_u64s * 64;
+ for (i = 0; i <= field; i++)
+ bit_offset -= format->bits_per_field[i];
+
+ byte = bit_offset / 8;
+ bit_offset -= byte * 8;
+
+ *eax_zeroed = false;
+
+ if (bit_offset == 0 && bits == 8) {
+ /* movzx eax, BYTE PTR [rsi + imm8] */
+ I4(0x0f, 0xb6, 0x46, byte);
+ } else if (bit_offset == 0 && bits == 16) {
+ /* movzx eax, WORD PTR [rsi + imm8] */
+ I4(0x0f, 0xb7, 0x46, byte);
+ } else if (bit_offset + bits <= 32) {
+ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
+ byte -= align;
+ bit_offset += align * 8;
+
+ BUG_ON(bit_offset + bits > 32);
+
+ /* mov eax, [rsi + imm8] */
+ I3(0x8b, 0x46, byte);
+
+ if (bit_offset) {
+ /* shr eax, imm8 */
+ I3(0xc1, 0xe8, bit_offset);
+ }
+
+ if (bit_offset + bits < 32) {
+ unsigned mask = ~0U >> (32 - bits);
+
+ /* and eax, imm32 */
+ I1(0x25);
+ memcpy(out, &mask, 4);
+ out += 4;
+ }
+ } else if (bit_offset + bits <= 64) {
+ align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7);
+ byte -= align;
+ bit_offset += align * 8;
+
+ BUG_ON(bit_offset + bits > 64);
+
+ /* mov rax, [rsi + imm8] */
+ I4(0x48, 0x8b, 0x46, byte);
+
+ shl = 64 - bit_offset - bits;
+ shr = bit_offset + shl;
+
+ if (shl) {
+ /* shl rax, imm8 */
+ I4(0x48, 0xc1, 0xe0, shl);
+ }
+
+ if (shr) {
+ /* shr rax, imm8 */
+ I4(0x48, 0xc1, 0xe8, shr);
+ }
+ } else {
+ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
+ byte -= align;
+ bit_offset += align * 8;
+
+ BUG_ON(bit_offset + bits > 96);
+
+ /* mov rax, [rsi + byte] */
+ I4(0x48, 0x8b, 0x46, byte);
+
+ /* mov edx, [rsi + byte + 8] */
+ I3(0x8b, 0x56, byte + 8);
+
+ /* bits from next word: */
+ shr = bit_offset + bits - 64;
+ BUG_ON(shr > bit_offset);
+
+ /* shr rax, bit_offset */
+ I4(0x48, 0xc1, 0xe8, shr);
+
+ /* shl rdx, imm8 */
+ I4(0x48, 0xc1, 0xe2, 64 - shr);
+
+ /* or rax, rdx */
+ I3(0x48, 0x09, 0xd0);
+
+ shr = bit_offset - shr;
+
+ if (shr) {
+ /* shr rax, imm8 */
+ I4(0x48, 0xc1, 0xe8, shr);
+ }
+ }
+
+ /* rax += offset: */
+ if (offset > S32_MAX) {
+ /* mov rdx, imm64 */
+ I2(0x48, 0xba);
+ memcpy(out, &offset, 8);
+ out += 8;
+ /* add %rdx, %rax */
+ I3(0x48, 0x01, 0xd0);
+ } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) {
+ /* add rax, imm32 */
+ I2(0x48, 0x05);
+ memcpy(out, &offset, 4);
+ out += 4;
+ } else if (offset) {
+ /* add eax, imm32 */
+ I1(0x05);
+ memcpy(out, &offset, 4);
+ out += 4;
+ }
+set_field:
+ switch (dst_size) {
+ case 8:
+ /* mov [rdi + dst_offset], rax */
+ I4(0x48, 0x89, 0x47, dst_offset);
+ break;
+ case 4:
+ /* mov [rdi + dst_offset], eax */
+ I3(0x89, 0x47, dst_offset);
+ break;
+ default:
+ BUG();
+ }
+
+ return out;
+}
+
+int bch2_compile_bkey_format(const struct bkey_format *format, void *_out)
+{
+ bool eax_zeroed = false;
+ u8 *out = _out;
+
+ /*
+ * rdi: dst - unpacked key
+ * rsi: src - packed key
+ */
+
+ /* k->u64s, k->format, k->type */
+
+ /* mov eax, [rsi] */
+ I2(0x8b, 0x06);
+
+ /* add eax, BKEY_U64s - format->key_u64s */
+ I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0);
+
+ /* and eax, imm32: mask out k->pad: */
+ I5(0x25, 0xff, 0xff, 0xff, 0);
+
+ /* mov [rdi], eax */
+ I2(0x89, 0x07);
+
+#define x(id, field) \
+ out = compile_bkey_field(format, out, id, \
+ offsetof(struct bkey, field), \
+ sizeof(((struct bkey *) NULL)->field), \
+ &eax_zeroed);
+ bkey_fields()
+#undef x
+
+ /* retq */
+ I1(0xc3);
+
+ return (void *) out - _out;
+}
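+
+/*
+ * The buffer emitted above is a complete x86-64 function with the same
+ * signature as compiled_unpack_fn in bkey.h, i.e.
+ *
+ *	void unpack(struct bkey *dst, const struct bkey_packed *src);
+ *
+ * dst arrives in rdi and src in rsi per the SysV AMD64 calling convention,
+ * which is what the register comments above assume.
+ */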
+
+#else
+#endif
+
+__pure
+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l,
+ const struct bkey_packed *r,
+ const struct btree *b)
+{
+ return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
+}
+
+__pure __flatten
+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bpos *r)
+{
+ return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r);
+}
+
+__pure __flatten
+int bch2_bkey_cmp_packed(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
+{
+ return bch2_bkey_cmp_packed_inlined(b, l, r);
+}
+
+__pure __flatten
+int __bch2_bkey_cmp_left_packed(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bpos *r)
+{
+ const struct bkey *l_unpacked;
+
+ return unlikely(l_unpacked = packed_to_bkey_c(l))
+ ? bpos_cmp(l_unpacked->p, *r)
+ : __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
+}
+
+void bch2_bpos_swab(struct bpos *p)
+{
+ u8 *l = (u8 *) p;
+ u8 *h = ((u8 *) &p[1]) - 1;
+
+ while (l < h) {
+ swap(*l, *h);
+ l++;
+ --h;
+ }
+}
+
+void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k)
+{
+ const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current;
+ u8 *l = k->key_start;
+ u8 *h = (u8 *) (k->_data + f->key_u64s) - 1;
+
+ while (l < h) {
+ swap(*l, *h);
+ l++;
+ --h;
+ }
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_bkey_pack_test(void)
+{
+ struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0);
+ struct bkey_packed p;
+
+ struct bkey_format test_format = {
+ .key_u64s = 3,
+ .nr_fields = BKEY_NR_FIELDS,
+ .bits_per_field = {
+ 13,
+ 64,
+ 32,
+ },
+ };
+
+ struct unpack_state in_s =
+ unpack_state_init(&bch2_bkey_format_current, (void *) &t);
+ struct pack_state out_s = pack_state_init(&test_format, &p);
+ unsigned i;
+
+ for (i = 0; i < out_s.format->nr_fields; i++) {
+ u64 a, v = get_inc_field(&in_s, i);
+
+ switch (i) {
+#define x(id, field) case id: a = t.field; break;
+ bkey_fields()
+#undef x
+ default:
+ BUG();
+ }
+
+ if (a != v)
+ panic("got %llu actual %llu i %u\n", v, a, i);
+
+ if (!set_inc_field(&out_s, i, v))
+ panic("failed at %u\n", i);
+ }
+
+ BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format));
+}
+#endif
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
new file mode 100644
index 000000000000..831be01809f2
--- /dev/null
+++ b/fs/bcachefs/bkey.h
@@ -0,0 +1,778 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_H
+#define _BCACHEFS_BKEY_H
+
+#include <linux/bug.h>
+#include "bcachefs_format.h"
+
+#include "btree_types.h"
+#include "util.h"
+#include "vstructs.h"
+
+enum bkey_invalid_flags {
+ BKEY_INVALID_WRITE = (1U << 0),
+ BKEY_INVALID_COMMIT = (1U << 1),
+ BKEY_INVALID_JOURNAL = (1U << 2),
+};
+
+#if 0
+
+/*
+ * compiled unpack functions are disabled, pending a new interface for
+ * dynamically allocating executable memory:
+ */
+
+#ifdef CONFIG_X86_64
+#define HAVE_BCACHEFS_COMPILED_UNPACK 1
+#endif
+#endif
+
+void bch2_bkey_packed_to_binary_text(struct printbuf *,
+ const struct bkey_format *,
+ const struct bkey_packed *);
+
+/* bkey with split value, const */
+struct bkey_s_c {
+ const struct bkey *k;
+ const struct bch_val *v;
+};
+
+/* bkey with split value */
+struct bkey_s {
+ union {
+ struct {
+ struct bkey *k;
+ struct bch_val *v;
+ };
+ struct bkey_s_c s_c;
+ };
+};
+
+#define bkey_p_next(_k) vstruct_next(_k)
+
+static inline struct bkey_i *bkey_next(struct bkey_i *k)
+{
+ return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
+}
+
+#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
+
+static inline size_t bkey_val_bytes(const struct bkey *k)
+{
+ return bkey_val_u64s(k) * sizeof(u64);
+}
+
+static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
+{
+ unsigned u64s = BKEY_U64s + val_u64s;
+
+ BUG_ON(u64s > U8_MAX);
+ k->u64s = u64s;
+}
+
+static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
+{
+ set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
+}
+
+#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
+
+#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
+
+#define bkey_whiteout(_k) \
+ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
+
+enum bkey_lr_packed {
+ BKEY_PACKED_BOTH,
+ BKEY_PACKED_RIGHT,
+ BKEY_PACKED_LEFT,
+ BKEY_PACKED_NONE,
+};
+
+#define bkey_lr_packed(_l, _r) \
+ ((_l)->format + ((_r)->format << 1))
+
+static inline void bkey_p_copy(struct bkey_packed *dst, const struct bkey_packed *src)
+{
+ memcpy_u64s_small(dst, src, src->u64s);
+}
+
+static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src)
+{
+ memcpy_u64s_small(dst, src, src->k.u64s);
+}
+
+struct btree;
+
+__pure
+unsigned bch2_bkey_greatest_differing_bit(const struct btree *,
+ const struct bkey_packed *,
+ const struct bkey_packed *);
+__pure
+unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *);
+
+__pure
+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *,
+ const struct bkey_packed *,
+ const struct btree *);
+
+__pure
+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *,
+ const struct bkey_packed *,
+ const struct bpos *);
+
+__pure
+int bch2_bkey_cmp_packed(const struct btree *,
+ const struct bkey_packed *,
+ const struct bkey_packed *);
+
+__pure
+int __bch2_bkey_cmp_left_packed(const struct btree *,
+ const struct bkey_packed *,
+ const struct bpos *);
+
+static inline __pure
+int bkey_cmp_left_packed(const struct btree *b,
+ const struct bkey_packed *l, const struct bpos *r)
+{
+ return __bch2_bkey_cmp_left_packed(b, l, r);
+}
+
+/*
+ * The compiler generates better code when we pass bpos by ref, but it's often
+ * much more convenient to pass it by val... as much as I hate C++, const ref
+ * would be nice here:
+ */
+__pure __flatten
+static inline int bkey_cmp_left_packed_byval(const struct btree *b,
+ const struct bkey_packed *l,
+ struct bpos r)
+{
+ return bkey_cmp_left_packed(b, l, &r);
+}
+
+static __always_inline bool bpos_eq(struct bpos l, struct bpos r)
+{
+ return !((l.inode ^ r.inode) |
+ (l.offset ^ r.offset) |
+ (l.snapshot ^ r.snapshot));
+}
+
+static __always_inline bool bpos_lt(struct bpos l, struct bpos r)
+{
+ return l.inode != r.inode ? l.inode < r.inode :
+ l.offset != r.offset ? l.offset < r.offset :
+ l.snapshot != r.snapshot ? l.snapshot < r.snapshot : false;
+}
+
+static __always_inline bool bpos_le(struct bpos l, struct bpos r)
+{
+ return l.inode != r.inode ? l.inode < r.inode :
+ l.offset != r.offset ? l.offset < r.offset :
+ l.snapshot != r.snapshot ? l.snapshot < r.snapshot : true;
+}
+
+static __always_inline bool bpos_gt(struct bpos l, struct bpos r)
+{
+ return bpos_lt(r, l);
+}
+
+static __always_inline bool bpos_ge(struct bpos l, struct bpos r)
+{
+ return bpos_le(r, l);
+}
+
+static __always_inline int bpos_cmp(struct bpos l, struct bpos r)
+{
+ return cmp_int(l.inode, r.inode) ?:
+ cmp_int(l.offset, r.offset) ?:
+ cmp_int(l.snapshot, r.snapshot);
+}
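+
+/*
+ * Positions order lexicographically by (inode, offset, snapshot): e.g. a
+ * position with inode 1, offset 100, snapshot 7 sorts before one with inode 1,
+ * offset 101, snapshot 0, because the offsets differ before the snapshot field
+ * is consulted.
+ */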
+
+static inline struct bpos bpos_min(struct bpos l, struct bpos r)
+{
+ return bpos_lt(l, r) ? l : r;
+}
+
+static inline struct bpos bpos_max(struct bpos l, struct bpos r)
+{
+ return bpos_gt(l, r) ? l : r;
+}
+
+static __always_inline bool bkey_eq(struct bpos l, struct bpos r)
+{
+ return !((l.inode ^ r.inode) |
+ (l.offset ^ r.offset));
+}
+
+static __always_inline bool bkey_lt(struct bpos l, struct bpos r)
+{
+ return l.inode != r.inode
+ ? l.inode < r.inode
+ : l.offset < r.offset;
+}
+
+static __always_inline bool bkey_le(struct bpos l, struct bpos r)
+{
+ return l.inode != r.inode
+ ? l.inode < r.inode
+ : l.offset <= r.offset;
+}
+
+static __always_inline bool bkey_gt(struct bpos l, struct bpos r)
+{
+ return bkey_lt(r, l);
+}
+
+static __always_inline bool bkey_ge(struct bpos l, struct bpos r)
+{
+ return bkey_le(r, l);
+}
+
+static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
+{
+ return cmp_int(l.inode, r.inode) ?:
+ cmp_int(l.offset, r.offset);
+}
+
+static inline struct bpos bkey_min(struct bpos l, struct bpos r)
+{
+ return bkey_lt(l, r) ? l : r;
+}
+
+static inline struct bpos bkey_max(struct bpos l, struct bpos r)
+{
+ return bkey_gt(l, r) ? l : r;
+}
+
+void bch2_bpos_swab(struct bpos *);
+void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
+
+static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
+{
+ return cmp_int(l.hi, r.hi) ?:
+ cmp_int(l.lo, r.lo);
+}
+
+#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 })
+#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL })
+
+static __always_inline int bversion_zero(struct bversion v)
+{
+ return !bversion_cmp(v, ZERO_VERSION);
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+/* statement expressions confusing unlikely()? */
+#define bkey_packed(_k) \
+ ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \
+ (_k)->format != KEY_FORMAT_CURRENT; })
+#else
+#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT)
+#endif
+
+/*
+ * It's safe to treat an unpacked bkey as a packed one, but not the reverse
+ */
+static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k)
+{
+ return (struct bkey_packed *) k;
+}
+
+static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k)
+{
+ return (const struct bkey_packed *) k;
+}
+
+static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k)
+{
+ return bkey_packed(k) ? NULL : (struct bkey_i *) k;
+}
+
+static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k)
+{
+ return bkey_packed(k) ? NULL : (const struct bkey *) k;
+}
+
+static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
+{
+ return format->bits_per_field[BKEY_FIELD_INODE] +
+ format->bits_per_field[BKEY_FIELD_OFFSET] +
+ format->bits_per_field[BKEY_FIELD_SNAPSHOT];
+}
+
+static inline struct bpos bpos_successor(struct bpos p)
+{
+ if (!++p.snapshot &&
+ !++p.offset &&
+ !++p.inode)
+ BUG();
+
+ return p;
+}
+
+static inline struct bpos bpos_predecessor(struct bpos p)
+{
+ if (!p.snapshot-- &&
+ !p.offset-- &&
+ !p.inode--)
+ BUG();
+
+ return p;
+}
+
+static inline struct bpos bpos_nosnap_successor(struct bpos p)
+{
+ p.snapshot = 0;
+
+ if (!++p.offset &&
+ !++p.inode)
+ BUG();
+
+ return p;
+}
+
+static inline struct bpos bpos_nosnap_predecessor(struct bpos p)
+{
+ p.snapshot = 0;
+
+ if (!p.offset-- &&
+ !p.inode--)
+ BUG();
+
+ return p;
+}
+
+static inline u64 bkey_start_offset(const struct bkey *k)
+{
+ return k->p.offset - k->size;
+}
+
+static inline struct bpos bkey_start_pos(const struct bkey *k)
+{
+ return (struct bpos) {
+ .inode = k->p.inode,
+ .offset = bkey_start_offset(k),
+ .snapshot = k->p.snapshot,
+ };
+}
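+
+/*
+ * For extents, k->p is the *end* position and k->size the length: e.g. an
+ * extent with p.offset == 100 and size == 20 covers [80, 100), and
+ * bkey_start_offset() returns 80.
+ */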
+
+/* Packed helpers */
+
+static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
+ const struct bkey_packed *k)
+{
+ unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s;
+
+ EBUG_ON(k->u64s < ret);
+ return ret;
+}
+
+static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
+ const struct bkey_packed *k)
+{
+ return bkeyp_key_u64s(format, k) * sizeof(u64);
+}
+
+static inline unsigned bkeyp_val_u64s(const struct bkey_format *format,
+ const struct bkey_packed *k)
+{
+ return k->u64s - bkeyp_key_u64s(format, k);
+}
+
+static inline size_t bkeyp_val_bytes(const struct bkey_format *format,
+ const struct bkey_packed *k)
+{
+ return bkeyp_val_u64s(format, k) * sizeof(u64);
+}
+
+static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
+ struct bkey_packed *k, unsigned val_u64s)
+{
+ k->u64s = bkeyp_key_u64s(format, k) + val_u64s;
+}
+
+#define bkeyp_val(_format, _k) \
+ ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k)))
+
+extern const struct bkey_format bch2_bkey_format_current;
+
+bool bch2_bkey_transform(const struct bkey_format *,
+ struct bkey_packed *,
+ const struct bkey_format *,
+ const struct bkey_packed *);
+
+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
+ const struct bkey_packed *);
+
+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
+struct bpos __bkey_unpack_pos(const struct bkey_format *,
+ const struct bkey_packed *);
+#endif
+
+bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *,
+ const struct bkey_format *);
+
+enum bkey_pack_pos_ret {
+ BKEY_PACK_POS_EXACT,
+ BKEY_PACK_POS_SMALLER,
+ BKEY_PACK_POS_FAIL,
+};
+
+enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos,
+ const struct btree *);
+
+static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in,
+ const struct btree *b)
+{
+ return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT;
+}
+
+void bch2_bkey_unpack(const struct btree *, struct bkey_i *,
+ const struct bkey_packed *);
+bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *,
+ const struct bkey_format *);
+
+typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
+
+static inline void
+__bkey_unpack_key_format_checked(const struct btree *b,
+ struct bkey *dst,
+ const struct bkey_packed *src)
+{
+ if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) {
+ compiled_unpack_fn unpack_fn = b->aux_data;
+ unpack_fn(dst, src);
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ bch2_expensive_debug_checks) {
+ struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
+
+ BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
+ }
+ } else {
+ *dst = __bch2_bkey_unpack_key(&b->format, src);
+ }
+}
+
+static inline struct bkey
+bkey_unpack_key_format_checked(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ struct bkey dst;
+
+ __bkey_unpack_key_format_checked(b, &dst, src);
+ return dst;
+}
+
+static inline void __bkey_unpack_key(const struct btree *b,
+ struct bkey *dst,
+ const struct bkey_packed *src)
+{
+ if (likely(bkey_packed(src)))
+ __bkey_unpack_key_format_checked(b, dst, src);
+ else
+ *dst = *packed_to_bkey_c(src);
+}
+
+/**
+ * bkey_unpack_key -- unpack just the key, not the value
+ * @b:		btree node @src comes from (for its packed format)
+ * @src:	packed key to unpack
+ *
+ * Returns: the unpacked key
+ */
+static inline struct bkey bkey_unpack_key(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ return likely(bkey_packed(src))
+ ? bkey_unpack_key_format_checked(b, src)
+ : *packed_to_bkey_c(src);
+}
+
+static inline struct bpos
+bkey_unpack_pos_format_checked(const struct btree *b,
+ const struct bkey_packed *src)
+{
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
+ return bkey_unpack_key_format_checked(b, src).p;
+#else
+ return __bkey_unpack_pos(&b->format, src);
+#endif
+}
+
+static inline struct bpos bkey_unpack_pos(const struct btree *b,
+ const struct bkey_packed *src)
+{
+ return likely(bkey_packed(src))
+ ? bkey_unpack_pos_format_checked(b, src)
+ : packed_to_bkey_c(src)->p;
+}
+
+/* Disassembled bkeys */
+
+static inline struct bkey_s_c bkey_disassemble(const struct btree *b,
+ const struct bkey_packed *k,
+ struct bkey *u)
+{
+ __bkey_unpack_key(b, u, k);
+
+ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
+}
+
+/* non const version: */
+static inline struct bkey_s __bkey_disassemble(const struct btree *b,
+ struct bkey_packed *k,
+ struct bkey *u)
+{
+ __bkey_unpack_key(b, u, k);
+
+ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
+}
+
+static inline u64 bkey_field_max(const struct bkey_format *f,
+ enum bch_bkey_fields nr)
+{
+ return f->bits_per_field[nr] < 64
+ ? (le64_to_cpu(f->field_offset[nr]) +
+ ~(~0ULL << f->bits_per_field[nr]))
+ : U64_MAX;
+}
+
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
+
+int bch2_compile_bkey_format(const struct bkey_format *, void *);
+
+#else
+
+static inline int bch2_compile_bkey_format(const struct bkey_format *format,
+ void *out) { return 0; }
+
+#endif
+
+static inline void bkey_reassemble(struct bkey_i *dst,
+ struct bkey_s_c src)
+{
+ dst->k = *src.k;
+ memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
+}
+
+#define bkey_s_null ((struct bkey_s) { .k = NULL })
+#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
+
+#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
+#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
+
+static inline struct bkey_s bkey_to_s(struct bkey *k)
+{
+ return (struct bkey_s) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
+{
+ return (struct bkey_s_c) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
+{
+ return (struct bkey_s) { .k = &k->k, .v = &k->v };
+}
+
+static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
+{
+ return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
+}
+
+/*
+ * For a given type of value (e.g. struct bch_extent), generates the types for
+ * bkey + bch_extent - inline, split, split const - and also all the conversion
+ * functions, which also check that the value is of the correct type.
+ *
+ * We use anonymous unions for upcasting - e.g. converting from a
+ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
+ * functions.
+ */
+#define x(name, ...) \
+struct bkey_i_##name { \
+ union { \
+ struct bkey k; \
+ struct bkey_i k_i; \
+ }; \
+ struct bch_##name v; \
+}; \
+ \
+struct bkey_s_c_##name { \
+ union { \
+ struct { \
+ const struct bkey *k; \
+ const struct bch_##name *v; \
+ }; \
+ struct bkey_s_c s_c; \
+ }; \
+}; \
+ \
+struct bkey_s_##name { \
+ union { \
+ struct { \
+ struct bkey *k; \
+ struct bch_##name *v; \
+ }; \
+ struct bkey_s_c_##name c; \
+ struct bkey_s s; \
+ struct bkey_s_c s_c; \
+ }; \
+}; \
+ \
+static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return container_of(&k->k, struct bkey_i_##name, k); \
+} \
+ \
+static inline const struct bkey_i_##name * \
+bkey_i_to_##name##_c(const struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return container_of(&k->k, struct bkey_i_##name, k); \
+} \
+ \
+static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
+ return (struct bkey_s_##name) { \
+ .k = k.k, \
+ .v = container_of(k.v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
+ return (struct bkey_s_c_##name) { \
+ .k = k.k, \
+ .v = container_of(k.v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
+{ \
+ return (struct bkey_s_##name) { \
+ .k = &k->k, \
+ .v = &k->v, \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name \
+name##_i_to_s_c(const struct bkey_i_##name *k) \
+{ \
+ return (struct bkey_s_c_##name) { \
+ .k = &k->k, \
+ .v = &k->v, \
+ }; \
+} \
+ \
+static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return (struct bkey_s_##name) { \
+ .k = &k->k, \
+ .v = container_of(&k->v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_s_c_##name \
+bkey_i_to_s_c_##name(const struct bkey_i *k) \
+{ \
+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
+ return (struct bkey_s_c_##name) { \
+ .k = &k->k, \
+ .v = container_of(&k->v, struct bch_##name, v), \
+ }; \
+} \
+ \
+static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
+{ \
+ struct bkey_i_##name *k = \
+ container_of(&_k->k, struct bkey_i_##name, k); \
+ \
+ bkey_init(&k->k); \
+ memset(&k->v, 0, sizeof(k->v)); \
+ k->k.type = KEY_TYPE_##name; \
+ set_bkey_val_bytes(&k->k, sizeof(k->v)); \
+ \
+ return k; \
+}
+
+BCH_BKEY_TYPES();
+#undef x
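+
+/*
+ * Informal sketch of the expansion above for a hypothetical value type "foo"
+ * (i.e. KEY_TYPE_foo with value struct bch_foo): the macro generates
+ *
+ *	struct bkey_i_foo, struct bkey_s_foo, struct bkey_s_c_foo
+ *
+ * plus checked conversions such as
+ *
+ *	struct bkey_i_foo *f	= bkey_i_to_foo(k);
+ *	struct bkey_s_c_foo f_c	= bkey_s_c_to_foo(k_s_c);
+ *
+ * and bkey_foo_init() for initializing a new key of that type.
+ */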
+
+/* byte order helpers */
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+
+static inline unsigned high_word_offset(const struct bkey_format *f)
+{
+ return f->key_u64s - 1;
+}
+
+#define high_bit_offset 0
+#define nth_word(p, n) ((p) - (n))
+
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+
+static inline unsigned high_word_offset(const struct bkey_format *f)
+{
+ return 0;
+}
+
+#define high_bit_offset KEY_PACKED_BITS_START
+#define nth_word(p, n) ((p) + (n))
+
+#else
+#error edit for your odd byteorder.
+#endif
+
+#define high_word(f, k) ((u64 *) (k)->_data + high_word_offset(f))
+#define next_word(p) nth_word(p, 1)
+#define prev_word(p) nth_word(p, -1)
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_bkey_pack_test(void);
+#else
+static inline void bch2_bkey_pack_test(void) {}
+#endif
+
+#define bkey_fields() \
+ x(BKEY_FIELD_INODE, p.inode) \
+ x(BKEY_FIELD_OFFSET, p.offset) \
+ x(BKEY_FIELD_SNAPSHOT, p.snapshot) \
+ x(BKEY_FIELD_SIZE, size) \
+ x(BKEY_FIELD_VERSION_HI, version.hi) \
+ x(BKEY_FIELD_VERSION_LO, version.lo)
+
+struct bkey_format_state {
+ u64 field_min[BKEY_NR_FIELDS];
+ u64 field_max[BKEY_NR_FIELDS];
+};
+
+void bch2_bkey_format_init(struct bkey_format_state *);
+
+static inline void __bkey_format_add(struct bkey_format_state *s, unsigned field, u64 v)
+{
+ s->field_min[field] = min(s->field_min[field], v);
+ s->field_max[field] = max(s->field_max[field], v);
+}
+
+/*
+ * Updates the format state @s so that @k can be successfully packed by the
+ * format that bch2_bkey_format_done() will eventually produce
+ */
+static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k)
+{
+#define x(id, field) __bkey_format_add(s, id, k->field);
+ bkey_fields()
+#undef x
+}
+
+void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
+int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *);
+
+#endif /* _BCACHEFS_BKEY_H */
diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h
new file mode 100644
index 000000000000..a30c4ae8eb36
--- /dev/null
+++ b/fs/bcachefs/bkey_buf.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_BUF_H
+#define _BCACHEFS_BKEY_BUF_H
+
+#include "bcachefs.h"
+#include "bkey.h"
+
+struct bkey_buf {
+ struct bkey_i *k;
+ u64 onstack[12];
+};
+
+static inline void bch2_bkey_buf_realloc(struct bkey_buf *s,
+ struct bch_fs *c, unsigned u64s)
+{
+ if (s->k == (void *) s->onstack &&
+ u64s > ARRAY_SIZE(s->onstack)) {
+ s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
+ memcpy(s->k, s->onstack, sizeof(s->onstack));
+ }
+}
+
+static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ bch2_bkey_buf_realloc(s, c, k.k->u64s);
+ bkey_reassemble(s->k, k);
+}
+
+static inline void bch2_bkey_buf_copy(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct bkey_i *src)
+{
+ bch2_bkey_buf_realloc(s, c, src->k.u64s);
+ bkey_copy(s->k, src);
+}
+
+static inline void bch2_bkey_buf_unpack(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct btree *b,
+ struct bkey_packed *src)
+{
+ bch2_bkey_buf_realloc(s, c, BKEY_U64s +
+ bkeyp_val_u64s(&b->format, src));
+ bch2_bkey_unpack(b, s->k, src);
+}
+
+static inline void bch2_bkey_buf_init(struct bkey_buf *s)
+{
+ s->k = (void *) s->onstack;
+}
+
+static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c)
+{
+ if (s->k != (void *) s->onstack)
+ mempool_free(s->k, &c->large_bkey_pool);
+ s->k = NULL;
+}
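+
+/*
+ * Informal usage sketch (callers are not part of this file): a bkey_buf is
+ * typically used as
+ *
+ *	struct bkey_buf tmp;
+ *
+ *	bch2_bkey_buf_init(&tmp);
+ *	bch2_bkey_buf_reassemble(&tmp, c, k);
+ *	... use tmp.k ...
+ *	bch2_bkey_buf_exit(&tmp, c);
+ *
+ * so keys of up to 12 u64s stay on the stack and larger ones fall back to the
+ * filesystem's large_bkey_pool mempool.
+ */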
+
+#endif /* _BCACHEFS_BKEY_BUF_H */
diff --git a/fs/bcachefs/bkey_cmp.h b/fs/bcachefs/bkey_cmp.h
new file mode 100644
index 000000000000..5f42a6e69360
--- /dev/null
+++ b/fs/bcachefs/bkey_cmp.h
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_CMP_H
+#define _BCACHEFS_BKEY_CMP_H
+
+#include "bkey.h"
+
+#ifdef CONFIG_X86_64
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+ unsigned nr_key_bits)
+{
+ long d0, d1, d2, d3;
+ int cmp;
+
+	/* we shouldn't need asm for this, but gcc generates poor code for it: */
+
+ asm(".intel_syntax noprefix;"
+ "xor eax, eax;"
+ "xor edx, edx;"
+ "1:;"
+ "mov r8, [rdi];"
+ "mov r9, [rsi];"
+ "sub ecx, 64;"
+ "jl 2f;"
+
+ "cmp r8, r9;"
+ "jnz 3f;"
+
+ "lea rdi, [rdi - 8];"
+ "lea rsi, [rsi - 8];"
+ "jmp 1b;"
+
+ "2:;"
+ "not ecx;"
+ "shr r8, 1;"
+ "shr r9, 1;"
+ "shr r8, cl;"
+ "shr r9, cl;"
+ "cmp r8, r9;"
+
+ "3:\n"
+ "seta al;"
+ "setb dl;"
+ "sub eax, edx;"
+ ".att_syntax prefix;"
+ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
+ : "0" (l), "1" (r), "3" (nr_key_bits)
+ : "r8", "r9", "cc", "memory");
+
+ return cmp;
+}
+#else
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
+ unsigned nr_key_bits)
+{
+ u64 l_v, r_v;
+
+ if (!nr_key_bits)
+ return 0;
+
+ /* for big endian, skip past header */
+ nr_key_bits += high_bit_offset;
+ l_v = *l & (~0ULL >> high_bit_offset);
+ r_v = *r & (~0ULL >> high_bit_offset);
+
+ while (1) {
+ if (nr_key_bits < 64) {
+ l_v >>= 64 - nr_key_bits;
+ r_v >>= 64 - nr_key_bits;
+ nr_key_bits = 0;
+ } else {
+ nr_key_bits -= 64;
+ }
+
+ if (!nr_key_bits || l_v != r_v)
+ break;
+
+ l = next_word(l);
+ r = next_word(r);
+
+ l_v = *l;
+ r_v = *r;
+ }
+
+ return cmp_int(l_v, r_v);
+}
+#endif
+
+static inline __pure __flatten
+int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l,
+ const struct bkey_packed *r,
+ const struct btree *b)
+{
+ const struct bkey_format *f = &b->format;
+ int ret;
+
+ EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
+
+ ret = __bkey_cmp_bits(high_word(f, l),
+ high_word(f, r),
+ b->nr_key_bits);
+
+ EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
+ bkey_unpack_pos(b, r)));
+ return ret;
+}
+
+static inline __pure __flatten
+int bch2_bkey_cmp_packed_inlined(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
+{
+ struct bkey unpacked;
+
+ if (likely(bkey_packed(l) && bkey_packed(r)))
+ return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b);
+
+ if (bkey_packed(l)) {
+ __bkey_unpack_key_format_checked(b, &unpacked, l);
+ l = (void *) &unpacked;
+ } else if (bkey_packed(r)) {
+ __bkey_unpack_key_format_checked(b, &unpacked, r);
+ r = (void *) &unpacked;
+ }
+
+ return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
+}
+
+#endif /* _BCACHEFS_BKEY_CMP_H */
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
new file mode 100644
index 000000000000..761f5e33b1e6
--- /dev/null
+++ b/fs/bcachefs/bkey_methods.c
@@ -0,0 +1,459 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "backpointers.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_types.h"
+#include "alloc_background.h"
+#include "dirent.h"
+#include "ec.h"
+#include "error.h"
+#include "extents.h"
+#include "inode.h"
+#include "io_misc.h"
+#include "lru.h"
+#include "quota.h"
+#include "reflink.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "xattr.h"
+
+const char * const bch2_bkey_types[] = {
+#define x(name, nr) #name,
+ BCH_BKEY_TYPES()
+#undef x
+ NULL
+};
+
+static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags, struct printbuf *err)
+{
+ return 0;
+}
+
+#define bch2_bkey_ops_deleted ((struct bkey_ops) { \
+ .key_invalid = deleted_key_invalid, \
+})
+
+#define bch2_bkey_ops_whiteout ((struct bkey_ops) { \
+ .key_invalid = deleted_key_invalid, \
+})
+
+static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags, struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_bytes(k.k), c, err,
+ bkey_val_size_nonzero,
+ "incorrect value size (%zu != 0)",
+ bkey_val_bytes(k.k));
+fsck_err:
+ return ret;
+}
+
+#define bch2_bkey_ops_error ((struct bkey_ops) { \
+ .key_invalid = empty_val_key_invalid, \
+})
+
+static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags, struct printbuf *err)
+{
+ return 0;
+}
+
+#define bch2_bkey_ops_cookie ((struct bkey_ops) { \
+ .key_invalid = key_type_cookie_invalid, \
+ .min_val_size = 8, \
+})
+
+#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\
+ .key_invalid = empty_val_key_invalid, \
+})
+
+static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags, struct printbuf *err)
+{
+ return 0;
+}
+
+static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
+ unsigned datalen = bkey_inline_data_bytes(k.k);
+
+ prt_printf(out, "datalen %u: %*phN",
+ datalen, min(datalen, 32U), d.v->data);
+}
+
+#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \
+ .key_invalid = key_type_inline_data_invalid, \
+ .val_to_text = key_type_inline_data_to_text, \
+})
+
+static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
+{
+ bch2_key_resize(l.k, l.k->size + r.k->size);
+ return true;
+}
+
+#define bch2_bkey_ops_set ((struct bkey_ops) { \
+ .key_invalid = empty_val_key_invalid, \
+ .key_merge = key_type_set_merge, \
+})
+
+const struct bkey_ops bch2_bkey_ops[] = {
+#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
+ BCH_BKEY_TYPES()
+#undef x
+};
+
+const struct bkey_ops bch2_bkey_null_ops = {
+};
+
+int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, c, err,
+ bkey_val_size_too_small,
+ "bad val size (%zu < %u)",
+ bkey_val_bytes(k.k), ops->min_val_size);
+
+ if (!ops->key_invalid)
+ return 0;
+
+ ret = ops->key_invalid(c, k, flags, err);
+fsck_err:
+ return ret;
+}
+
+static u64 bch2_key_types_allowed[] = {
+ [BKEY_TYPE_btree] =
+ BIT_ULL(KEY_TYPE_deleted)|
+ BIT_ULL(KEY_TYPE_btree_ptr)|
+ BIT_ULL(KEY_TYPE_btree_ptr_v2),
+#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys,
+ BCH_BTREE_IDS()
+#undef x
+};
+
+const char *bch2_btree_node_type_str(enum btree_node_type type)
+{
+ return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1);
+}
+
+int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum btree_node_type type,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err,
+ bkey_u64s_too_small,
+ "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
+
+ if (type >= BKEY_TYPE_NR)
+ return 0;
+
+ bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) &&
+ !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err,
+ bkey_invalid_type_for_btree,
+ "invalid key type for btree %s (%s)",
+ bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]);
+
+ if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
+ bkey_fsck_err_on(k.k->size == 0, c, err,
+ bkey_extent_size_zero,
+ "size == 0");
+
+ bkey_fsck_err_on(k.k->size > k.k->p.offset, c, err,
+ bkey_extent_size_greater_than_offset,
+ "size greater than offset (%u > %llu)",
+ k.k->size, k.k->p.offset);
+ } else {
+ bkey_fsck_err_on(k.k->size, c, err,
+ bkey_size_nonzero,
+ "size != 0");
+ }
+
+ if (type != BKEY_TYPE_btree) {
+ enum btree_id btree = type - 1;
+
+ if (btree_type_has_snapshots(btree)) {
+ bkey_fsck_err_on(!k.k->p.snapshot, c, err,
+ bkey_snapshot_zero,
+ "snapshot == 0");
+ } else if (!btree_type_has_snapshot_field(btree)) {
+ bkey_fsck_err_on(k.k->p.snapshot, c, err,
+ bkey_snapshot_nonzero,
+ "nonzero snapshot");
+ } else {
+ /*
+ * btree uses snapshot field but it's not required to be
+ * nonzero
+ */
+ }
+
+ bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err,
+ bkey_at_pos_max,
+ "key at POS_MAX");
+ }
+fsck_err:
+ return ret;
+}
+
+int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum btree_node_type type,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ return __bch2_bkey_invalid(c, k, type, flags, err) ?:
+ bch2_bkey_val_invalid(c, k, flags, err);
+}
+
+int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b,
+ struct bkey_s_c k, struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), c, err,
+ bkey_before_start_of_btree_node,
+ "key before start of btree node");
+
+ bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), c, err,
+ bkey_after_end_of_btree_node,
+ "key past end of btree node");
+fsck_err:
+ return ret;
+}
+
+void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
+{
+ if (bpos_eq(pos, POS_MIN))
+ prt_printf(out, "POS_MIN");
+ else if (bpos_eq(pos, POS_MAX))
+ prt_printf(out, "POS_MAX");
+ else if (bpos_eq(pos, SPOS_MAX))
+ prt_printf(out, "SPOS_MAX");
+ else {
+ if (pos.inode == U64_MAX)
+ prt_printf(out, "U64_MAX");
+ else
+ prt_printf(out, "%llu", pos.inode);
+ prt_printf(out, ":");
+ if (pos.offset == U64_MAX)
+ prt_printf(out, "U64_MAX");
+ else
+ prt_printf(out, "%llu", pos.offset);
+ prt_printf(out, ":");
+ if (pos.snapshot == U32_MAX)
+ prt_printf(out, "U32_MAX");
+ else
+ prt_printf(out, "%u", pos.snapshot);
+ }
+}
+
+void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
+{
+ if (k) {
+ prt_printf(out, "u64s %u type ", k->u64s);
+
+ if (k->type < KEY_TYPE_MAX)
+ prt_printf(out, "%s ", bch2_bkey_types[k->type]);
+ else
+ prt_printf(out, "%u ", k->type);
+
+ bch2_bpos_to_text(out, k->p);
+
+ prt_printf(out, " len %u ver %llu", k->size, k->version.lo);
+ } else {
+ prt_printf(out, "(null)");
+ }
+}
+
+void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
+
+ if (likely(ops->val_to_text))
+ ops->val_to_text(out, c, k);
+}
+
+void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ bch2_bkey_to_text(out, k.k);
+
+ if (bkey_val_bytes(k.k)) {
+ prt_printf(out, ": ");
+ bch2_val_to_text(out, c, k);
+ }
+}
+
+void bch2_bkey_swab_val(struct bkey_s k)
+{
+ const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
+
+ if (ops->swab)
+ ops->swab(k);
+}
+
+bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k)
+{
+ const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
+
+ return ops->key_normalize
+ ? ops->key_normalize(c, k)
+ : false;
+}
+
+bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
+{
+ const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type);
+
+ return ops->key_merge &&
+ bch2_bkey_maybe_mergable(l.k, r.k) &&
+ (u64) l.k->size + r.k->size <= KEY_SIZE_MAX &&
+ !bch2_key_merging_disabled &&
+ ops->key_merge(c, l, r);
+}
+
+static const struct old_bkey_type {
+ u8 btree_node_type;
+ u8 old;
+ u8 new;
+} bkey_renumber_table[] = {
+ {BKEY_TYPE_btree, 128, KEY_TYPE_btree_ptr },
+ {BKEY_TYPE_extents, 128, KEY_TYPE_extent },
+ {BKEY_TYPE_extents, 129, KEY_TYPE_extent },
+ {BKEY_TYPE_extents, 130, KEY_TYPE_reservation },
+ {BKEY_TYPE_inodes, 128, KEY_TYPE_inode },
+ {BKEY_TYPE_inodes, 130, KEY_TYPE_inode_generation },
+ {BKEY_TYPE_dirents, 128, KEY_TYPE_dirent },
+ {BKEY_TYPE_dirents, 129, KEY_TYPE_hash_whiteout },
+ {BKEY_TYPE_xattrs, 128, KEY_TYPE_xattr },
+ {BKEY_TYPE_xattrs, 129, KEY_TYPE_hash_whiteout },
+ {BKEY_TYPE_alloc, 128, KEY_TYPE_alloc },
+ {BKEY_TYPE_quotas, 128, KEY_TYPE_quota },
+};
+
+void bch2_bkey_renumber(enum btree_node_type btree_node_type,
+ struct bkey_packed *k,
+ int write)
+{
+ const struct old_bkey_type *i;
+
+ for (i = bkey_renumber_table;
+ i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table);
+ i++)
+ if (btree_node_type == i->btree_node_type &&
+ k->type == (write ? i->new : i->old)) {
+ k->type = write ? i->old : i->new;
+ break;
+ }
+}
+
+void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
+ unsigned version, unsigned big_endian,
+ int write,
+ struct bkey_format *f,
+ struct bkey_packed *k)
+{
+ const struct bkey_ops *ops;
+ struct bkey uk;
+ unsigned nr_compat = 5;
+ int i;
+
+ /*
+ * Do these operations in reverse order in the write path:
+ */
+
+ for (i = 0; i < nr_compat; i++)
+ switch (!write ? i : nr_compat - 1 - i) {
+ case 0:
+ if (big_endian != CPU_BIG_ENDIAN)
+ bch2_bkey_swab_key(f, k);
+ break;
+ case 1:
+ if (version < bcachefs_metadata_version_bkey_renumber)
+ bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write);
+ break;
+ case 2:
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id == BTREE_ID_inodes) {
+ if (!bkey_packed(k)) {
+ struct bkey_i *u = packed_to_bkey(k);
+
+ swap(u->k.p.inode, u->k.p.offset);
+ } else if (f->bits_per_field[BKEY_FIELD_INODE] &&
+ f->bits_per_field[BKEY_FIELD_OFFSET]) {
+ struct bkey_format tmp = *f, *in = f, *out = &tmp;
+
+ swap(tmp.bits_per_field[BKEY_FIELD_INODE],
+ tmp.bits_per_field[BKEY_FIELD_OFFSET]);
+ swap(tmp.field_offset[BKEY_FIELD_INODE],
+ tmp.field_offset[BKEY_FIELD_OFFSET]);
+
+ if (!write)
+ swap(in, out);
+
+ uk = __bch2_bkey_unpack_key(in, k);
+ swap(uk.p.inode, uk.p.offset);
+ BUG_ON(!bch2_bkey_pack_key(k, &uk, out));
+ }
+ }
+ break;
+ case 3:
+ if (version < bcachefs_metadata_version_snapshot &&
+ (level || btree_type_has_snapshots(btree_id))) {
+ struct bkey_i *u = packed_to_bkey(k);
+
+ if (u) {
+ u->k.p.snapshot = write
+ ? 0 : U32_MAX;
+ } else {
+ u64 min_packed = le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]);
+ u64 max_packed = min_packed +
+ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
+
+ uk = __bch2_bkey_unpack_key(f, k);
+ uk.p.snapshot = write
+ ? min_packed : min_t(u64, U32_MAX, max_packed);
+
+ BUG_ON(!bch2_bkey_pack_key(k, &uk, f));
+ }
+ }
+
+ break;
+ case 4: {
+ struct bkey_s u;
+
+ if (!bkey_packed(k)) {
+ u = bkey_i_to_s(packed_to_bkey(k));
+ } else {
+ uk = __bch2_bkey_unpack_key(f, k);
+ u.k = &uk;
+ u.v = bkeyp_val(f, k);
+ }
+
+ if (big_endian != CPU_BIG_ENDIAN)
+ bch2_bkey_swab_val(u);
+
+ ops = bch2_bkey_type_ops(k->type);
+
+ if (ops->compat)
+ ops->compat(btree_id, version, big_endian, write, u);
+ break;
+ }
+ default:
+ BUG();
+ }
+}
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
new file mode 100644
index 000000000000..3a370b7087ac
--- /dev/null
+++ b/fs/bcachefs/bkey_methods.h
@@ -0,0 +1,179 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_METHODS_H
+#define _BCACHEFS_BKEY_METHODS_H
+
+#include "bkey.h"
+
+struct bch_fs;
+struct btree;
+struct btree_trans;
+struct bkey;
+enum btree_node_type;
+
+extern const char * const bch2_bkey_types[];
+extern const struct bkey_ops bch2_bkey_null_ops;
+
+/*
+ * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If
+ * invalid, entire key will be deleted.
+ *
+ * When invalid, error string is returned via @err. @flags indicates whether the
+ * key is being read, committed or journalled; some checks are only performed
+ * when BKEY_INVALID_COMMIT is set.
+ */
+struct bkey_ops {
+ int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags, struct printbuf *err);
+ void (*val_to_text)(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+ void (*swab)(struct bkey_s);
+ bool (*key_normalize)(struct bch_fs *, struct bkey_s);
+ bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+ int (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *, unsigned);
+ int (*atomic_trigger)(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+ void (*compat)(enum btree_id id, unsigned version,
+ unsigned big_endian, int write,
+ struct bkey_s);
+
+ /* Size of value type when first created: */
+ unsigned min_val_size;
+};
+
+extern const struct bkey_ops bch2_bkey_ops[];
+
+static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type)
+{
+ return likely(type < KEY_TYPE_MAX)
+ ? &bch2_bkey_ops[type]
+ : &bch2_bkey_null_ops;
+}
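The bkey_ops vtable above is selected purely by key type, with bch2_bkey_null_ops as a fallback for types this kernel doesn't know about, so callers can dispatch without checking the type first. A rough sketch of that dispatch pattern in isolation - the key types and handler below are hypothetical, not actual bcachefs types:

/* Illustration only: hypothetical key types and handlers. */
#include <stdio.h>

enum example_key_type { KEY_A, KEY_B, KEY_NR };

struct example_ops {
	void (*to_text)(unsigned type);
};

static void a_to_text(unsigned type)
{
	printf("key type %u: A\n", type);
}

static const struct example_ops example_ops[KEY_NR] = {
	[KEY_A] = { .to_text = a_to_text },
	/* KEY_B left zeroed: no handlers */
};

static const struct example_ops example_null_ops;	/* all callbacks NULL */

static const struct example_ops *type_ops(unsigned type)
{
	return type < KEY_NR ? &example_ops[type] : &example_null_ops;
}

int main(void)
{
	for (unsigned t = 0; t <= KEY_NR; t++) {
		const struct example_ops *ops = type_ops(t);

		if (ops->to_text)
			ops->to_text(t);
		else
			printf("key type %u: no handler\n", t);
	}
	return 0;
}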
+
+int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
+ enum bkey_invalid_flags, struct printbuf *);
+int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
+ enum bkey_invalid_flags, struct printbuf *);
+int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *,
+ struct bkey_s_c, struct printbuf *);
+
+void bch2_bpos_to_text(struct printbuf *, struct bpos);
+void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
+void bch2_val_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+
+void bch2_bkey_swab_val(struct bkey_s);
+
+bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
+
+static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r)
+{
+ return l->type == r->type &&
+ !bversion_cmp(l->version, r->version) &&
+ bpos_eq(l->p, bkey_start_pos(r));
+}
+
+bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+
+static inline int bch2_mark_key(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type);
+
+ return ops->atomic_trigger
+ ? ops->atomic_trigger(trans, btree, level, old, new, flags)
+ : 0;
+}
+
+enum btree_update_flags {
+ __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END,
+ __BTREE_UPDATE_NOJOURNAL,
+ __BTREE_UPDATE_PREJOURNAL,
+ __BTREE_UPDATE_KEY_CACHE_RECLAIM,
+
+ __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
+
+ __BTREE_TRIGGER_INSERT,
+ __BTREE_TRIGGER_OVERWRITE,
+
+ __BTREE_TRIGGER_GC,
+ __BTREE_TRIGGER_BUCKET_INVALIDATE,
+ __BTREE_TRIGGER_NOATOMIC,
+};
+
+#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
+#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL)
+#define BTREE_UPDATE_PREJOURNAL (1U << __BTREE_UPDATE_PREJOURNAL)
+#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
+
+#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
+
+#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
+#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
+
+#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
+#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
+#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
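These definitions follow the common kernel idiom of an enum of bit positions paired with shifted mask macros; because the enum continues from __BTREE_ITER_FLAGS_END, iterator, update and trigger flags can all share a single flags word. A small generic sketch of the idiom, with illustrative names rather than the bcachefs flags:

#include <stdio.h>

enum { __FLAG_INSERT, __FLAG_OVERWRITE, __FLAG_GC };

#define FLAG_INSERT	(1U << __FLAG_INSERT)
#define FLAG_OVERWRITE	(1U << __FLAG_OVERWRITE)
#define FLAG_GC		(1U << __FLAG_GC)

static void run(unsigned flags)
{
	if (flags & FLAG_INSERT)
		printf("insert\n");
	if (flags & FLAG_OVERWRITE)
		printf("overwrite\n");
	if (flags & FLAG_GC)
		printf("gc\n");
}

int main(void)
{
	run(FLAG_INSERT | FLAG_GC);	/* prints "insert" then "gc" */
	return 0;
}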
+
+static inline int bch2_trans_mark_key(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new->k.type);
+
+ return ops->trans_trigger
+ ? ops->trans_trigger(trans, btree_id, level, old, new, flags)
+ : 0;
+}
+
+static inline int bch2_trans_mark_old(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, unsigned flags)
+{
+ struct bkey_i deleted;
+
+ bkey_init(&deleted.k);
+ deleted.k.p = old.k->p;
+
+ return bch2_trans_mark_key(trans, btree_id, level, old, &deleted,
+ BTREE_TRIGGER_OVERWRITE|flags);
+}
+
+static inline int bch2_trans_mark_new(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_i *new, unsigned flags)
+{
+ struct bkey_i deleted;
+
+ bkey_init(&deleted.k);
+ deleted.k.p = new->k.p;
+
+ return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
+ BTREE_TRIGGER_INSERT|flags);
+}
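bch2_trans_mark_old() and bch2_trans_mark_new() reuse the two-key (old, new) trigger interface for single-key events by pairing the real key with a synthesized deleted key at the same position, so one trigger implementation sees inserts and overwrites uniformly. A standalone sketch of that idea with simplified stand-in types (not the real trigger machinery):

#include <stdio.h>
#include <stdbool.h>

struct key { unsigned pos; bool deleted; };

/* one callback sees every transition as old -> new */
static void trigger(struct key old, struct key new)
{
	if (old.deleted && !new.deleted)
		printf("insert at %u\n", new.pos);
	else if (!old.deleted && new.deleted)
		printf("overwrite at %u\n", old.pos);
}

static void mark_new(struct key new)
{
	struct key deleted = { .pos = new.pos, .deleted = true };

	trigger(deleted, new);
}

static void mark_old(struct key old)
{
	struct key deleted = { .pos = old.pos, .deleted = true };

	trigger(old, deleted);
}

int main(void)
{
	mark_new((struct key) { .pos = 7 });	/* "insert at 7" */
	mark_old((struct key) { .pos = 7 });	/* "overwrite at 7" */
	return 0;
}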
+
+void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
+
+void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned,
+ int, struct bkey_format *, struct bkey_packed *);
+
+static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id,
+ unsigned version, unsigned big_endian,
+ int write,
+ struct bkey_format *f,
+ struct bkey_packed *k)
+{
+ if (version < bcachefs_metadata_version_current ||
+ big_endian != CPU_BIG_ENDIAN)
+ __bch2_bkey_compat(level, btree_id, version,
+ big_endian, write, f, k);
+}
+
+#endif /* _BCACHEFS_BKEY_METHODS_H */
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
new file mode 100644
index 000000000000..bcca9e76a0b4
--- /dev/null
+++ b/fs/bcachefs/bkey_sort.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "bkey_cmp.h"
+#include "bkey_sort.h"
+#include "bset.h"
+#include "extents.h"
+
+typedef int (*sort_cmp_fn)(struct btree *,
+ struct bkey_packed *,
+ struct bkey_packed *);
+
+static inline bool sort_iter_end(struct sort_iter *iter)
+{
+ return !iter->used;
+}
+
+static inline void sort_iter_sift(struct sort_iter *iter, unsigned from,
+ sort_cmp_fn cmp)
+{
+ unsigned i;
+
+ for (i = from;
+ i + 1 < iter->used &&
+ cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
+ i++)
+ swap(iter->data[i], iter->data[i + 1]);
+}
+
+static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+ unsigned i = iter->used;
+
+ while (i--)
+ sort_iter_sift(iter, i, cmp);
+}
+
+static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
+{
+ return !sort_iter_end(iter) ? iter->data->k : NULL;
+}
+
+static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
+{
+ struct sort_iter_set *i = iter->data;
+
+ BUG_ON(!iter->used);
+
+ i->k = bkey_p_next(i->k);
+
+ BUG_ON(i->k > i->end);
+
+ if (i->k == i->end)
+ array_remove_item(iter->data, iter->used, 0);
+ else
+ sort_iter_sift(iter, 0, cmp);
+}
+
+static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
+ sort_cmp_fn cmp)
+{
+ struct bkey_packed *ret = sort_iter_peek(iter);
+
+ if (ret)
+ sort_iter_advance(iter, cmp);
+
+ return ret;
+}
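sort_iter merges several already-sorted key ranges by keeping its array of source positions ordered by current key and re-sifting the front entry after each advance, instead of maintaining a heap - cheap for the handful of bsets a btree node can have. A self-contained sketch of the same sift-based k-way merge over plain ints (illustrative only):

#include <stdio.h>

struct src { const int *k, *end; };

static void sift(struct src *s, unsigned used, unsigned from)
{
	for (unsigned i = from; i + 1 < used && *s[i].k > *s[i + 1].k; i++) {
		struct src tmp = s[i];

		s[i] = s[i + 1];
		s[i + 1] = tmp;
	}
}

int main(void)
{
	int a[] = { 1, 4, 9 }, b[] = { 2, 3, 10 }, c[] = { 5 };
	struct src s[] = { { a, a + 3 }, { b, b + 3 }, { c, c + 1 } };
	unsigned used = 3;

	/* initial sort: sift each source into place, back to front */
	for (unsigned i = used; i--; )
		sift(s, used, i);

	while (used) {
		printf("%d ", *s[0].k++);

		if (s[0].k == s[0].end) {
			/* drop the exhausted source, preserving order */
			for (unsigned i = 1; i < used; i++)
				s[i - 1] = s[i];
			used--;
		} else {
			sift(s, used, 0);
		}
	}
	printf("\n");	/* 1 2 3 4 5 9 10 */
	return 0;
}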
+
+/*
+ * If keys compare equal, compare by pointer order:
+ */
+static inline int key_sort_fix_overlapping_cmp(struct btree *b,
+ struct bkey_packed *l,
+ struct bkey_packed *r)
+{
+ return bch2_bkey_cmp_packed(b, l, r) ?:
+ cmp_int((unsigned long) l, (unsigned long) r);
+}
+
+static inline bool should_drop_next_key(struct sort_iter *iter)
+{
+ /*
+	 * key_sort_fix_overlapping_cmp() ensures that when keys compare equal
+	 * the older key comes first; so if l->k compares equal to r->k then
+	 * l->k is older and should be dropped.
+ */
+ return iter->used >= 2 &&
+ !bch2_bkey_cmp_packed(iter->b,
+ iter->data[0].k,
+ iter->data[1].k);
+}
+
+struct btree_nr_keys
+bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
+ struct sort_iter *iter)
+{
+ struct bkey_packed *out = dst->start;
+ struct bkey_packed *k;
+ struct btree_nr_keys nr;
+
+ memset(&nr, 0, sizeof(nr));
+
+ sort_iter_sort(iter, key_sort_fix_overlapping_cmp);
+
+ while ((k = sort_iter_peek(iter))) {
+ if (!bkey_deleted(k) &&
+ !should_drop_next_key(iter)) {
+ bkey_p_copy(out, k);
+ btree_keys_account_key_add(&nr, 0, out);
+ out = bkey_p_next(out);
+ }
+
+ sort_iter_advance(iter, key_sort_fix_overlapping_cmp);
+ }
+
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+ return nr;
+}
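When the same key exists in more than one bset, the merge above keeps only the newest copy: the comparator breaks ties by address so the older copy sorts first, and should_drop_next_key() discards it. A tiny sketch of that rule over a sorted (key, age) stream, using made-up data:

#include <stdio.h>

struct ent { int key, age; };	/* sorted by key; within a key, older first */

int main(void)
{
	struct ent in[] = {
		{ 1, 0 }, { 2, 0 }, { 2, 1 }, { 2, 2 }, { 3, 0 },
	};
	unsigned n = sizeof(in) / sizeof(in[0]);

	for (unsigned i = 0; i < n; i++) {
		/* drop this copy if the next entry has the same key (it's newer) */
		if (i + 1 < n && in[i + 1].key == in[i].key)
			continue;

		printf("key %d age %d\n", in[i].key, in[i].age);
	}
	return 0;	/* keeps (1,0), (2,2), (3,0) */
}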
+
+/* Sort + repack in a new format: */
+struct btree_nr_keys
+bch2_sort_repack(struct bset *dst, struct btree *src,
+ struct btree_node_iter *src_iter,
+ struct bkey_format *out_f,
+ bool filter_whiteouts)
+{
+ struct bkey_format *in_f = &src->format;
+ struct bkey_packed *in, *out = vstruct_last(dst);
+ struct btree_nr_keys nr;
+ bool transform = memcmp(out_f, &src->format, sizeof(*out_f));
+
+ memset(&nr, 0, sizeof(nr));
+
+ while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
+ if (filter_whiteouts && bkey_deleted(in))
+ continue;
+
+ if (!transform)
+ bkey_p_copy(out, in);
+ else if (bch2_bkey_transform(out_f, out, bkey_packed(in)
+ ? in_f : &bch2_bkey_format_current, in))
+ out->format = KEY_FORMAT_LOCAL_BTREE;
+ else
+ bch2_bkey_unpack(src, (void *) out, in);
+
+ out->needs_whiteout = false;
+
+ btree_keys_account_key_add(&nr, 0, out);
+ out = bkey_p_next(out);
+ }
+
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
+ return nr;
+}
+
+static inline int sort_keys_cmp(struct btree *b,
+ struct bkey_packed *l,
+ struct bkey_packed *r)
+{
+ return bch2_bkey_cmp_packed_inlined(b, l, r) ?:
+ (int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
+ (int) l->needs_whiteout - (int) r->needs_whiteout;
+}
+
+unsigned bch2_sort_keys(struct bkey_packed *dst,
+ struct sort_iter *iter,
+ bool filter_whiteouts)
+{
+ const struct bkey_format *f = &iter->b->format;
+ struct bkey_packed *in, *next, *out = dst;
+
+ sort_iter_sort(iter, sort_keys_cmp);
+
+ while ((in = sort_iter_next(iter, sort_keys_cmp))) {
+ bool needs_whiteout = false;
+
+ if (bkey_deleted(in) &&
+ (filter_whiteouts || !in->needs_whiteout))
+ continue;
+
+ while ((next = sort_iter_peek(iter)) &&
+ !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) {
+ BUG_ON(in->needs_whiteout &&
+ next->needs_whiteout);
+ needs_whiteout |= in->needs_whiteout;
+ in = sort_iter_next(iter, sort_keys_cmp);
+ }
+
+ if (bkey_deleted(in)) {
+ memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in));
+ set_bkeyp_val_u64s(f, out, 0);
+ } else {
+ bkey_p_copy(out, in);
+ }
+ out->needs_whiteout |= needs_whiteout;
+ out = bkey_p_next(out);
+ }
+
+ return (u64 *) out - (u64 *) dst;
+}
diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h
new file mode 100644
index 000000000000..7c0f0b160f18
--- /dev/null
+++ b/fs/bcachefs/bkey_sort.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_SORT_H
+#define _BCACHEFS_BKEY_SORT_H
+
+struct sort_iter {
+ struct btree *b;
+ unsigned used;
+ unsigned size;
+
+ struct sort_iter_set {
+ struct bkey_packed *k, *end;
+ } data[];
+};
+
+static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size)
+{
+ iter->b = b;
+ iter->used = 0;
+ iter->size = size;
+}
+
+struct sort_iter_stack {
+ struct sort_iter iter;
+ struct sort_iter_set sets[MAX_BSETS + 1];
+};
+
+static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b)
+{
+ sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets));
+}
+
+static inline void sort_iter_add(struct sort_iter *iter,
+ struct bkey_packed *k,
+ struct bkey_packed *end)
+{
+ BUG_ON(iter->used >= iter->size);
+
+ if (k != end)
+ iter->data[iter->used++] = (struct sort_iter_set) { k, end };
+}
+
+struct btree_nr_keys
+bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *,
+ struct sort_iter *);
+
+struct btree_nr_keys
+bch2_sort_repack(struct bset *, struct btree *,
+ struct btree_node_iter *,
+ struct bkey_format *, bool);
+
+unsigned bch2_sort_keys(struct bkey_packed *,
+ struct sort_iter *, bool);
+
+#endif /* _BCACHEFS_BKEY_SORT_H */
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
new file mode 100644
index 000000000000..bb73ba9017b0
--- /dev/null
+++ b/fs/bcachefs/bset.c
@@ -0,0 +1,1592 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Code for working with individual keys, and sorted sets of keys with in a
+ * btree node
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "bset.h"
+#include "eytzinger.h"
+#include "trace.h"
+#include "util.h"
+
+#include <asm/unaligned.h>
+#include <linux/console.h>
+#include <linux/random.h>
+#include <linux/prefetch.h>
+
+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
+ struct btree *);
+
+static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
+{
+ unsigned n = ARRAY_SIZE(iter->data);
+
+ while (n && __btree_node_iter_set_end(iter, n - 1))
+ --n;
+
+ return n;
+}
+
+struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
+{
+ return bch2_bkey_to_bset_inlined(b, k);
+}
+
+/*
+ * There are never duplicate live keys in the btree - but once we include keys
+ * that have been flagged as deleted (and will be cleaned up later) we _will_
+ * see duplicates.
+ *
+ * Thus the sort order is: usual key comparison first, but for keys that compare
+ * equal the deleted key(s) come first, and the (at most one) live version comes
+ * last.
+ *
+ * The main reason for this is insertion: to handle overwrites, we first iterate
+ * over keys that compare equal to our insert key, and then insert immediately
+ * prior to the first key greater than the key we're inserting - our insert
+ * position will be after all keys that compare equal to our insert key, which
+ * by the time we actually do the insert will all be deleted.
+ */
+
+void bch2_dump_bset(struct bch_fs *c, struct btree *b,
+ struct bset *i, unsigned set)
+{
+ struct bkey_packed *_k, *_n;
+ struct bkey uk, n;
+ struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+
+ if (!i->u64s)
+ return;
+
+ for (_k = i->start;
+ _k < vstruct_last(i);
+ _k = _n) {
+ _n = bkey_p_next(_k);
+
+ k = bkey_disassemble(b, _k, &uk);
+
+ printbuf_reset(&buf);
+ if (c)
+ bch2_bkey_val_to_text(&buf, c, k);
+ else
+ bch2_bkey_to_text(&buf, k.k);
+ printk(KERN_ERR "block %u key %5zu: %s\n", set,
+ _k->_data - i->_data, buf.buf);
+
+ if (_n == vstruct_last(i))
+ continue;
+
+ n = bkey_unpack_key(b, _n);
+
+ if (bpos_lt(n.p, k.k->p)) {
+ printk(KERN_ERR "Key skipped backwards\n");
+ continue;
+ }
+
+ if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p))
+ printk(KERN_ERR "Duplicate keys\n");
+ }
+
+ printbuf_exit(&buf);
+}
+
+void bch2_dump_btree_node(struct bch_fs *c, struct btree *b)
+{
+ struct bset_tree *t;
+
+ console_lock();
+ for_each_bset(b, t)
+ bch2_dump_bset(c, b, bset(b, t), t - b->set);
+ console_unlock();
+}
+
+void bch2_dump_btree_node_iter(struct btree *b,
+ struct btree_node_iter *iter)
+{
+ struct btree_node_iter_set *set;
+ struct printbuf buf = PRINTBUF;
+
+ printk(KERN_ERR "btree node iter with %u/%u sets:\n",
+ __btree_node_iter_used(iter), b->nsets);
+
+ btree_node_iter_for_each(iter, set) {
+ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
+ struct bset_tree *t = bch2_bkey_to_bset(b, k);
+ struct bkey uk = bkey_unpack_key(b, k);
+
+ printbuf_reset(&buf);
+ bch2_bkey_to_text(&buf, &uk);
+ printk(KERN_ERR "set %zu key %u: %s\n",
+ t - b->set, set->k, buf.buf);
+ }
+
+ printbuf_exit(&buf);
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void __bch2_verify_btree_nr_keys(struct btree *b)
+{
+ struct bset_tree *t;
+ struct bkey_packed *k;
+ struct btree_nr_keys nr = { 0 };
+
+ for_each_bset(b, t)
+ bset_tree_for_each_key(b, t, k)
+ if (!bkey_deleted(k))
+ btree_keys_account_key_add(&nr, t - b->set, k);
+
+ BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
+}
+
+static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
+ struct btree *b)
+{
+ struct btree_node_iter iter = *_iter;
+ const struct bkey_packed *k, *n;
+
+ k = bch2_btree_node_iter_peek_all(&iter, b);
+ __bch2_btree_node_iter_advance(&iter, b);
+ n = bch2_btree_node_iter_peek_all(&iter, b);
+
+ bkey_unpack_key(b, k);
+
+ if (n &&
+ bkey_iter_cmp(b, k, n) > 0) {
+ struct btree_node_iter_set *set;
+ struct bkey ku = bkey_unpack_key(b, k);
+ struct bkey nu = bkey_unpack_key(b, n);
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+
+ bch2_dump_btree_node(NULL, b);
+ bch2_bkey_to_text(&buf1, &ku);
+ bch2_bkey_to_text(&buf2, &nu);
+ printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
+ buf1.buf, buf2.buf);
+ printk(KERN_ERR "iter was:");
+
+ btree_node_iter_for_each(_iter, set) {
+ struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k);
+ struct bset_tree *t = bch2_bkey_to_bset(b, k2);
+ printk(" [%zi %zi]", t - b->set,
+ k2->_data - bset(b, t)->_data);
+ }
+ panic("\n");
+ }
+}
+
+void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ struct btree_node_iter_set *set, *s2;
+ struct bkey_packed *k, *p;
+ struct bset_tree *t;
+
+ if (bch2_btree_node_iter_end(iter))
+ return;
+
+ /* Verify no duplicates: */
+ btree_node_iter_for_each(iter, set) {
+ BUG_ON(set->k > set->end);
+ btree_node_iter_for_each(iter, s2)
+ BUG_ON(set != s2 && set->end == s2->end);
+ }
+
+ /* Verify that set->end is correct: */
+ btree_node_iter_for_each(iter, set) {
+ for_each_bset(b, t)
+ if (set->end == t->end_offset)
+ goto found;
+ BUG();
+found:
+ BUG_ON(set->k < btree_bkey_first_offset(t) ||
+ set->k >= t->end_offset);
+ }
+
+ /* Verify iterator is sorted: */
+ btree_node_iter_for_each(iter, set)
+ BUG_ON(set != iter->data &&
+ btree_node_iter_cmp(b, set[-1], set[0]) > 0);
+
+ k = bch2_btree_node_iter_peek_all(iter, b);
+
+ for_each_bset(b, t) {
+ if (iter->data[0].end == t->end_offset)
+ continue;
+
+ p = bch2_bkey_prev_all(b, t,
+ bch2_btree_node_iter_bset_pos(iter, b, t));
+
+ BUG_ON(p && bkey_iter_cmp(b, k, p) < 0);
+ }
+}
+
+void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
+ struct bkey_packed *insert, unsigned clobber_u64s)
+{
+ struct bset_tree *t = bch2_bkey_to_bset(b, where);
+ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
+ struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s);
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+#if 0
+ BUG_ON(prev &&
+ bkey_iter_cmp(b, prev, insert) > 0);
+#else
+ if (prev &&
+ bkey_iter_cmp(b, prev, insert) > 0) {
+ struct bkey k1 = bkey_unpack_key(b, prev);
+ struct bkey k2 = bkey_unpack_key(b, insert);
+
+ bch2_dump_btree_node(NULL, b);
+ bch2_bkey_to_text(&buf1, &k1);
+ bch2_bkey_to_text(&buf2, &k2);
+
+ panic("prev > insert:\n"
+ "prev key %s\n"
+ "insert key %s\n",
+ buf1.buf, buf2.buf);
+ }
+#endif
+#if 0
+ BUG_ON(next != btree_bkey_last(b, t) &&
+ bkey_iter_cmp(b, insert, next) > 0);
+#else
+ if (next != btree_bkey_last(b, t) &&
+ bkey_iter_cmp(b, insert, next) > 0) {
+ struct bkey k1 = bkey_unpack_key(b, insert);
+ struct bkey k2 = bkey_unpack_key(b, next);
+
+ bch2_dump_btree_node(NULL, b);
+ bch2_bkey_to_text(&buf1, &k1);
+ bch2_bkey_to_text(&buf2, &k2);
+
+ panic("insert > next:\n"
+ "insert key %s\n"
+ "next key %s\n",
+ buf1.buf, buf2.buf);
+ }
+#endif
+}
+
+#else
+
+static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
+ struct btree *b) {}
+
+#endif
+
+/* Auxiliary search trees */
+
+#define BFLOAT_FAILED_UNPACKED U8_MAX
+#define BFLOAT_FAILED U8_MAX
+
+struct bkey_float {
+ u8 exponent;
+ u8 key_offset;
+ u16 mantissa;
+};
+#define BKEY_MANTISSA_BITS 16
+
+static unsigned bkey_float_byte_offset(unsigned idx)
+{
+ return idx * sizeof(struct bkey_float);
+}
+
+struct ro_aux_tree {
+ u8 nothing[0];
+ struct bkey_float f[];
+};
+
+struct rw_aux_tree {
+ u16 offset;
+ struct bpos k;
+};
+
+static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
+{
+ BUG_ON(t->aux_data_offset == U16_MAX);
+
+ switch (bset_aux_tree_type(t)) {
+ case BSET_NO_AUX_TREE:
+ return t->aux_data_offset;
+ case BSET_RO_AUX_TREE:
+ return t->aux_data_offset +
+ DIV_ROUND_UP(t->size * sizeof(struct bkey_float) +
+ t->size * sizeof(u8), 8);
+ case BSET_RW_AUX_TREE:
+ return t->aux_data_offset +
+ DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
+ default:
+ BUG();
+ }
+}
+
+static unsigned bset_aux_tree_buf_start(const struct btree *b,
+ const struct bset_tree *t)
+{
+ return t == b->set
+ ? DIV_ROUND_UP(b->unpack_fn_len, 8)
+ : bset_aux_tree_buf_end(t - 1);
+}
+
+static void *__aux_tree_base(const struct btree *b,
+ const struct bset_tree *t)
+{
+ return b->aux_data + t->aux_data_offset * 8;
+}
+
+static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
+ const struct bset_tree *t)
+{
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
+
+ return __aux_tree_base(b, t);
+}
+
+static u8 *ro_aux_tree_prev(const struct btree *b,
+ const struct bset_tree *t)
+{
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
+
+ return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
+}
+
+static struct bkey_float *bkey_float(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned idx)
+{
+ return ro_aux_tree_base(b, t)->f + idx;
+}
+
+static void bset_aux_tree_verify(const struct btree *b)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ const struct bset_tree *t;
+
+ for_each_bset(b, t) {
+ if (t->aux_data_offset == U16_MAX)
+ continue;
+
+ BUG_ON(t != b->set &&
+ t[-1].aux_data_offset == U16_MAX);
+
+ BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
+ BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
+ BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
+ }
+#endif
+}
+
+void bch2_btree_keys_init(struct btree *b)
+{
+ unsigned i;
+
+ b->nsets = 0;
+ memset(&b->nr, 0, sizeof(b->nr));
+
+ for (i = 0; i < MAX_BSETS; i++)
+ b->set[i].data_offset = U16_MAX;
+
+ bch2_bset_set_no_aux_tree(b, b->set);
+}
+
+/* Binary tree stuff for auxiliary search trees */
+
+/*
+ * Cacheline/offset <-> bkey pointer arithmetic:
+ *
+ * The ro aux tree is a binary search tree laid out in an array; each node
+ * corresponds to a key in one cacheline of the bset (BSET_CACHELINE bytes).
+ *
+ * This means we don't have to store the full index of the key that a node in
+ * the binary tree points to; __eytzinger1_to_inorder() gives us the cacheline,
+ * and then bkey_float->key_offset gives us the offset within that cacheline,
+ * in units of 8 bytes.
+ *
+ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
+ * make this work.
+ *
+ * To construct the bfloat for an arbitrary key we need to know what the key
+ * immediately preceding it is: we have to check if the two keys differ in the
+ * bits we're going to store in bkey_float->mantissa. ro_aux_tree_prev(b, t)[j]
+ * stores the size of the previous key so we can walk backwards to it from the
+ * key that tree node j points to.
+ */
+
+static inline void *bset_cacheline(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned cacheline)
+{
+ return (void *) round_down((unsigned long) btree_bkey_first(b, t),
+ L1_CACHE_BYTES) +
+ cacheline * BSET_CACHELINE;
+}
+
+static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned cacheline,
+ unsigned offset)
+{
+ return bset_cacheline(b, t, cacheline) + offset * 8;
+}
+
+static unsigned bkey_to_cacheline(const struct btree *b,
+ const struct bset_tree *t,
+ const struct bkey_packed *k)
+{
+ return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
+}
+
+static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned cacheline,
+ const struct bkey_packed *k)
+{
+ return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
+}
+
+static unsigned bkey_to_cacheline_offset(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned cacheline,
+ const struct bkey_packed *k)
+{
+ size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);
+
+ EBUG_ON(m > U8_MAX);
+ return m;
+}
+
+static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned j)
+{
+ return cacheline_to_bkey(b, t,
+ __eytzinger1_to_inorder(j, t->size - 1, t->extra),
+ bkey_float(b, t, j)->key_offset);
+}
+
+static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
+ const struct bset_tree *t,
+ unsigned j)
+{
+ unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
+
+ return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s);
+}
+
+static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
+ const struct bset_tree *t)
+{
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
+
+ return __aux_tree_base(b, t);
+}
+
+/*
+ * For the write set - the one we're currently inserting keys into - we don't
+ * maintain a full search tree, just a simple lookup table (the rw aux tree).
+ */
+static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
+ struct bset_tree *t,
+ unsigned j)
+{
+ return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
+}
+
+static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
+ unsigned j, struct bkey_packed *k)
+{
+ EBUG_ON(k >= btree_bkey_last(b, t));
+
+ rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
+ .offset = __btree_node_key_to_offset(b, k),
+ .k = bkey_unpack_pos(b, k),
+ };
+}
+
+static void bch2_bset_verify_rw_aux_tree(struct btree *b,
+ struct bset_tree *t)
+{
+ struct bkey_packed *k = btree_bkey_first(b, t);
+ unsigned j = 0;
+
+ if (!bch2_expensive_debug_checks)
+ return;
+
+ BUG_ON(bset_has_ro_aux_tree(t));
+
+ if (!bset_has_rw_aux_tree(t))
+ return;
+
+ BUG_ON(t->size < 1);
+ BUG_ON(rw_aux_to_bkey(b, t, j) != k);
+
+ goto start;
+ while (1) {
+ if (rw_aux_to_bkey(b, t, j) == k) {
+ BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k,
+ bkey_unpack_pos(b, k)));
+start:
+ if (++j == t->size)
+ break;
+
+ BUG_ON(rw_aux_tree(b, t)[j].offset <=
+ rw_aux_tree(b, t)[j - 1].offset);
+ }
+
+ k = bkey_p_next(k);
+ BUG_ON(k >= btree_bkey_last(b, t));
+ }
+}
+
+/* returns idx of first entry >= offset: */
+static unsigned rw_aux_tree_bsearch(struct btree *b,
+ struct bset_tree *t,
+ unsigned offset)
+{
+ unsigned bset_offs = offset - btree_bkey_first_offset(t);
+ unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t);
+ unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0;
+
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
+ EBUG_ON(!t->size);
+ EBUG_ON(idx > t->size);
+
+ while (idx < t->size &&
+ rw_aux_tree(b, t)[idx].offset < offset)
+ idx++;
+
+ while (idx &&
+ rw_aux_tree(b, t)[idx - 1].offset >= offset)
+ idx--;
+
+ EBUG_ON(idx < t->size &&
+ rw_aux_tree(b, t)[idx].offset < offset);
+ EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset);
+ EBUG_ON(idx + 1 < t->size &&
+ rw_aux_tree(b, t)[idx].offset ==
+ rw_aux_tree(b, t)[idx + 1].offset);
+
+ return idx;
+}
+
+static inline unsigned bkey_mantissa(const struct bkey_packed *k,
+ const struct bkey_float *f,
+ unsigned idx)
+{
+ u64 v;
+
+ EBUG_ON(!bkey_packed(k));
+
+ v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));
+
+ /*
+ * In little endian, we're shifting off low bits (and then the bits we
+ * want are at the low end), in big endian we're shifting off high bits
+ * (and then the bits we want are at the high end, so we shift them
+ * back down):
+ */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ v >>= f->exponent & 7;
+#else
+ v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS;
+#endif
+ return (u16) v;
+}
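bkey_mantissa() pulls 16 bits out of the packed key starting at the bit offset encoded in f->exponent: a byte offset (exponent >> 3) plus a shift of the remaining low bits. A standalone sketch of that extraction, assuming a little-endian host and an in-bounds 8-byte read (not the real helper):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static uint16_t mantissa16(const uint8_t *key, unsigned bit_offset)
{
	uint64_t v;

	memcpy(&v, key + (bit_offset >> 3), sizeof(v));
	v >>= bit_offset & 7;

	return (uint16_t) v;
}

int main(void)
{
	uint8_t key[16] = { 0 };

	key[2] = 0xab;	/* bits 16..23 */
	key[3] = 0xcd;	/* bits 24..31 */

	printf("0x%04x\n", mantissa16(key, 16));	/* 0xcdab */
	printf("0x%04x\n", mantissa16(key, 20));	/* 0x0cda */

	return 0;
}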
+
+static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
+ unsigned j,
+ struct bkey_packed *min_key,
+ struct bkey_packed *max_key)
+{
+ struct bkey_float *f = bkey_float(b, t, j);
+ struct bkey_packed *m = tree_to_bkey(b, t, j);
+ struct bkey_packed *l = is_power_of_2(j)
+ ? min_key
+ : tree_to_prev_bkey(b, t, j >> ffs(j));
+ struct bkey_packed *r = is_power_of_2(j + 1)
+ ? max_key
+ : tree_to_bkey(b, t, j >> (ffz(j) + 1));
+ unsigned mantissa;
+ int shift, exponent, high_bit;
+
+ /*
+ * for failed bfloats, the lookup code falls back to comparing against
+ * the original key.
+ */
+
+ if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) ||
+ !b->nr_key_bits) {
+ f->exponent = BFLOAT_FAILED_UNPACKED;
+ return;
+ }
+
+ /*
+ * The greatest differing bit of l and r is the first bit we must
+ * include in the bfloat mantissa we're creating in order to do
+ * comparisons - that bit always becomes the high bit of
+ * bfloat->mantissa, and thus the exponent we're calculating here is
+ * the position of what will become the low bit in bfloat->mantissa:
+ *
+ * Note that this may be negative - we may be running off the low end
+ * of the key: we handle this later:
+ */
+ high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
+ min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1);
+ exponent = high_bit - (BKEY_MANTISSA_BITS - 1);
+
+ /*
+ * Then we calculate the actual shift value, from the start of the key
+ * (k->_data), to get the key bits starting at exponent:
+ */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
+
+ EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64);
+#else
+ shift = high_bit_offset +
+ b->nr_key_bits -
+ exponent -
+ BKEY_MANTISSA_BITS;
+
+ EBUG_ON(shift < KEY_PACKED_BITS_START);
+#endif
+ EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
+
+ f->exponent = shift;
+ mantissa = bkey_mantissa(m, f, j);
+
+ /*
+ * If we've got garbage bits, set them to all 1s - it's legal for the
+ * bfloat to compare larger than the original key, but not smaller:
+ */
+ if (exponent < 0)
+ mantissa |= ~(~0U << -exponent);
+
+ f->mantissa = mantissa;
+}
+
+/* bytes remaining - only valid for last bset: */
+static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
+{
+ bset_aux_tree_verify(b);
+
+ return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
+}
+
+static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t)
+{
+ return __bset_tree_capacity(b, t) /
+ (sizeof(struct bkey_float) + sizeof(u8));
+}
+
+static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t)
+{
+ return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
+}
+
+static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
+{
+ struct bkey_packed *k;
+
+ t->size = 1;
+ t->extra = BSET_RW_AUX_TREE_VAL;
+ rw_aux_tree(b, t)[0].offset =
+ __btree_node_key_to_offset(b, btree_bkey_first(b, t));
+
+ bset_tree_for_each_key(b, t, k) {
+ if (t->size == bset_rw_tree_capacity(b, t))
+ break;
+
+ if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
+ L1_CACHE_BYTES)
+ rw_aux_tree_set(b, t, t->size++, k);
+ }
+}
+
+static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
+{
+ struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
+ struct bkey_i min_key, max_key;
+ unsigned j, cacheline = 1;
+
+ t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
+ bset_ro_tree_capacity(b, t));
+retry:
+ if (t->size < 2) {
+ t->size = 0;
+ t->extra = BSET_NO_AUX_TREE_VAL;
+ return;
+ }
+
+ t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
+
+ /* First we figure out where the first key in each cacheline is */
+ eytzinger1_for_each(j, t->size - 1) {
+ while (bkey_to_cacheline(b, t, k) < cacheline)
+ prev = k, k = bkey_p_next(k);
+
+ if (k >= btree_bkey_last(b, t)) {
+ /* XXX: this path sucks */
+ t->size--;
+ goto retry;
+ }
+
+ ro_aux_tree_prev(b, t)[j] = prev->u64s;
+ bkey_float(b, t, j)->key_offset =
+ bkey_to_cacheline_offset(b, t, cacheline++, k);
+
+ EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
+ EBUG_ON(tree_to_bkey(b, t, j) != k);
+ }
+
+ while (k != btree_bkey_last(b, t))
+ prev = k, k = bkey_p_next(k);
+
+ if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
+ bkey_init(&min_key.k);
+ min_key.k.p = b->data->min_key;
+ }
+
+ if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) {
+ bkey_init(&max_key.k);
+ max_key.k.p = b->data->max_key;
+ }
+
+ /* Then we build the tree */
+ eytzinger1_for_each(j, t->size - 1)
+ make_bfloat(b, t, j,
+ bkey_to_packed(&min_key),
+ bkey_to_packed(&max_key));
+}
+
+static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
+{
+ struct bset_tree *i;
+
+ for (i = b->set; i != t; i++)
+ BUG_ON(bset_has_rw_aux_tree(i));
+
+ bch2_bset_set_no_aux_tree(b, t);
+
+ /* round up to next cacheline: */
+ t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
+ SMP_CACHE_BYTES / sizeof(u64));
+
+ bset_aux_tree_verify(b);
+}
+
+void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
+ bool writeable)
+{
+ if (writeable
+ ? bset_has_rw_aux_tree(t)
+ : bset_has_ro_aux_tree(t))
+ return;
+
+ bset_alloc_tree(b, t);
+
+ if (!__bset_tree_capacity(b, t))
+ return;
+
+ if (writeable)
+ __build_rw_aux_tree(b, t);
+ else
+ __build_ro_aux_tree(b, t);
+
+ bset_aux_tree_verify(b);
+}
+
+void bch2_bset_init_first(struct btree *b, struct bset *i)
+{
+ struct bset_tree *t;
+
+ BUG_ON(b->nsets);
+
+ memset(i, 0, sizeof(*i));
+ get_random_bytes(&i->seq, sizeof(i->seq));
+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
+ t = &b->set[b->nsets++];
+ set_btree_bset(b, t, i);
+}
+
+void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
+ struct btree_node_entry *bne)
+{
+ struct bset *i = &bne->keys;
+ struct bset_tree *t;
+
+ BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
+ BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
+ BUG_ON(b->nsets >= MAX_BSETS);
+
+ memset(i, 0, sizeof(*i));
+ i->seq = btree_bset_first(b)->seq;
+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
+ t = &b->set[b->nsets++];
+ set_btree_bset(b, t, i);
+}
+
+/*
+ * find _some_ key in the same bset as @k that precedes @k - not necessarily the
+ * immediate predecessor:
+ */
+static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ struct bkey_packed *p;
+ unsigned offset;
+ int j;
+
+ EBUG_ON(k < btree_bkey_first(b, t) ||
+ k > btree_bkey_last(b, t));
+
+ if (k == btree_bkey_first(b, t))
+ return NULL;
+
+ switch (bset_aux_tree_type(t)) {
+ case BSET_NO_AUX_TREE:
+ p = btree_bkey_first(b, t);
+ break;
+ case BSET_RO_AUX_TREE:
+ j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));
+
+ do {
+ p = j ? tree_to_bkey(b, t,
+ __inorder_to_eytzinger1(j--,
+ t->size - 1, t->extra))
+ : btree_bkey_first(b, t);
+ } while (p >= k);
+ break;
+ case BSET_RW_AUX_TREE:
+ offset = __btree_node_key_to_offset(b, k);
+ j = rw_aux_tree_bsearch(b, t, offset);
+ p = j ? rw_aux_to_bkey(b, t, j - 1)
+ : btree_bkey_first(b, t);
+ break;
+ }
+
+ return p;
+}
+
+struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
+ struct bset_tree *t,
+ struct bkey_packed *k,
+ unsigned min_key_type)
+{
+ struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
+
+ while ((p = __bkey_prev(b, t, k)) && !ret) {
+ for (i = p; i != k; i = bkey_p_next(i))
+ if (i->type >= min_key_type)
+ ret = i;
+
+ k = p;
+ }
+
+ if (bch2_expensive_debug_checks) {
+ BUG_ON(ret >= orig_k);
+
+ for (i = ret
+ ? bkey_p_next(ret)
+ : btree_bkey_first(b, t);
+ i != orig_k;
+ i = bkey_p_next(i))
+ BUG_ON(i->type >= min_key_type);
+ }
+
+ return ret;
+}
+
+/* Insert */
+
+static void bch2_bset_fix_lookup_table(struct btree *b,
+ struct bset_tree *t,
+ struct bkey_packed *_where,
+ unsigned clobber_u64s,
+ unsigned new_u64s)
+{
+ int shift = new_u64s - clobber_u64s;
+ unsigned l, j, where = __btree_node_key_to_offset(b, _where);
+
+ EBUG_ON(bset_has_ro_aux_tree(t));
+
+ if (!bset_has_rw_aux_tree(t))
+ return;
+
+ /* returns first entry >= where */
+ l = rw_aux_tree_bsearch(b, t, where);
+
+ if (!l) /* never delete first entry */
+ l++;
+ else if (l < t->size &&
+ where < t->end_offset &&
+ rw_aux_tree(b, t)[l].offset == where)
+ rw_aux_tree_set(b, t, l++, _where);
+
+ /* l now > where */
+
+ for (j = l;
+ j < t->size &&
+ rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
+ j++)
+ ;
+
+ if (j < t->size &&
+ rw_aux_tree(b, t)[j].offset + shift ==
+ rw_aux_tree(b, t)[l - 1].offset)
+ j++;
+
+ memmove(&rw_aux_tree(b, t)[l],
+ &rw_aux_tree(b, t)[j],
+ (void *) &rw_aux_tree(b, t)[t->size] -
+ (void *) &rw_aux_tree(b, t)[j]);
+ t->size -= j - l;
+
+ for (j = l; j < t->size; j++)
+ rw_aux_tree(b, t)[j].offset += shift;
+
+ EBUG_ON(l < t->size &&
+ rw_aux_tree(b, t)[l].offset ==
+ rw_aux_tree(b, t)[l - 1].offset);
+
+ if (t->size < bset_rw_tree_capacity(b, t) &&
+ (l < t->size
+ ? rw_aux_tree(b, t)[l].offset
+ : t->end_offset) -
+ rw_aux_tree(b, t)[l - 1].offset >
+ L1_CACHE_BYTES / sizeof(u64)) {
+ struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
+ struct bkey_packed *end = l < t->size
+ ? rw_aux_to_bkey(b, t, l)
+ : btree_bkey_last(b, t);
+ struct bkey_packed *k = start;
+
+ while (1) {
+ k = bkey_p_next(k);
+ if (k == end)
+ break;
+
+ if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
+ memmove(&rw_aux_tree(b, t)[l + 1],
+ &rw_aux_tree(b, t)[l],
+ (void *) &rw_aux_tree(b, t)[t->size] -
+ (void *) &rw_aux_tree(b, t)[l]);
+ t->size++;
+ rw_aux_tree_set(b, t, l, k);
+ break;
+ }
+ }
+ }
+
+ bch2_bset_verify_rw_aux_tree(b, t);
+ bset_aux_tree_verify(b);
+}
+
+void bch2_bset_insert(struct btree *b,
+ struct btree_node_iter *iter,
+ struct bkey_packed *where,
+ struct bkey_i *insert,
+ unsigned clobber_u64s)
+{
+ struct bkey_format *f = &b->format;
+ struct bset_tree *t = bset_tree_last(b);
+ struct bkey_packed packed, *src = bkey_to_packed(insert);
+
+ bch2_bset_verify_rw_aux_tree(b, t);
+ bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s);
+
+ if (bch2_bkey_pack_key(&packed, &insert->k, f))
+ src = &packed;
+
+ if (!bkey_deleted(&insert->k))
+ btree_keys_account_key_add(&b->nr, t - b->set, src);
+
+ if (src->u64s != clobber_u64s) {
+ u64 *src_p = (u64 *) where->_data + clobber_u64s;
+ u64 *dst_p = (u64 *) where->_data + src->u64s;
+
+ EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
+ (int) clobber_u64s - src->u64s);
+
+ memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
+ le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
+ set_btree_bset_end(b, t);
+ }
+
+ memcpy_u64s_small(where, src,
+ bkeyp_key_u64s(f, src));
+ memcpy_u64s(bkeyp_val(f, where), &insert->v,
+ bkeyp_val_u64s(f, src));
+
+ if (src->u64s != clobber_u64s)
+ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
+
+ bch2_verify_btree_nr_keys(b);
+}
+
+void bch2_bset_delete(struct btree *b,
+ struct bkey_packed *where,
+ unsigned clobber_u64s)
+{
+ struct bset_tree *t = bset_tree_last(b);
+ u64 *src_p = (u64 *) where->_data + clobber_u64s;
+ u64 *dst_p = where->_data;
+
+ bch2_bset_verify_rw_aux_tree(b, t);
+
+ EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
+
+ memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
+ le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
+ set_btree_bset_end(b, t);
+
+ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
+}
+
+/* Lookup */
+
+__flatten
+static struct bkey_packed *bset_search_write_set(const struct btree *b,
+ struct bset_tree *t,
+ struct bpos *search)
+{
+ unsigned l = 0, r = t->size;
+
+ while (l + 1 != r) {
+ unsigned m = (l + r) >> 1;
+
+ if (bpos_lt(rw_aux_tree(b, t)[m].k, *search))
+ l = m;
+ else
+ r = m;
+ }
+
+ return rw_aux_to_bkey(b, t, l);
+}
+
+static inline void prefetch_four_cachelines(void *p)
+{
+#ifdef CONFIG_X86_64
+ asm("prefetcht0 (-127 + 64 * 0)(%0);"
+ "prefetcht0 (-127 + 64 * 1)(%0);"
+ "prefetcht0 (-127 + 64 * 2)(%0);"
+ "prefetcht0 (-127 + 64 * 3)(%0);"
+ :
+ : "r" (p + 127));
+#else
+ prefetch(p + L1_CACHE_BYTES * 0);
+ prefetch(p + L1_CACHE_BYTES * 1);
+ prefetch(p + L1_CACHE_BYTES * 2);
+ prefetch(p + L1_CACHE_BYTES * 3);
+#endif
+}
+
+static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
+ const struct bkey_float *f,
+ unsigned idx)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;
+
+ return f->exponent > key_bits_start;
+#else
+ unsigned key_bits_end = high_bit_offset + b->nr_key_bits;
+
+ return f->exponent + BKEY_MANTISSA_BITS < key_bits_end;
+#endif
+}
+
+__flatten
+static struct bkey_packed *bset_search_tree(const struct btree *b,
+ const struct bset_tree *t,
+ const struct bpos *search,
+ const struct bkey_packed *packed_search)
+{
+ struct ro_aux_tree *base = ro_aux_tree_base(b, t);
+ struct bkey_float *f;
+ struct bkey_packed *k;
+ unsigned inorder, n = 1, l, r;
+ int cmp;
+
+ do {
+ if (likely(n << 4 < t->size))
+ prefetch(&base->f[n << 4]);
+
+ f = &base->f[n];
+ if (unlikely(f->exponent >= BFLOAT_FAILED))
+ goto slowpath;
+
+ l = f->mantissa;
+ r = bkey_mantissa(packed_search, f, n);
+
+ if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n))
+ goto slowpath;
+
+ n = n * 2 + (l < r);
+ continue;
+slowpath:
+ k = tree_to_bkey(b, t, n);
+ cmp = bkey_cmp_p_or_unp(b, k, packed_search, search);
+ if (!cmp)
+ return k;
+
+ n = n * 2 + (cmp < 0);
+ } while (n < t->size);
+
+ inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra);
+
+ /*
+ * n would have been the node we recursed to - the low bit tells us if
+ * we recursed left or recursed right.
+ */
+ if (likely(!(n & 1))) {
+ --inorder;
+ if (unlikely(!inorder))
+ return btree_bkey_first(b, t);
+
+ f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)];
+ }
+
+ return cacheline_to_bkey(b, t, inorder, f->key_offset);
+}
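bset_search_tree() descends an implicit eytzinger-layout tree with n = n * 2 + (l < r), comparing 16-bit mantissas and only touching the packed key when a node's bfloat can't decide. A self-contained sketch of the same descent over a sorted int array stored in 1-indexed eytzinger order (children of node k at 2k and 2k+1); __builtin_ffs is a GCC/Clang builtin, and the whole thing is illustrative only:

#include <stdio.h>

#define N 7

static int build(const int *sorted, int *eyt, int i, int k)
{
	if (k <= N) {
		i = build(sorted, eyt, i, 2 * k);
		eyt[k] = sorted[i++];
		i = build(sorted, eyt, i, 2 * k + 1);
	}
	return i;
}

/* returns the eytzinger index of the first element >= x, or 0 if none */
static int search(const int *eyt, int x)
{
	int k = 1;

	while (k <= N)
		k = 2 * k + (eyt[k] < x);

	return k >> __builtin_ffs(~k);
}

int main(void)
{
	int sorted[N] = { 1, 3, 5, 7, 9, 11, 13 };
	int eyt[N + 1];

	build(sorted, eyt, 0, 1);

	for (int x = 0; x <= 14; x += 7) {
		int k = search(eyt, x);

		if (k)
			printf("first >= %2d: %d\n", x, eyt[k]);
		else
			printf("first >= %2d: none\n", x);
	}
	return 0;
}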
+
+static __always_inline __flatten
+struct bkey_packed *__bch2_bset_search(struct btree *b,
+ struct bset_tree *t,
+ struct bpos *search,
+ const struct bkey_packed *lossy_packed_search)
+{
+
+ /*
+	 * First we search for a cacheline, and then we do a linear search
+ * within that cacheline.
+ *
+	 * To search for the cacheline, there are three different possibilities:
+ * * The set is too small to have a search tree, so we just do a linear
+ * search over the whole set.
+ * * The set is the one we're currently inserting into; keeping a full
+ * auxiliary search tree up to date would be too expensive, so we
+ * use a much simpler lookup table to do a binary search -
+ * bset_search_write_set().
+ * * Or we use the auxiliary search tree we constructed earlier -
+ * bset_search_tree()
+ */
+
+ switch (bset_aux_tree_type(t)) {
+ case BSET_NO_AUX_TREE:
+ return btree_bkey_first(b, t);
+ case BSET_RW_AUX_TREE:
+ return bset_search_write_set(b, t, search);
+ case BSET_RO_AUX_TREE:
+ return bset_search_tree(b, t, search, lossy_packed_search);
+ default:
+ BUG();
+ }
+}
+
+static __always_inline __flatten
+struct bkey_packed *bch2_bset_search_linear(struct btree *b,
+ struct bset_tree *t,
+ struct bpos *search,
+ struct bkey_packed *packed_search,
+ const struct bkey_packed *lossy_packed_search,
+ struct bkey_packed *m)
+{
+ if (lossy_packed_search)
+ while (m != btree_bkey_last(b, t) &&
+ bkey_iter_cmp_p_or_unp(b, m,
+ lossy_packed_search, search) < 0)
+ m = bkey_p_next(m);
+
+ if (!packed_search)
+ while (m != btree_bkey_last(b, t) &&
+ bkey_iter_pos_cmp(b, m, search) < 0)
+ m = bkey_p_next(m);
+
+ if (bch2_expensive_debug_checks) {
+ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
+
+ BUG_ON(prev &&
+ bkey_iter_cmp_p_or_unp(b, prev,
+ packed_search, search) >= 0);
+ }
+
+ return m;
+}
+
+/* Btree node iterator */
+
+static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
+ struct btree *b,
+ const struct bkey_packed *k,
+ const struct bkey_packed *end)
+{
+ if (k != end) {
+ struct btree_node_iter_set *pos;
+
+ btree_node_iter_for_each(iter, pos)
+ ;
+
+ BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data));
+ *pos = (struct btree_node_iter_set) {
+ __btree_node_key_to_offset(b, k),
+ __btree_node_key_to_offset(b, end)
+ };
+ }
+}
+
+void bch2_btree_node_iter_push(struct btree_node_iter *iter,
+ struct btree *b,
+ const struct bkey_packed *k,
+ const struct bkey_packed *end)
+{
+ __bch2_btree_node_iter_push(iter, b, k, end);
+ bch2_btree_node_iter_sort(iter, b);
+}
+
+noinline __flatten __cold
+static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
+ struct btree *b, struct bpos *search)
+{
+ struct bkey_packed *k;
+
+ trace_bkey_pack_pos_fail(search);
+
+ bch2_btree_node_iter_init_from_start(iter, b);
+
+ while ((k = bch2_btree_node_iter_peek(iter, b)) &&
+ bkey_iter_pos_cmp(b, k, search) < 0)
+ bch2_btree_node_iter_advance(iter, b);
+}
+
+/**
+ * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
+ * given position
+ *
+ * @iter: iterator to initialize
+ * @b: btree node to search
+ * @search: search key
+ *
+ * Main entry point to the lookup code for individual btree nodes:
+ *
+ * NOTE:
+ *
+ * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
+ * keys. This doesn't matter for most code, but it does matter for lookups.
+ *
+ * Consider a run of adjacent keys containing a string of equal keys:
+ * i j k k k k l m
+ *
+ * If you search for k, the lookup code isn't guaranteed to return you any
+ * specific k. The lookup code is conceptually doing a binary search, and
+ * iterating backwards is very expensive, so if the pivot happens to land at
+ * the last k, that's what you'll get.
+ *
+ * This works out ok, but it's something to be aware of:
+ *
+ * - For non extents, we guarantee that the live key comes last - see
+ * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
+ * see will only be deleted keys you don't care about.
+ *
+ * - For extents, deleted keys sort last (see the comment at the top of this
+ * file). But when you're searching for extents, you actually want the first
+ * key strictly greater than your search key - an extent that compares equal
+ * to the search key is going to have 0 sectors after the search key.
+ *
+ * But this does mean that we can't just search for
+ * bpos_successor(start_of_range) to get the first extent that overlaps with
+ * the range we want - if we're unlucky and there's an extent that ends
+ * exactly where we searched, then there could be a deleted key at the same
+ * position and we'd get that when we search instead of the preceding extent
+ * we needed.
+ *
+ * So we've got to search for start_of_range, then after the lookup iterate
+ * past any extents that compare equal to the position we searched for.
+ */
+__flatten
+void bch2_btree_node_iter_init(struct btree_node_iter *iter,
+ struct btree *b, struct bpos *search)
+{
+ struct bkey_packed p, *packed_search = NULL;
+ struct btree_node_iter_set *pos = iter->data;
+ struct bkey_packed *k[MAX_BSETS];
+ unsigned i;
+
+ EBUG_ON(bpos_lt(*search, b->data->min_key));
+ EBUG_ON(bpos_gt(*search, b->data->max_key));
+ bset_aux_tree_verify(b);
+
+ memset(iter, 0, sizeof(*iter));
+
+ switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) {
+ case BKEY_PACK_POS_EXACT:
+ packed_search = &p;
+ break;
+ case BKEY_PACK_POS_SMALLER:
+ packed_search = NULL;
+ break;
+ case BKEY_PACK_POS_FAIL:
+ btree_node_iter_init_pack_failed(iter, b, search);
+ return;
+ }
+
+ for (i = 0; i < b->nsets; i++) {
+ k[i] = __bch2_bset_search(b, b->set + i, search, &p);
+ prefetch_four_cachelines(k[i]);
+ }
+
+ for (i = 0; i < b->nsets; i++) {
+ struct bset_tree *t = b->set + i;
+ struct bkey_packed *end = btree_bkey_last(b, t);
+
+ k[i] = bch2_bset_search_linear(b, t, search,
+ packed_search, &p, k[i]);
+ if (k[i] != end)
+ *pos++ = (struct btree_node_iter_set) {
+ __btree_node_key_to_offset(b, k[i]),
+ __btree_node_key_to_offset(b, end)
+ };
+ }
+
+ bch2_btree_node_iter_sort(iter, b);
+}
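The comment above bch2_btree_node_iter_init() makes two points: with duplicate (deleted) keys, a binary search may land on any member of a run of equal keys, and extent lookups therefore iterate past keys that compare equal to the search position afterwards. A small standalone sketch of the second point - finding the run of equal keys and stepping past it - with a plain lower-bound search over made-up data:

#include <stdio.h>

static int lower_bound(const int *a, int n, int x)
{
	int l = 0, r = n;

	while (l < r) {
		int m = (l + r) / 2;

		if (a[m] < x)
			l = m + 1;
		else
			r = m;
	}
	return l;
}

int main(void)
{
	int a[] = { 1, 2, 4, 4, 4, 4, 5, 6 };
	int n = sizeof(a) / sizeof(a[0]);
	int i = lower_bound(a, n, 4);

	printf("first >= 4 at index %d\n", i);

	/* the "extent" case: step past everything equal to the search key */
	while (i < n && a[i] == 4)
		i++;

	printf("first >  4 at index %d (value %d)\n", i, a[i]);
	return 0;
}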
+
+void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ struct bset_tree *t;
+
+ memset(iter, 0, sizeof(*iter));
+
+ for_each_bset(b, t)
+ __bch2_btree_node_iter_push(iter, b,
+ btree_bkey_first(b, t),
+ btree_bkey_last(b, t));
+ bch2_btree_node_iter_sort(iter, b);
+}
+
+struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter,
+ struct btree *b,
+ struct bset_tree *t)
+{
+ struct btree_node_iter_set *set;
+
+ btree_node_iter_for_each(iter, set)
+ if (set->end == t->end_offset)
+ return __btree_node_offset_to_key(b, set->k);
+
+ return btree_bkey_last(b, t);
+}
+
+static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
+ struct btree *b,
+ unsigned first)
+{
+ bool ret;
+
+ if ((ret = (btree_node_iter_cmp(b,
+ iter->data[first],
+ iter->data[first + 1]) > 0)))
+ swap(iter->data[first], iter->data[first + 1]);
+ return ret;
+}
+
+void bch2_btree_node_iter_sort(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ /* unrolled bubble sort: */
+
+ if (!__btree_node_iter_set_end(iter, 2)) {
+ btree_node_iter_sort_two(iter, b, 0);
+ btree_node_iter_sort_two(iter, b, 1);
+ }
+
+ if (!__btree_node_iter_set_end(iter, 1))
+ btree_node_iter_sort_two(iter, b, 0);
+}
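With at most three sets per node, sorting the iterator is an unrolled bubble sort: compare-and-swap positions (0,1), then (1,2), then (0,1) again, which is a complete sorting network for three elements. The same network on plain ints, as a quick standalone sanity sketch:

#include <stdio.h>

static void sort2(int *a, int i)
{
	if (a[i] > a[i + 1]) {
		int tmp = a[i];

		a[i] = a[i + 1];
		a[i + 1] = tmp;
	}
}

int main(void)
{
	int a[3] = { 9, 1, 5 };

	/* same compare/swap sequence as bch2_btree_node_iter_sort() */
	sort2(a, 0);
	sort2(a, 1);
	sort2(a, 0);

	printf("%d %d %d\n", a[0], a[1], a[2]);	/* 1 5 9 */
	return 0;
}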
+
+void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter,
+ struct btree_node_iter_set *set)
+{
+ struct btree_node_iter_set *last =
+ iter->data + ARRAY_SIZE(iter->data) - 1;
+
+ memmove(&set[0], &set[1], (void *) last - (void *) set);
+ *last = (struct btree_node_iter_set) { 0, 0 };
+}
+
+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s;
+
+ EBUG_ON(iter->data->k > iter->data->end);
+
+ if (unlikely(__btree_node_iter_set_end(iter, 0))) {
+ /* avoid an expensive memmove call: */
+ iter->data[0] = iter->data[1];
+ iter->data[1] = iter->data[2];
+ iter->data[2] = (struct btree_node_iter_set) { 0, 0 };
+ return;
+ }
+
+ if (__btree_node_iter_set_end(iter, 1))
+ return;
+
+ if (!btree_node_iter_sort_two(iter, b, 0))
+ return;
+
+ if (__btree_node_iter_set_end(iter, 2))
+ return;
+
+ btree_node_iter_sort_two(iter, b, 1);
+}
+
+void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ if (bch2_expensive_debug_checks) {
+ bch2_btree_node_iter_verify(iter, b);
+ bch2_btree_node_iter_next_check(iter, b);
+ }
+
+ __bch2_btree_node_iter_advance(iter, b);
+}
+
+/*
+ * Expensive - iterating backwards means checking every bset for its previous key:
+ */
+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ struct bkey_packed *k, *prev = NULL;
+ struct btree_node_iter_set *set;
+ struct bset_tree *t;
+ unsigned end = 0;
+
+ if (bch2_expensive_debug_checks)
+ bch2_btree_node_iter_verify(iter, b);
+
+ for_each_bset(b, t) {
+ k = bch2_bkey_prev_all(b, t,
+ bch2_btree_node_iter_bset_pos(iter, b, t));
+ if (k &&
+ (!prev || bkey_iter_cmp(b, k, prev) > 0)) {
+ prev = k;
+ end = t->end_offset;
+ }
+ }
+
+ if (!prev)
+ return NULL;
+
+ /*
+ * We're manually memmoving instead of just calling sort() to ensure the
+ * prev we picked ends up in slot 0 - sort won't necessarily put it
+ * there because of duplicate deleted keys:
+ */
+ btree_node_iter_for_each(iter, set)
+ if (set->end == end)
+ goto found;
+
+ BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]);
+found:
+ BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data));
+
+ memmove(&iter->data[1],
+ &iter->data[0],
+ (void *) set - (void *) &iter->data[0]);
+
+ iter->data[0].k = __btree_node_key_to_offset(b, prev);
+ iter->data[0].end = end;
+
+ if (bch2_expensive_debug_checks)
+ bch2_btree_node_iter_verify(iter, b);
+ return prev;
+}
+
+struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ struct bkey_packed *prev;
+
+ do {
+ prev = bch2_btree_node_iter_prev_all(iter, b);
+ } while (prev && bkey_deleted(prev));
+
+ return prev;
+}
+
+struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
+ struct btree *b,
+ struct bkey *u)
+{
+ struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b);
+
+ return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
+}
+
+/* Mergesort */
+
+void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats)
+{
+ const struct bset_tree *t;
+
+ for_each_bset(b, t) {
+ enum bset_aux_tree_type type = bset_aux_tree_type(t);
+ size_t j;
+
+ stats->sets[type].nr++;
+ stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
+ sizeof(u64);
+
+ if (bset_has_ro_aux_tree(t)) {
+ stats->floats += t->size - 1;
+
+ for (j = 1; j < t->size; j++)
+ stats->failed +=
+ bkey_float(b, t, j)->exponent ==
+ BFLOAT_FAILED;
+ }
+ }
+}
+
+void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
+ struct bkey_packed *k)
+{
+ struct bset_tree *t = bch2_bkey_to_bset(b, k);
+ struct bkey uk;
+ unsigned j, inorder;
+
+ if (!bset_has_ro_aux_tree(t))
+ return;
+
+ inorder = bkey_to_cacheline(b, t, k);
+ if (!inorder || inorder >= t->size)
+ return;
+
+ j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra);
+ if (k != tree_to_bkey(b, t, j))
+ return;
+
+ switch (bkey_float(b, t, j)->exponent) {
+ case BFLOAT_FAILED:
+ uk = bkey_unpack_key(b, k);
+ prt_printf(out,
+ " failed unpacked at depth %u\n"
+ "\t",
+ ilog2(j));
+ bch2_bpos_to_text(out, uk.p);
+ prt_printf(out, "\n");
+ break;
+ }
+}
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
new file mode 100644
index 000000000000..632c2b8c5460
--- /dev/null
+++ b/fs/bcachefs/bset.h
@@ -0,0 +1,541 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BSET_H
+#define _BCACHEFS_BSET_H
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include "bcachefs.h"
+#include "bkey.h"
+#include "bkey_methods.h"
+#include "btree_types.h"
+#include "util.h" /* for time_stats */
+#include "vstructs.h"
+
+/*
+ * BKEYS:
+ *
+ * A bkey contains a key, a size field, a variable number of pointers, and some
+ * ancillary flag bits.
+ *
+ * We use two different functions for validating bkeys, bkey_invalid and
+ * bkey_deleted().
+ *
+ * The one exception to the rule that ptr_invalid() filters out invalid keys is
+ * that it also filters out keys of size 0 - these are keys that have been
+ * completely overwritten. It'd be safe to delete these in memory while leaving
+ * them on disk, just unnecessary work - so we filter them out when resorting
+ * instead.
+ *
+ * We can't filter out stale keys when we're resorting, because garbage
+ * collection needs to find them to ensure bucket gens don't wrap around -
+ * unless we're rewriting the btree node those stale keys still exist on disk.
+ *
+ * We also implement functions here for removing some number of sectors from the
+ * front or the back of a bkey - this is mainly used for fixing overlapping
+ * extents, by removing the overlapping sectors from the older key.
+ *
+ * BSETS:
+ *
+ * A bset is an array of bkeys laid out contiguously in memory in sorted order,
+ * along with a header. A btree node is made up of a number of these, written at
+ * different times.
+ *
+ * There could be many of them on disk, but we never allow there to be more than
+ * 4 in memory - we lazily resort as needed.
+ *
+ * We implement code here for creating and maintaining auxiliary search trees
+ * (described below) for searching an individual bset, and on top of that we
+ * implement a btree iterator.
+ *
+ * BTREE ITERATOR:
+ *
+ * Most of the code in bcache doesn't care about an individual bset - it needs
+ * to search entire btree nodes and iterate over them in sorted order.
+ *
+ * The btree iterator code serves both functions; it iterates through the keys
+ * in a btree node in sorted order, starting from either keys after a specific
+ * point (if you pass it a search key) or the start of the btree node.
+ *
+ * AUXILIARY SEARCH TREES:
+ *
+ * Since keys are variable length, we can't use a binary search on a bset - we
+ * wouldn't be able to find the start of the next key. But binary searches are
+ * slow anyways, due to terrible cache behaviour; bcache originally used binary
+ * searches and that code topped out at under 50k lookups/second.
+ *
+ * So we need to construct some sort of lookup table. Since we only insert keys
+ * into the last (unwritten) set, most of the keys within a given btree node are
+ * usually in sets that are mostly constant. We use two different types of
+ * lookup tables to take advantage of this.
+ *
+ * Both lookup tables share in common that they don't index every key in the
+ * set; they index one key every BSET_CACHELINE bytes, and then a linear search
+ * is used for the rest.
+ *
+ * For sets that have been written to disk and are no longer being inserted
+ * into, we construct a binary search tree in an array - traversing a binary
+ * search tree in an array gives excellent locality of reference and is very
+ * fast, since both children of any node are adjacent to each other in memory
+ * (and their grandchildren, and great grandchildren...) - this means
+ * prefetching can be used to great effect.
+ *
+ * It's quite useful performance-wise to keep these nodes small - not just
+ * because they're more likely to be in L2, but also because we can prefetch
+ * more nodes on a single cacheline and thus prefetch more iterations in advance
+ * when traversing this tree.
+ *
+ * Nodes in the auxiliary search tree must contain both a key to compare against
+ * (we don't want to fetch the key from the set, that would defeat the purpose),
+ * and a pointer to the key. We use a few tricks to compress both of these.
+ *
+ * To compress the pointer, we take advantage of the fact that one node in the
+ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
+ * a function (to_inorder()) that takes the index of a node in a binary tree and
+ * returns what its index would be in an inorder traversal, so we only have to
+ * store the low bits of the offset.
+ *
+ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
+ * compress that, we take advantage of the fact that when we're traversing the
+ * search tree at every iteration we know that both our search key and the key
+ * we're looking for lie within some range - bounded by our previous
+ * comparisons. (We special case the start of a search so that this is true even
+ * at the root of the tree).
+ *
+ * So if we know the key we're looking for is between a and b, and a and b
+ * don't differ above bit 50, we don't need to check anything higher than bit
+ * 50.
+ *
+ * We don't usually need the rest of the bits, either; we only need enough bits
+ * to partition the key range we're currently checking. Consider key n - the
+ * key our auxiliary search tree node corresponds to, and key p, the key
+ * immediately preceding n. The lowest bit we need to store in the auxiliary
+ * search tree is the highest bit that differs between n and p.
+ *
+ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
+ * comparison. But we'd really like our nodes in the auxiliary search tree to be
+ * of fixed size.
+ *
+ * The solution is to make them fixed size, and when we're constructing a node
+ * check whether p and n differ in the bits we need them to. If they don't, we
+ * flag that node, and when doing lookups we fall back to comparing against the
+ * real key. As long as this doesn't happen too often (and it seems to reliably
+ * happen a bit less than 1% of the time), we win - even on failures, that key
+ * is then more likely to be in cache than if we were doing binary searches all
+ * the way, since we're touching so much less memory.
+ *
+ * The keys in the auxiliary search tree are stored in (software) floating
+ * point, with an exponent and a mantissa. The exponent needs to be big enough
+ * to address all the bits in the original key, but the number of bits in the
+ * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
+ *
+ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
+ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
+ * We need one node per 128 bytes in the btree node, which means the auxiliary
+ * search trees take up 3% as much memory as the btree itself.
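+ *
+ * (Checking that arithmetic, as an illustration rather than a format
+ * definition: 7 exponent bits + 3 offset bits + 22 mantissa bits = 32 bits,
+ * i.e. a 4 byte node, and one 4 byte node per 128 bytes of keys works out to
+ * 4/128 ~= 3%.)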
+ *
+ * Constructing these auxiliary search trees is moderately expensive, and we
+ * don't want to be constantly rebuilding the search tree for the last set
+ * whenever we insert another key into it. For the unwritten set, we use a much
+ * simpler lookup table - it's just a flat array, so index i in the lookup table
+ * corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing
+ * within each byte range works the same as with the auxiliary search trees.
+ *
+ * These are much easier to keep up to date when we insert a key - we do it
+ * somewhat lazily; when we shift a key up we usually just increment the pointer
+ * to it, only when it would overflow do we go to the trouble of finding the
+ * first key in that range of bytes again.
+ */
+
+enum bset_aux_tree_type {
+ BSET_NO_AUX_TREE,
+ BSET_RO_AUX_TREE,
+ BSET_RW_AUX_TREE,
+};
+
+#define BSET_TREE_NR_TYPES 3
+
+#define BSET_NO_AUX_TREE_VAL (U16_MAX)
+#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1)
+
+static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t)
+{
+ switch (t->extra) {
+ case BSET_NO_AUX_TREE_VAL:
+ EBUG_ON(t->size);
+ return BSET_NO_AUX_TREE;
+ case BSET_RW_AUX_TREE_VAL:
+ EBUG_ON(!t->size);
+ return BSET_RW_AUX_TREE;
+ default:
+ EBUG_ON(!t->size);
+ return BSET_RO_AUX_TREE;
+ }
+}
+
+/*
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
+ * it used to be 64, but the lookup code touches slightly less memory with a
+ * larger value (it's now 256, as defined below).
+ *
+ * It defines the number of bytes (in struct bset) per struct bkey_float in
+ * the auxiliary search tree - when we're done searching the bkey_float tree we
+ * have this many bytes left that we do a linear search over.
+ *
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
+ * cacheline in the linear search - but the linear search might stop before it
+ * gets to the second cacheline.
+ */
+
+#define BSET_CACHELINE 256
+
+static inline size_t btree_keys_cachelines(const struct btree *b)
+{
+ return (1U << b->byte_order) / BSET_CACHELINE;
+}
+
+static inline size_t btree_aux_data_bytes(const struct btree *b)
+{
+ return btree_keys_cachelines(b) * 8;
+}
+
+static inline size_t btree_aux_data_u64s(const struct btree *b)
+{
+ return btree_aux_data_bytes(b) / sizeof(u64);
+}
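+
+/*
+ * Purely illustrative numbers (the btree node size is a filesystem option):
+ * with a 256k btree node, byte_order is 18, so btree_keys_cachelines() is
+ * (1 << 18) / 256 = 1024, btree_aux_data_bytes() is 1024 * 8 = 8192, and
+ * btree_aux_data_u64s() is 1024.
+ */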
+
+#define for_each_bset(_b, _t) \
+ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
+
+#define bset_tree_for_each_key(_b, _t, _k) \
+ for (_k = btree_bkey_first(_b, _t); \
+ _k != btree_bkey_last(_b, _t); \
+ _k = bkey_p_next(_k))
+
+static inline bool bset_has_ro_aux_tree(const struct bset_tree *t)
+{
+ return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;
+}
+
+static inline bool bset_has_rw_aux_tree(struct bset_tree *t)
+{
+ return bset_aux_tree_type(t) == BSET_RW_AUX_TREE;
+}
+
+static inline void bch2_bset_set_no_aux_tree(struct btree *b,
+ struct bset_tree *t)
+{
+ BUG_ON(t < b->set);
+
+ for (; t < b->set + ARRAY_SIZE(b->set); t++) {
+ t->size = 0;
+ t->extra = BSET_NO_AUX_TREE_VAL;
+ t->aux_data_offset = U16_MAX;
+ }
+}
+
+static inline void btree_node_set_format(struct btree *b,
+ struct bkey_format f)
+{
+ int len;
+
+ b->format = f;
+ b->nr_key_bits = bkey_format_key_bits(&f);
+
+ len = bch2_compile_bkey_format(&b->format, b->aux_data);
+ BUG_ON(len < 0 || len > U8_MAX);
+
+ b->unpack_fn_len = len;
+
+ bch2_bset_set_no_aux_tree(b, b->set);
+}
+
+static inline struct bset *bset_next_set(struct btree *b,
+ unsigned block_bytes)
+{
+ struct bset *i = btree_bset_last(b);
+
+ EBUG_ON(!is_power_of_2(block_bytes));
+
+ return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
+}
+
+void bch2_btree_keys_init(struct btree *);
+
+void bch2_bset_init_first(struct btree *, struct bset *);
+void bch2_bset_init_next(struct bch_fs *, struct btree *,
+ struct btree_node_entry *);
+void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
+
+void bch2_bset_insert(struct btree *, struct btree_node_iter *,
+ struct bkey_packed *, struct bkey_i *, unsigned);
+void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
+
+/* Bkey utility code */
+
+/* packed or unpacked */
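+/*
+ * Compare a packed key l against a search key that the caller may supply in
+ * two forms: r_packed (when the search key packs in this btree node's format)
+ * and r, the unpacked position, which is always provided. When r_packed is
+ * available the packed-format comparison is used; otherwise we fall back to
+ * comparing against the unpacked position.
+ */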
+static inline int bkey_cmp_p_or_unp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r_packed,
+ const struct bpos *r)
+{
+ EBUG_ON(r_packed && !bkey_packed(r_packed));
+
+ if (unlikely(!bkey_packed(l)))
+ return bpos_cmp(packed_to_bkey_c(l)->p, *r);
+
+ if (likely(r_packed))
+ return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b);
+
+ return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
+}
+
+static inline struct bset_tree *
+bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k)
+{
+ unsigned offset = __btree_node_key_to_offset(b, k);
+ struct bset_tree *t;
+
+ for_each_bset(b, t)
+ if (offset <= t->end_offset) {
+ EBUG_ON(offset < btree_bkey_first_offset(t));
+ return t;
+ }
+
+ BUG();
+}
+
+struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *);
+
+struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *,
+ struct bkey_packed *, unsigned);
+
+static inline struct bkey_packed *
+bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
+{
+ return bch2_bkey_prev_filter(b, t, k, 0);
+}
+
+static inline struct bkey_packed *
+bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
+{
+ return bch2_bkey_prev_filter(b, t, k, 1);
+}
+
+/* Btree key iteration */
+
+void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *,
+ const struct bkey_packed *,
+ const struct bkey_packed *);
+void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *,
+ struct bpos *);
+void bch2_btree_node_iter_init_from_start(struct btree_node_iter *,
+ struct btree *);
+struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *,
+ struct btree *,
+ struct bset_tree *);
+
+void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *);
+void bch2_btree_node_iter_set_drop(struct btree_node_iter *,
+ struct btree_node_iter_set *);
+void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *);
+
+#define btree_node_iter_for_each(_iter, _set) \
+ for (_set = (_iter)->data; \
+ _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \
+ (_set)->k != (_set)->end; \
+ _set++)
+
+static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter,
+ unsigned i)
+{
+ return iter->data[i].k == iter->data[i].end;
+}
+
+static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter)
+{
+ return __btree_node_iter_set_end(iter, 0);
+}
+
+/*
+ * When keys compare equal, deleted keys compare first:
+ *
+ * XXX: only need to compare pointers for keys that are both within a
+ * btree_node_iterator - we need to break ties for prev() to work correctly
+ */
+static inline int bkey_iter_cmp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
+{
+ return bch2_bkey_cmp_packed(b, l, r)
+ ?: (int) bkey_deleted(r) - (int) bkey_deleted(l)
+ ?: cmp_int(l, r);
+}
+
+static inline int btree_node_iter_cmp(const struct btree *b,
+ struct btree_node_iter_set l,
+ struct btree_node_iter_set r)
+{
+ return bkey_iter_cmp(b,
+ __btree_node_offset_to_key(b, l.k),
+ __btree_node_offset_to_key(b, r.k));
+}
+
+/* These assume r (the search key) is not a deleted key: */
+static inline int bkey_iter_pos_cmp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bpos *r)
+{
+ return bkey_cmp_left_packed(b, l, r)
+ ?: -((int) bkey_deleted(l));
+}
+
+static inline int bkey_iter_cmp_p_or_unp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r_packed,
+ const struct bpos *r)
+{
+ return bkey_cmp_p_or_unp(b, l, r_packed, r)
+ ?: -((int) bkey_deleted(l));
+}
+
+static inline struct bkey_packed *
+__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
+ struct btree *b)
+{
+ return __btree_node_offset_to_key(b, iter->data->k);
+}
+
+static inline struct bkey_packed *
+bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b)
+{
+ return !bch2_btree_node_iter_end(iter)
+ ? __btree_node_offset_to_key(b, iter->data->k)
+ : NULL;
+}
+
+static inline struct bkey_packed *
+bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
+{
+ struct bkey_packed *k;
+
+ while ((k = bch2_btree_node_iter_peek_all(iter, b)) &&
+ bkey_deleted(k))
+ bch2_btree_node_iter_advance(iter, b);
+
+ return k;
+}
+
+static inline struct bkey_packed *
+bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
+{
+ struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b);
+
+ if (ret)
+ bch2_btree_node_iter_advance(iter, b);
+
+ return ret;
+}
+
+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *,
+ struct btree *);
+struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *,
+ struct btree *);
+
+struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
+ struct btree *,
+ struct bkey *);
+
+#define for_each_btree_node_key(b, k, iter) \
+ for (bch2_btree_node_iter_init_from_start((iter), (b)); \
+ (k = bch2_btree_node_iter_peek((iter), (b))); \
+ bch2_btree_node_iter_advance(iter, b))
+
+#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \
+ for (bch2_btree_node_iter_init_from_start((iter), (b)); \
+ (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
+ bch2_btree_node_iter_advance(iter, b))
+
+/* Accounting: */
+
+static inline void btree_keys_account_key(struct btree_nr_keys *n,
+ unsigned bset,
+ struct bkey_packed *k,
+ int sign)
+{
+ n->live_u64s += k->u64s * sign;
+ n->bset_u64s[bset] += k->u64s * sign;
+
+ if (bkey_packed(k))
+ n->packed_keys += sign;
+ else
+ n->unpacked_keys += sign;
+}
+
+static inline void btree_keys_account_val_delta(struct btree *b,
+ struct bkey_packed *k,
+ int delta)
+{
+ struct bset_tree *t = bch2_bkey_to_bset(b, k);
+
+ b->nr.live_u64s += delta;
+ b->nr.bset_u64s[t - b->set] += delta;
+}
+
+#define btree_keys_account_key_add(_nr, _bset_idx, _k) \
+ btree_keys_account_key(_nr, _bset_idx, _k, 1)
+#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \
+ btree_keys_account_key(_nr, _bset_idx, _k, -1)
+
+#define btree_account_key_add(_b, _k) \
+ btree_keys_account_key(&(_b)->nr, \
+ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1)
+#define btree_account_key_drop(_b, _k) \
+ btree_keys_account_key(&(_b)->nr, \
+ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1)
+
+struct bset_stats {
+ struct {
+ size_t nr, bytes;
+ } sets[BSET_TREE_NR_TYPES];
+
+ size_t floats;
+ size_t failed;
+};
+
+void bch2_btree_keys_stats(const struct btree *, struct bset_stats *);
+void bch2_bfloat_to_text(struct printbuf *, struct btree *,
+ struct bkey_packed *);
+
+/* Debug stuff */
+
+void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned);
+void bch2_dump_btree_node(struct bch_fs *, struct btree *);
+void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void __bch2_verify_btree_nr_keys(struct btree *);
+void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
+void bch2_verify_insert_pos(struct btree *, struct bkey_packed *,
+ struct bkey_packed *, unsigned);
+
+#else
+
+static inline void __bch2_verify_btree_nr_keys(struct btree *b) {}
+static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
+ struct btree *b) {}
+static inline void bch2_verify_insert_pos(struct btree *b,
+ struct bkey_packed *where,
+ struct bkey_packed *insert,
+ unsigned clobber_u64s) {}
+#endif
+
+static inline void bch2_verify_btree_nr_keys(struct btree *b)
+{
+ if (bch2_debug_check_btree_accounting)
+ __bch2_verify_btree_nr_keys(b);
+}
+
+#endif /* _BCACHEFS_BSET_H */
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
new file mode 100644
index 000000000000..79495cd7a794
--- /dev/null
+++ b/fs/bcachefs/btree_cache.c
@@ -0,0 +1,1215 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "debug.h"
+#include "errcode.h"
+#include "error.h"
+#include "journal.h"
+#include "trace.h"
+
+#include <linux/prefetch.h>
+#include <linux/sched/mm.h>
+
+const char * const bch2_btree_node_flags[] = {
+#define x(f) #f,
+ BTREE_FLAGS()
+#undef x
+ NULL
+};
+
+void bch2_recalc_btree_reserve(struct bch_fs *c)
+{
+ unsigned i, reserve = 16;
+
+ if (!c->btree_roots_known[0].b)
+ reserve += 8;
+
+ for (i = 0; i < btree_id_nr_alive(c); i++) {
+ struct btree_root *r = bch2_btree_id_root(c, i);
+
+ if (r->b)
+ reserve += min_t(unsigned, 1, r->b->c.level) * 8;
+ }
+
+ c->btree_cache.reserve = reserve;
+}
+
+static inline unsigned btree_cache_can_free(struct btree_cache *bc)
+{
+ return max_t(int, 0, bc->used - bc->reserve);
+}
+
+static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
+{
+ if (b->c.lock.readers)
+ list_move(&b->list, &bc->freed_pcpu);
+ else
+ list_move(&b->list, &bc->freed_nonpcpu);
+}
+
+static void btree_node_data_free(struct bch_fs *c, struct btree *b)
+{
+ struct btree_cache *bc = &c->btree_cache;
+
+ EBUG_ON(btree_node_write_in_flight(b));
+
+ clear_btree_node_just_written(b);
+
+ kvpfree(b->data, btree_bytes(c));
+ b->data = NULL;
+#ifdef __KERNEL__
+ kvfree(b->aux_data);
+#else
+ munmap(b->aux_data, btree_aux_data_bytes(b));
+#endif
+ b->aux_data = NULL;
+
+ bc->used--;
+
+ btree_node_to_freedlist(bc, b);
+}
+
+static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const struct btree *b = obj;
+ const u64 *v = arg->key;
+
+ return b->hash_val == *v ? 0 : 1;
+}
+
+static const struct rhashtable_params bch_btree_cache_params = {
+ .head_offset = offsetof(struct btree, hash),
+ .key_offset = offsetof(struct btree, hash_val),
+ .key_len = sizeof(u64),
+ .obj_cmpfn = bch2_btree_cache_cmp_fn,
+};
+
+static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
+{
+ BUG_ON(b->data || b->aux_data);
+
+ b->data = kvpmalloc(btree_bytes(c), gfp);
+ if (!b->data)
+ return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
+#ifdef __KERNEL__
+ b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
+#else
+ b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
+ PROT_READ|PROT_WRITE|PROT_EXEC,
+ MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
+ if (b->aux_data == MAP_FAILED)
+ b->aux_data = NULL;
+#endif
+ if (!b->aux_data) {
+ kvpfree(b->data, btree_bytes(c));
+ b->data = NULL;
+ return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
+ }
+
+ return 0;
+}
+
+static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
+{
+ struct btree *b;
+
+ b = kzalloc(sizeof(struct btree), gfp);
+ if (!b)
+ return NULL;
+
+ bkey_btree_ptr_init(&b->key);
+ INIT_LIST_HEAD(&b->list);
+ INIT_LIST_HEAD(&b->write_blocked);
+ b->byte_order = ilog2(btree_bytes(c));
+ return b;
+}
+
+struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+
+ b = __btree_node_mem_alloc(c, GFP_KERNEL);
+ if (!b)
+ return NULL;
+
+ if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
+ kfree(b);
+ return NULL;
+ }
+
+ bch2_btree_lock_init(&b->c, 0);
+
+ bc->used++;
+ list_add(&b->list, &bc->freeable);
+ return b;
+}
+
+/* Btree in memory cache - hash table */
+
+void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
+{
+ int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
+
+ BUG_ON(ret);
+
+ /* Cause future lookups for this node to fail: */
+ b->hash_val = 0;
+}
+
+int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
+{
+ BUG_ON(b->hash_val);
+ b->hash_val = btree_ptr_hash_val(&b->key);
+
+ return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
+ bch_btree_cache_params);
+}
+
+int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
+ unsigned level, enum btree_id id)
+{
+ int ret;
+
+ b->c.level = level;
+ b->c.btree_id = id;
+
+ mutex_lock(&bc->lock);
+ ret = __bch2_btree_node_hash_insert(bc, b);
+ if (!ret)
+ list_add_tail(&b->list, &bc->live);
+ mutex_unlock(&bc->lock);
+
+ return ret;
+}
+
+__flatten
+static inline struct btree *btree_cache_find(struct btree_cache *bc,
+ const struct bkey_i *k)
+{
+ u64 v = btree_ptr_hash_val(k);
+
+ return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
+}
+
+/*
+ * this version is for btree nodes that have already been freed (we're not
+ * reaping a real btree node)
+ */
+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ int ret = 0;
+
+ lockdep_assert_held(&bc->lock);
+wait_on_io:
+ if (b->flags & ((1U << BTREE_NODE_dirty)|
+ (1U << BTREE_NODE_read_in_flight)|
+ (1U << BTREE_NODE_write_in_flight))) {
+ if (!flush)
+ return -BCH_ERR_ENOMEM_btree_node_reclaim;
+
+ /* XXX: waiting on IO with btree cache lock held */
+ bch2_btree_node_wait_on_read(b);
+ bch2_btree_node_wait_on_write(b);
+ }
+
+ if (!six_trylock_intent(&b->c.lock))
+ return -BCH_ERR_ENOMEM_btree_node_reclaim;
+
+ if (!six_trylock_write(&b->c.lock))
+ goto out_unlock_intent;
+
+ /* recheck under lock */
+ if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
+ (1U << BTREE_NODE_write_in_flight))) {
+ if (!flush)
+ goto out_unlock;
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ goto wait_on_io;
+ }
+
+ if (btree_node_noevict(b) ||
+ btree_node_write_blocked(b) ||
+ btree_node_will_make_reachable(b))
+ goto out_unlock;
+
+ if (btree_node_dirty(b)) {
+ if (!flush)
+ goto out_unlock;
+ /*
+ * Using the underscore version because we don't want to compact
+ * bsets after the write, since this node is about to be evicted
+ * - unless btree verify mode is enabled, since it runs out of
+ * the post write cleanup:
+ */
+ if (bch2_verify_btree_ondisk)
+ bch2_btree_node_write(c, b, SIX_LOCK_intent,
+ BTREE_WRITE_cache_reclaim);
+ else
+ __bch2_btree_node_write(c, b,
+ BTREE_WRITE_cache_reclaim);
+
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ goto wait_on_io;
+ }
+out:
+ if (b->hash_val && !ret)
+ trace_and_count(c, btree_cache_reap, c, b);
+ return ret;
+out_unlock:
+ six_unlock_write(&b->c.lock);
+out_unlock_intent:
+ six_unlock_intent(&b->c.lock);
+ ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
+ goto out;
+}
+
+static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
+{
+ return __btree_node_reclaim(c, b, false);
+}
+
+static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
+{
+ return __btree_node_reclaim(c, b, true);
+}
+
+static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct bch_fs *c = shrink->private_data;
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b, *t;
+ unsigned long nr = sc->nr_to_scan;
+ unsigned long can_free = 0;
+ unsigned long freed = 0;
+ unsigned long touched = 0;
+ unsigned i, flags;
+ unsigned long ret = SHRINK_STOP;
+ bool trigger_writes = atomic_read(&bc->dirty) + nr >=
+ bc->used * 3 / 4;
+
+ if (bch2_btree_shrinker_disabled)
+ return SHRINK_STOP;
+
+ mutex_lock(&bc->lock);
+ flags = memalloc_nofs_save();
+
+ /*
+ * It's _really_ critical that we don't free too many btree nodes - we
+ * have to always leave ourselves a reserve. The reserve is how we
+ * guarantee that allocating memory for a new btree node can always
+ * succeed, so that inserting keys into the btree can always succeed and
+ * IO can always make forward progress:
+ */
+ can_free = btree_cache_can_free(bc);
+ nr = min_t(unsigned long, nr, can_free);
+
+ i = 0;
+ list_for_each_entry_safe(b, t, &bc->freeable, list) {
+ /*
+ * Leave a few nodes on the freeable list, so that a btree split
+ * won't have to hit the system allocator:
+ */
+ if (++i <= 3)
+ continue;
+
+ touched++;
+
+ if (touched >= nr)
+ goto out;
+
+ if (!btree_node_reclaim(c, b)) {
+ btree_node_data_free(c, b);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ freed++;
+ }
+ }
+restart:
+ list_for_each_entry_safe(b, t, &bc->live, list) {
+ touched++;
+
+ if (btree_node_accessed(b)) {
+ clear_btree_node_accessed(b);
+ } else if (!btree_node_reclaim(c, b)) {
+ freed++;
+ btree_node_data_free(c, b);
+
+ bch2_btree_node_hash_remove(bc, b);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+
+ if (freed == nr)
+ goto out_rotate;
+ } else if (trigger_writes &&
+ btree_node_dirty(b) &&
+ !btree_node_will_make_reachable(b) &&
+ !btree_node_write_blocked(b) &&
+ six_trylock_read(&b->c.lock)) {
+ list_move(&bc->live, &b->list);
+ mutex_unlock(&bc->lock);
+ __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
+ six_unlock_read(&b->c.lock);
+ if (touched >= nr)
+ goto out_nounlock;
+ mutex_lock(&bc->lock);
+ goto restart;
+ }
+
+ if (touched >= nr)
+ break;
+ }
+out_rotate:
+ if (&t->list != &bc->live)
+ list_move_tail(&bc->live, &t->list);
+out:
+ mutex_unlock(&bc->lock);
+out_nounlock:
+ ret = freed;
+ memalloc_nofs_restore(flags);
+ trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
+ return ret;
+}
+
+static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct bch_fs *c = shrink->private_data;
+ struct btree_cache *bc = &c->btree_cache;
+
+ if (bch2_btree_shrinker_disabled)
+ return 0;
+
+ return btree_cache_can_free(bc);
+}
+
+void bch2_fs_btree_cache_exit(struct bch_fs *c)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+ unsigned i, flags;
+
+ shrinker_free(bc->shrink);
+
+ /* vfree() can allocate memory: */
+ flags = memalloc_nofs_save();
+ mutex_lock(&bc->lock);
+
+ if (c->verify_data)
+ list_move(&c->verify_data->list, &bc->live);
+
+ kvpfree(c->verify_ondisk, btree_bytes(c));
+
+ for (i = 0; i < btree_id_nr_alive(c); i++) {
+ struct btree_root *r = bch2_btree_id_root(c, i);
+
+ if (r->b)
+ list_add(&r->b->list, &bc->live);
+ }
+
+ list_splice(&bc->freeable, &bc->live);
+
+ while (!list_empty(&bc->live)) {
+ b = list_first_entry(&bc->live, struct btree, list);
+
+ BUG_ON(btree_node_read_in_flight(b) ||
+ btree_node_write_in_flight(b));
+
+ btree_node_data_free(c, b);
+ }
+
+ BUG_ON(!bch2_journal_error(&c->journal) &&
+ atomic_read(&c->btree_cache.dirty));
+
+ list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
+
+ while (!list_empty(&bc->freed_nonpcpu)) {
+ b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
+ list_del(&b->list);
+ six_lock_exit(&b->c.lock);
+ kfree(b);
+ }
+
+ mutex_unlock(&bc->lock);
+ memalloc_nofs_restore(flags);
+
+ if (bc->table_init_done)
+ rhashtable_destroy(&bc->table);
+}
+
+int bch2_fs_btree_cache_init(struct bch_fs *c)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ struct shrinker *shrink;
+ unsigned i;
+ int ret = 0;
+
+ ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
+ if (ret)
+ goto err;
+
+ bc->table_init_done = true;
+
+ bch2_recalc_btree_reserve(c);
+
+ for (i = 0; i < bc->reserve; i++)
+ if (!__bch2_btree_node_mem_alloc(c))
+ goto err;
+
+ list_splice_init(&bc->live, &bc->freeable);
+
+ mutex_init(&c->verify_lock);
+
+ shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
+ if (!shrink)
+ goto err;
+ bc->shrink = shrink;
+ shrink->count_objects = bch2_btree_cache_count;
+ shrink->scan_objects = bch2_btree_cache_scan;
+ shrink->seeks = 4;
+ shrink->private_data = c;
+ shrinker_register(shrink);
+
+ return 0;
+err:
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+}
+
+void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
+{
+ mutex_init(&bc->lock);
+ INIT_LIST_HEAD(&bc->live);
+ INIT_LIST_HEAD(&bc->freeable);
+ INIT_LIST_HEAD(&bc->freed_pcpu);
+ INIT_LIST_HEAD(&bc->freed_nonpcpu);
+}
+
+/*
+ * We can only have one thread cannibalizing other cached btree nodes at a time,
+ * or we'll deadlock. We use an open coded mutex to ensure that, which
+ * bch2_btree_cache_cannibalize_lock() takes. This means every time we unlock
+ * the root of the btree, we need to release this lock if we have it held.
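+ *
+ * A sketch of the expected calling pattern (illustrative only, not a verbatim
+ * caller; assumes a struct bch_fs *c, btree_trans *trans and closure *cl in
+ * scope):
+ *
+ *	if (!bch2_btree_cache_cannibalize_lock(c, cl)) {
+ *		b = bch2_btree_node_mem_alloc(trans, false);
+ *		...
+ *		bch2_btree_cache_cannibalize_unlock(c);
+ *	}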
+ */
+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
+{
+ struct btree_cache *bc = &c->btree_cache;
+
+ if (bc->alloc_lock == current) {
+ trace_and_count(c, btree_cache_cannibalize_unlock, c);
+ bc->alloc_lock = NULL;
+ closure_wake_up(&bc->alloc_wait);
+ }
+}
+
+int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ struct task_struct *old;
+
+ old = cmpxchg(&bc->alloc_lock, NULL, current);
+ if (old == NULL || old == current)
+ goto success;
+
+ if (!cl) {
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
+ return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
+ }
+
+ closure_wait(&bc->alloc_wait, cl);
+
+ /* Try again, after adding ourselves to waitlist */
+ old = cmpxchg(&bc->alloc_lock, NULL, current);
+ if (old == NULL || old == current) {
+ /* We raced */
+ closure_wake_up(&bc->alloc_wait);
+ goto success;
+ }
+
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
+ return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
+
+success:
+ trace_and_count(c, btree_cache_cannibalize_lock, c);
+ return 0;
+}
+
+static struct btree *btree_node_cannibalize(struct bch_fs *c)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+
+ list_for_each_entry_reverse(b, &bc->live, list)
+ if (!btree_node_reclaim(c, b))
+ return b;
+
+ while (1) {
+ list_for_each_entry_reverse(b, &bc->live, list)
+ if (!btree_node_write_and_reclaim(c, b))
+ return b;
+
+ /*
+ * Rare case: all nodes were intent-locked.
+ * Just busy-wait.
+ */
+ WARN_ONCE(1, "btree cache cannibalize failed\n");
+ cond_resched();
+ }
+}
+
+struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_cache *bc = &c->btree_cache;
+ struct list_head *freed = pcpu_read_locks
+ ? &bc->freed_pcpu
+ : &bc->freed_nonpcpu;
+ struct btree *b, *b2;
+ u64 start_time = local_clock();
+ unsigned flags;
+
+ flags = memalloc_nofs_save();
+ mutex_lock(&bc->lock);
+
+ /*
+ * We never free struct btree itself, just the memory that holds the on
+ * disk node. Check the freed list before allocating a new one:
+ */
+ list_for_each_entry(b, freed, list)
+ if (!btree_node_reclaim(c, b)) {
+ list_del_init(&b->list);
+ goto got_node;
+ }
+
+ b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
+ if (!b) {
+ mutex_unlock(&bc->lock);
+ bch2_trans_unlock(trans);
+ b = __btree_node_mem_alloc(c, GFP_KERNEL);
+ if (!b)
+ goto err;
+ mutex_lock(&bc->lock);
+ }
+
+ bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
+
+ BUG_ON(!six_trylock_intent(&b->c.lock));
+ BUG_ON(!six_trylock_write(&b->c.lock));
+got_node:
+
+ /*
+ * btree_free() doesn't free memory; it sticks the node on the end of
+ * the list. Check if there's any freed nodes there:
+ */
+ list_for_each_entry(b2, &bc->freeable, list)
+ if (!btree_node_reclaim(c, b2)) {
+ swap(b->data, b2->data);
+ swap(b->aux_data, b2->aux_data);
+ btree_node_to_freedlist(bc, b2);
+ six_unlock_write(&b2->c.lock);
+ six_unlock_intent(&b2->c.lock);
+ goto got_mem;
+ }
+
+ mutex_unlock(&bc->lock);
+
+ if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
+ bch2_trans_unlock(trans);
+ if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
+ goto err;
+ }
+
+ mutex_lock(&bc->lock);
+ bc->used++;
+got_mem:
+ mutex_unlock(&bc->lock);
+
+ BUG_ON(btree_node_hashed(b));
+ BUG_ON(btree_node_dirty(b));
+ BUG_ON(btree_node_write_in_flight(b));
+out:
+ b->flags = 0;
+ b->written = 0;
+ b->nsets = 0;
+ b->sib_u64s[0] = 0;
+ b->sib_u64s[1] = 0;
+ b->whiteout_u64s = 0;
+ bch2_btree_keys_init(b);
+ set_btree_node_accessed(b);
+
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
+ start_time);
+
+ memalloc_nofs_restore(flags);
+ return b;
+err:
+ mutex_lock(&bc->lock);
+
+ /* Try to cannibalize another cached btree node: */
+ if (bc->alloc_lock == current) {
+ b2 = btree_node_cannibalize(c);
+ clear_btree_node_just_written(b2);
+ bch2_btree_node_hash_remove(bc, b2);
+
+ if (b) {
+ swap(b->data, b2->data);
+ swap(b->aux_data, b2->aux_data);
+ btree_node_to_freedlist(bc, b2);
+ six_unlock_write(&b2->c.lock);
+ six_unlock_intent(&b2->c.lock);
+ } else {
+ b = b2;
+ list_del_init(&b->list);
+ }
+
+ mutex_unlock(&bc->lock);
+
+ trace_and_count(c, btree_cache_cannibalize, c);
+ goto out;
+ }
+
+ mutex_unlock(&bc->lock);
+ memalloc_nofs_restore(flags);
+ return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
+}
+
+/* Slowpath, don't want it inlined into btree_iter_traverse() */
+static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
+ struct btree_path *path,
+ const struct bkey_i *k,
+ enum btree_id btree_id,
+ unsigned level,
+ enum six_lock_type lock_type,
+ bool sync)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+ u32 seq;
+
+ BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
+ /*
+ * Parent node must be locked, else we could read in a btree node that's
+ * been freed:
+ */
+ if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
+ trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
+ }
+
+ b = bch2_btree_node_mem_alloc(trans, level != 0);
+
+ if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
+ trans->memory_allocation_failure = true;
+ trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
+ }
+
+ if (IS_ERR(b))
+ return b;
+
+ /*
+ * Btree nodes read in from disk should not have the accessed bit set
+ * initially, so that linear scans don't thrash the cache:
+ */
+ clear_btree_node_accessed(b);
+
+ bkey_copy(&b->key, k);
+ if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
+ /* raced with another fill: */
+
+ /* mark as unhashed... */
+ b->hash_val = 0;
+
+ mutex_lock(&bc->lock);
+ list_add(&b->list, &bc->freeable);
+ mutex_unlock(&bc->lock);
+
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ return NULL;
+ }
+
+ set_btree_node_read_in_flight(b);
+
+ six_unlock_write(&b->c.lock);
+ seq = six_lock_seq(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+
+ /* Unlock before doing IO: */
+ if (path && sync)
+ bch2_trans_unlock_noassert(trans);
+
+ bch2_btree_node_read(c, b, sync);
+
+ if (!sync)
+ return NULL;
+
+ if (path) {
+ int ret = bch2_trans_relock(trans) ?:
+ bch2_btree_path_relock_intent(trans, path);
+ if (ret) {
+ BUG_ON(!trans->restarted);
+ return ERR_PTR(ret);
+ }
+ }
+
+ if (!six_relock_type(&b->c.lock, lock_type, seq)) {
+ if (path)
+ trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
+ }
+
+ return b;
+}
+
+static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
+{
+ struct printbuf buf = PRINTBUF;
+
+ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
+ return;
+
+ prt_printf(&buf,
+ "btree node header doesn't match ptr\n"
+ "btree %s level %u\n"
+ "ptr: ",
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ prt_printf(&buf, "\nheader: btree %s level %llu\n"
+ "min ",
+ bch2_btree_id_str(BTREE_NODE_ID(b->data)),
+ BTREE_NODE_LEVEL(b->data));
+ bch2_bpos_to_text(&buf, b->data->min_key);
+
+ prt_printf(&buf, "\nmax ");
+ bch2_bpos_to_text(&buf, b->data->max_key);
+
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+}
+
+static inline void btree_check_header(struct bch_fs *c, struct btree *b)
+{
+ if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
+ b->c.level != BTREE_NODE_LEVEL(b->data) ||
+ !bpos_eq(b->data->max_key, b->key.k.p) ||
+ (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ !bpos_eq(b->data->min_key,
+ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
+ btree_bad_header(c, b);
+}
+
+static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
+ const struct bkey_i *k, unsigned level,
+ enum six_lock_type lock_type,
+ unsigned long trace_ip)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+ struct bset_tree *t;
+ bool need_relock = false;
+ int ret;
+
+ EBUG_ON(level >= BTREE_MAX_DEPTH);
+retry:
+ b = btree_cache_find(bc, k);
+ if (unlikely(!b)) {
+ /*
+ * We must have the parent locked to call bch2_btree_node_fill(),
+ * else we could read in a btree node from disk that's been
+ * freed:
+ */
+ b = bch2_btree_node_fill(trans, path, k, path->btree_id,
+ level, lock_type, true);
+ need_relock = true;
+
+ /* We raced and found the btree node in the cache */
+ if (!b)
+ goto retry;
+
+ if (IS_ERR(b))
+ return b;
+ } else {
+ if (btree_node_read_locked(path, level + 1))
+ btree_node_unlock(trans, path, level + 1);
+
+ ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ERR_PTR(ret);
+
+ BUG_ON(ret);
+
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
+ b->c.level != level ||
+ race_fault())) {
+ six_unlock_type(&b->c.lock, lock_type);
+ if (bch2_btree_node_relock(trans, path, level + 1))
+ goto retry;
+
+ trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
+ }
+
+ /* avoid atomic set bit if it's not needed: */
+ if (!btree_node_accessed(b))
+ set_btree_node_accessed(b);
+ }
+
+ if (unlikely(btree_node_read_in_flight(b))) {
+ u32 seq = six_lock_seq(&b->c.lock);
+
+ six_unlock_type(&b->c.lock, lock_type);
+ bch2_trans_unlock(trans);
+ need_relock = true;
+
+ bch2_btree_node_wait_on_read(b);
+
+ /*
+ * should_be_locked is not set on this path yet, so we need to
+ * relock it specifically:
+ */
+ if (!six_relock_type(&b->c.lock, lock_type, seq))
+ goto retry;
+ }
+
+ if (unlikely(need_relock)) {
+ ret = bch2_trans_relock(trans) ?:
+ bch2_btree_path_relock_intent(trans, path);
+ if (ret) {
+ six_unlock_type(&b->c.lock, lock_type);
+ return ERR_PTR(ret);
+ }
+ }
+
+ prefetch(b->aux_data);
+
+ for_each_bset(b, t) {
+ void *p = (u64 *) b->aux_data + t->aux_data_offset;
+
+ prefetch(p + L1_CACHE_BYTES * 0);
+ prefetch(p + L1_CACHE_BYTES * 1);
+ prefetch(p + L1_CACHE_BYTES * 2);
+ }
+
+ if (unlikely(btree_node_read_error(b))) {
+ six_unlock_type(&b->c.lock, lock_type);
+ return ERR_PTR(-EIO);
+ }
+
+ EBUG_ON(b->c.btree_id != path->btree_id);
+ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+ btree_check_header(c, b);
+
+ return b;
+}
+
+/**
+ * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
+ * in from disk if necessary.
+ *
+ * @trans: btree transaction object
+ * @path: btree_path being traversed
+ * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
+ * @level: level of btree node being looked up (0 == leaf node)
+ * @lock_type: SIX_LOCK_read or SIX_LOCK_intent
+ * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
+ *
+ * The btree node will have either a read or a write lock held, depending on
+ * the @write parameter.
+ *
+ * Returns: btree node or ERR_PTR()
+ */
+struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
+ const struct bkey_i *k, unsigned level,
+ enum six_lock_type lock_type,
+ unsigned long trace_ip)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b;
+ struct bset_tree *t;
+ int ret;
+
+ EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+ b = btree_node_mem_ptr(k);
+
+ /*
+ * Check b->hash_val _before_ calling btree_node_lock() - this might not
+ * be the node we want anymore, and trying to lock the wrong node could
+ * cause an unnecessary transaction restart:
+ */
+ if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
+ !b ||
+ b->hash_val != btree_ptr_hash_val(k)))
+ return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
+
+ if (btree_node_read_locked(path, level + 1))
+ btree_node_unlock(trans, path, level + 1);
+
+ ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ERR_PTR(ret);
+
+ BUG_ON(ret);
+
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
+ b->c.level != level ||
+ race_fault())) {
+ six_unlock_type(&b->c.lock, lock_type);
+ if (bch2_btree_node_relock(trans, path, level + 1))
+ return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
+
+ trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
+ }
+
+ if (unlikely(btree_node_read_in_flight(b))) {
+ six_unlock_type(&b->c.lock, lock_type);
+ return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
+ }
+
+ prefetch(b->aux_data);
+
+ for_each_bset(b, t) {
+ void *p = (u64 *) b->aux_data + t->aux_data_offset;
+
+ prefetch(p + L1_CACHE_BYTES * 0);
+ prefetch(p + L1_CACHE_BYTES * 1);
+ prefetch(p + L1_CACHE_BYTES * 2);
+ }
+
+ /* avoid atomic set bit if it's not needed: */
+ if (!btree_node_accessed(b))
+ set_btree_node_accessed(b);
+
+ if (unlikely(btree_node_read_error(b))) {
+ six_unlock_type(&b->c.lock, lock_type);
+ return ERR_PTR(-EIO);
+ }
+
+ EBUG_ON(b->c.btree_id != path->btree_id);
+ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+ btree_check_header(c, b);
+
+ return b;
+}
+
+struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
+ const struct bkey_i *k,
+ enum btree_id btree_id,
+ unsigned level,
+ bool nofill)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+ struct bset_tree *t;
+ int ret;
+
+ EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+ if (c->opts.btree_node_mem_ptr_optimization) {
+ b = btree_node_mem_ptr(k);
+ if (b)
+ goto lock_node;
+ }
+retry:
+ b = btree_cache_find(bc, k);
+ if (unlikely(!b)) {
+ if (nofill)
+ goto out;
+
+ b = bch2_btree_node_fill(trans, NULL, k, btree_id,
+ level, SIX_LOCK_read, true);
+
+ /* We raced and found the btree node in the cache */
+ if (!b)
+ goto retry;
+
+ if (IS_ERR(b) &&
+ !bch2_btree_cache_cannibalize_lock(c, NULL))
+ goto retry;
+
+ if (IS_ERR(b))
+ goto out;
+ } else {
+lock_node:
+ ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ERR_PTR(ret);
+
+ BUG_ON(ret);
+
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
+ b->c.btree_id != btree_id ||
+ b->c.level != level)) {
+ six_unlock_read(&b->c.lock);
+ goto retry;
+ }
+ }
+
+ /* XXX: waiting on IO with btree locks held: */
+ __bch2_btree_node_wait_on_read(b);
+
+ prefetch(b->aux_data);
+
+ for_each_bset(b, t) {
+ void *p = (u64 *) b->aux_data + t->aux_data_offset;
+
+ prefetch(p + L1_CACHE_BYTES * 0);
+ prefetch(p + L1_CACHE_BYTES * 1);
+ prefetch(p + L1_CACHE_BYTES * 2);
+ }
+
+ /* avoid atomic set bit if it's not needed: */
+ if (!btree_node_accessed(b))
+ set_btree_node_accessed(b);
+
+ if (unlikely(btree_node_read_error(b))) {
+ six_unlock_read(&b->c.lock);
+ b = ERR_PTR(-EIO);
+ goto out;
+ }
+
+ EBUG_ON(b->c.btree_id != btree_id);
+ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+ btree_check_header(c, b);
+out:
+ bch2_btree_cache_cannibalize_unlock(c);
+ return b;
+}
+
+int bch2_btree_node_prefetch(struct btree_trans *trans,
+ struct btree_path *path,
+ const struct bkey_i *k,
+ enum btree_id btree_id, unsigned level)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+
+ BUG_ON(trans && !btree_node_locked(path, level + 1));
+ BUG_ON(level >= BTREE_MAX_DEPTH);
+
+ b = btree_cache_find(bc, k);
+ if (b)
+ return 0;
+
+ b = bch2_btree_node_fill(trans, path, k, btree_id,
+ level, SIX_LOCK_read, false);
+ return PTR_ERR_OR_ZERO(b);
+}
+
+void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+
+ b = btree_cache_find(bc, k);
+ if (!b)
+ return;
+wait_on_io:
+ /* not allowed to wait on io with btree locks held: */
+
+ /* XXX we're called from btree_gc which will be holding other btree
+ * nodes locked
+ */
+ __bch2_btree_node_wait_on_read(b);
+ __bch2_btree_node_wait_on_write(b);
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+
+ if (btree_node_dirty(b)) {
+ __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ goto wait_on_io;
+ }
+
+ BUG_ON(btree_node_dirty(b));
+
+ mutex_lock(&bc->lock);
+ btree_node_data_free(c, b);
+ bch2_btree_node_hash_remove(bc, b);
+ mutex_unlock(&bc->lock);
+
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+}
+
+const char *bch2_btree_id_str(enum btree_id btree)
+{
+ return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
+}
+
+void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
+{
+ prt_printf(out, "%s level %u/%u\n ",
+ bch2_btree_id_str(b->c.btree_id),
+ b->c.level,
+ bch2_btree_id_root(c, b->c.btree_id)->level);
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+}
+
+void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
+{
+ struct bset_stats stats;
+
+ memset(&stats, 0, sizeof(stats));
+
+ bch2_btree_keys_stats(b, &stats);
+
+ prt_printf(out, "l %u ", b->c.level);
+ bch2_bpos_to_text(out, b->data->min_key);
+ prt_printf(out, " - ");
+ bch2_bpos_to_text(out, b->data->max_key);
+ prt_printf(out, ":\n"
+ " ptrs: ");
+ bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+ prt_newline(out);
+
+ prt_printf(out,
+ " format: ");
+ bch2_bkey_format_to_text(out, &b->format);
+
+ prt_printf(out,
+ " unpack fn len: %u\n"
+ " bytes used %zu/%zu (%zu%% full)\n"
+ " sib u64s: %u, %u (merge threshold %u)\n"
+ " nr packed keys %u\n"
+ " nr unpacked keys %u\n"
+ " floats %zu\n"
+ " failed unpacked %zu\n",
+ b->unpack_fn_len,
+ b->nr.live_u64s * sizeof(u64),
+ btree_bytes(c) - sizeof(struct btree_node),
+ b->nr.live_u64s * 100 / btree_max_u64s(c),
+ b->sib_u64s[0],
+ b->sib_u64s[1],
+ c->btree_foreground_merge_threshold,
+ b->nr.packed_keys,
+ b->nr.unpacked_keys,
+ stats.floats,
+ stats.failed);
+}
+
+void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c)
+{
+ prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
+ prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
+ prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
+}
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
new file mode 100644
index 000000000000..cfb80b201d61
--- /dev/null
+++ b/fs/bcachefs/btree_cache.h
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_CACHE_H
+#define _BCACHEFS_BTREE_CACHE_H
+
+#include "bcachefs.h"
+#include "btree_types.h"
+#include "bkey_methods.h"
+
+extern const char * const bch2_btree_node_flags[];
+
+struct btree_iter;
+
+void bch2_recalc_btree_reserve(struct bch_fs *);
+
+void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
+int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
+int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
+ unsigned, enum btree_id);
+
+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
+int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
+
+struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
+struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
+
+struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
+ const struct bkey_i *, unsigned,
+ enum six_lock_type, unsigned long);
+
+struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *,
+ enum btree_id, unsigned, bool);
+
+int bch2_btree_node_prefetch(struct btree_trans *, struct btree_path *,
+ const struct bkey_i *, enum btree_id, unsigned);
+
+void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *);
+
+void bch2_fs_btree_cache_exit(struct bch_fs *);
+int bch2_fs_btree_cache_init(struct bch_fs *);
+void bch2_fs_btree_cache_init_early(struct btree_cache *);
+
+static inline u64 btree_ptr_hash_val(const struct bkey_i *k)
+{
+ switch (k->k.type) {
+ case KEY_TYPE_btree_ptr:
+ return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start);
+ case KEY_TYPE_btree_ptr_v2:
+ /*
+ * The cast/deref is only necessary to avoid sparse endianness
+ * warnings:
+ */
+ return *((u64 *) &bkey_i_to_btree_ptr_v2_c(k)->v.seq);
+ default:
+ return 0;
+ }
+}
+
+static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k)
+{
+ return k->k.type == KEY_TYPE_btree_ptr_v2
+ ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr
+ : NULL;
+}
+
+/* is btree node in hash table? */
+static inline bool btree_node_hashed(struct btree *b)
+{
+ return b->hash_val != 0;
+}
+
+#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \
+ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \
+ &(_c)->btree_cache.table), \
+ _iter = 0; _iter < (_tbl)->size; _iter++) \
+ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
+
+static inline size_t btree_bytes(struct bch_fs *c)
+{
+ return c->opts.btree_node_size;
+}
+
+static inline size_t btree_max_u64s(struct bch_fs *c)
+{
+ return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
+}
+
+static inline size_t btree_pages(struct bch_fs *c)
+{
+ return btree_bytes(c) / PAGE_SIZE;
+}
+
+static inline unsigned btree_blocks(struct bch_fs *c)
+{
+ return btree_sectors(c) >> c->block_bits;
+}
+
+#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3)
+
+#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3)
+#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \
+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \
+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2))
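+
+/*
+ * For a purely illustrative btree_max_u64s(c) of 30000, the split threshold is
+ * 20000 u64s, the foreground merge threshold is 10000, and the hysteresis is
+ * 10000 + (10000 >> 2) = 12500.
+ */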
+
+static inline unsigned btree_id_nr_alive(struct bch_fs *c)
+{
+ return BTREE_ID_NR + c->btree_roots_extra.nr;
+}
+
+static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned id)
+{
+ if (likely(id < BTREE_ID_NR)) {
+ return &c->btree_roots_known[id];
+ } else {
+ unsigned idx = id - BTREE_ID_NR;
+
+ EBUG_ON(idx >= c->btree_roots_extra.nr);
+ return &c->btree_roots_extra.data[idx];
+ }
+}
+
+static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b)
+{
+ return bch2_btree_id_root(c, b->c.btree_id)->b;
+}
+
+const char *bch2_btree_id_str(enum btree_id);
+void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
+void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
+void bch2_btree_cache_to_text(struct printbuf *, const struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
new file mode 100644
index 000000000000..30ab78a24517
--- /dev/null
+++ b/fs/bcachefs/btree_gc.c
@@ -0,0 +1,2146 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright (C) 2014 Datera Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "bkey_methods.h"
+#include "bkey_buf.h"
+#include "btree_journal_iter.h"
+#include "btree_key_cache.h"
+#include "btree_locking.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "clock.h"
+#include "debug.h"
+#include "ec.h"
+#include "error.h"
+#include "extents.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "recovery.h"
+#include "reflink.h"
+#include "replicas.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+#include <linux/sched/task.h>
+
+#define DROP_THIS_NODE 10
+#define DROP_PREV_NODE 11
+
+static bool should_restart_for_topology_repair(struct bch_fs *c)
+{
+ return c->opts.fix_errors != FSCK_FIX_no &&
+ !(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology));
+}
+
+static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
+{
+ preempt_disable();
+ write_seqcount_begin(&c->gc_pos_lock);
+ c->gc_pos = new_pos;
+ write_seqcount_end(&c->gc_pos_lock);
+ preempt_enable();
+}
+
+static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
+{
+ BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
+ __gc_pos_set(c, new_pos);
+}
+
+/*
+ * Missing: if an interior btree node is empty, we need to do something -
+ * perhaps just kill it
+ */
+static int bch2_gc_check_topology(struct bch_fs *c,
+ struct btree *b,
+ struct bkey_buf *prev,
+ struct bkey_buf cur,
+ bool is_last)
+{
+ struct bpos node_start = b->data->min_key;
+ struct bpos node_end = b->data->max_key;
+ struct bpos expected_start = bkey_deleted(&prev->k->k)
+ ? node_start
+ : bpos_successor(prev->k->k.p);
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ int ret = 0;
+
+ if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
+
+ if (!bpos_eq(expected_start, bp->v.min_key)) {
+ bch2_topology_error(c);
+
+ if (bkey_deleted(&prev->k->k)) {
+ prt_printf(&buf1, "start of node: ");
+ bch2_bpos_to_text(&buf1, node_start);
+ } else {
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k));
+ }
+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));
+
+ if (__fsck_err(c,
+ FSCK_CAN_FIX|
+ FSCK_CAN_IGNORE|
+ FSCK_NO_RATELIMIT,
+ btree_node_topology_bad_min_key,
+ "btree node with incorrect min_key at btree %s level %u:\n"
+ " prev %s\n"
+ " cur %s",
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
+ buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) {
+ bch_info(c, "Halting mark and sweep to start topology repair pass");
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
+ goto err;
+ } else {
+ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+ }
+ }
+ }
+
+ if (is_last && !bpos_eq(cur.k->k.p, node_end)) {
+ bch2_topology_error(c);
+
+ printbuf_reset(&buf1);
+ printbuf_reset(&buf2);
+
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
+ bch2_bpos_to_text(&buf2, node_end);
+
+ if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT,
+ btree_node_topology_bad_max_key,
+ "btree node with incorrect max_key at btree %s level %u:\n"
+ " %s\n"
+ " expected %s",
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
+ buf1.buf, buf2.buf) &&
+ should_restart_for_topology_repair(c)) {
+ bch_info(c, "Halting mark and sweep to start topology repair pass");
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
+ goto err;
+ } else {
+ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+ }
+ }
+
+ bch2_bkey_buf_copy(prev, c, cur.k);
+err:
+fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ return ret;
+}
+
+static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
+{
+ switch (b->key.k.type) {
+ case KEY_TYPE_btree_ptr: {
+ struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);
+
+ dst->k.p = src->k.p;
+ dst->v.mem_ptr = 0;
+ dst->v.seq = b->data->keys.seq;
+ dst->v.sectors_written = 0;
+ dst->v.flags = 0;
+ dst->v.min_key = b->data->min_key;
+ set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
+ memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
+ break;
+ }
+ case KEY_TYPE_btree_ptr_v2:
+ bkey_copy(&dst->k_i, &b->key);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static void bch2_btree_node_update_key_early(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b;
+ struct bkey_buf tmp;
+ int ret;
+
+ bch2_bkey_buf_init(&tmp);
+ bch2_bkey_buf_reassemble(&tmp, c, old);
+
+ b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
+ if (!IS_ERR_OR_NULL(b)) {
+ mutex_lock(&c->btree_cache.lock);
+
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ bkey_copy(&b->key, new);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+ BUG_ON(ret);
+
+ mutex_unlock(&c->btree_cache.lock);
+ six_unlock_read(&b->c.lock);
+ }
+
+ bch2_bkey_buf_exit(&tmp, c);
+}
+
+static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
+{
+ struct bkey_i_btree_ptr_v2 *new;
+ int ret;
+
+ new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
+ if (!new)
+ return -BCH_ERR_ENOMEM_gc_repair_key;
+
+ btree_ptr_to_v2(b, new);
+ b->data->min_key = new_min;
+ new->v.min_key = new_min;
+ SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
+
+ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
+ if (ret) {
+ kfree(new);
+ return ret;
+ }
+
+ bch2_btree_node_drop_keys_outside_node(b);
+ bkey_copy(&b->key, &new->k_i);
+ return 0;
+}
+
+static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
+{
+ struct bkey_i_btree_ptr_v2 *new;
+ int ret;
+
+ ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
+ if (ret)
+ return ret;
+
+ new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
+ if (!new)
+ return -BCH_ERR_ENOMEM_gc_repair_key;
+
+ btree_ptr_to_v2(b, new);
+ b->data->max_key = new_max;
+ new->k.p = new_max;
+ SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
+
+ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
+ if (ret) {
+ kfree(new);
+ return ret;
+ }
+
+ bch2_btree_node_drop_keys_outside_node(b);
+
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ bkey_copy(&b->key, &new->k_i);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+ BUG_ON(ret);
+ mutex_unlock(&c->btree_cache.lock);
+ return 0;
+}
+
+static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
+ struct btree *prev, struct btree *cur)
+{
+ struct bpos expected_start = !prev
+ ? b->data->min_key
+ : bpos_successor(prev->key.k.p);
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ int ret = 0;
+
+ if (!prev) {
+ prt_printf(&buf1, "start of node: ");
+ bch2_bpos_to_text(&buf1, b->data->min_key);
+ } else {
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
+ }
+
+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key));
+
+ if (prev &&
+ bpos_gt(expected_start, cur->data->min_key) &&
+ BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {
+ /* cur overwrites prev: */
+
+ if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key,
+ cur->data->min_key), c,
+ btree_node_topology_overwritten_by_next_node,
+ "btree node overwritten by next node at btree %s level %u:\n"
+ " node %s\n"
+ " next %s",
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
+ buf1.buf, buf2.buf)) {
+ ret = DROP_PREV_NODE;
+ goto out;
+ }
+
+ if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p,
+ bpos_predecessor(cur->data->min_key)), c,
+ btree_node_topology_bad_max_key,
+ "btree node with incorrect max_key at btree %s level %u:\n"
+ " node %s\n"
+ " next %s",
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
+ buf1.buf, buf2.buf))
+ ret = set_node_max(c, prev,
+ bpos_predecessor(cur->data->min_key));
+ } else {
+ /* prev overwrites cur: */
+
+ if (mustfix_fsck_err_on(bpos_ge(expected_start,
+ cur->data->max_key), c,
+ btree_node_topology_overwritten_by_prev_node,
+ "btree node overwritten by prev node at btree %s level %u:\n"
+ " prev %s\n"
+ " node %s",
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
+ buf1.buf, buf2.buf)) {
+ ret = DROP_THIS_NODE;
+ goto out;
+ }
+
+ if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
+ btree_node_topology_bad_min_key,
+ "btree node with incorrect min_key at btree %s level %u:\n"
+ " prev %s\n"
+ " node %s",
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
+ buf1.buf, buf2.buf))
+ ret = set_node_min(c, cur, expected_start);
+ }
+out:
+fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ return ret;
+}
+
+static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
+ struct btree *child)
+{
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+ int ret = 0;
+
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key));
+ bch2_bpos_to_text(&buf2, b->key.k.p);
+
+ if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c,
+ btree_node_topology_bad_max_key,
+ "btree node with incorrect max_key at btree %s level %u:\n"
+ " %s\n"
+ " expected %s",
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
+ buf1.buf, buf2.buf)) {
+ ret = set_node_max(c, child, b->key.k.p);
+ if (ret)
+ goto err;
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ return ret;
+}
+
+static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_and_journal_iter iter;
+ struct bkey_s_c k;
+ struct bkey_buf prev_k, cur_k;
+ struct btree *prev = NULL, *cur = NULL;
+ bool have_child, dropped_children = false;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (!b->c.level)
+ return 0;
+again:
+ prev = NULL;
+ have_child = dropped_children = false;
+ bch2_bkey_buf_init(&prev_k);
+ bch2_bkey_buf_init(&cur_k);
+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ BUG_ON(bpos_lt(k.k->p, b->data->min_key));
+ BUG_ON(bpos_gt(k.k->p, b->data->max_key));
+
+ bch2_btree_and_journal_iter_advance(&iter);
+ bch2_bkey_buf_reassemble(&cur_k, c, k);
+
+ cur = bch2_btree_node_get_noiter(trans, cur_k.k,
+ b->c.btree_id, b->c.level - 1,
+ false);
+ ret = PTR_ERR_OR_ZERO(cur);
+
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
+
+ if (mustfix_fsck_err_on(ret == -EIO, c,
+ btree_node_unreadable,
+ "Topology repair: unreadable btree node at btree %s level %u:\n"
+ " %s",
+ bch2_btree_id_str(b->c.btree_id),
+ b->c.level - 1,
+ buf.buf)) {
+ bch2_btree_node_evict(trans, cur_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
+ cur = NULL;
+ if (ret)
+ break;
+ continue;
+ }
+
+ if (ret) {
+ bch_err_msg(c, ret, "getting btree node");
+ break;
+ }
+
+ ret = btree_repair_node_boundaries(c, b, prev, cur);
+
+ if (ret == DROP_THIS_NODE) {
+ six_unlock_read(&cur->c.lock);
+ bch2_btree_node_evict(trans, cur_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
+ cur = NULL;
+ if (ret)
+ break;
+ continue;
+ }
+
+ if (prev)
+ six_unlock_read(&prev->c.lock);
+ prev = NULL;
+
+ if (ret == DROP_PREV_NODE) {
+ bch2_btree_node_evict(trans, prev_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, prev_k.k->k.p);
+ if (ret)
+ break;
+
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_bkey_buf_exit(&prev_k, c);
+ bch2_bkey_buf_exit(&cur_k, c);
+ goto again;
+ } else if (ret)
+ break;
+
+ prev = cur;
+ cur = NULL;
+ bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
+ }
+
+ if (!ret && !IS_ERR_OR_NULL(prev)) {
+ BUG_ON(cur);
+ ret = btree_repair_node_end(c, b, prev);
+ }
+
+ if (!IS_ERR_OR_NULL(prev))
+ six_unlock_read(&prev->c.lock);
+ prev = NULL;
+ if (!IS_ERR_OR_NULL(cur))
+ six_unlock_read(&cur->c.lock);
+ cur = NULL;
+
+ if (ret)
+ goto err;
+
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ bch2_bkey_buf_reassemble(&cur_k, c, k);
+ bch2_btree_and_journal_iter_advance(&iter);
+
+ cur = bch2_btree_node_get_noiter(trans, cur_k.k,
+ b->c.btree_id, b->c.level - 1,
+ false);
+ ret = PTR_ERR_OR_ZERO(cur);
+
+ if (ret) {
+ bch_err_msg(c, ret, "getting btree node");
+ goto err;
+ }
+
+ ret = bch2_btree_repair_topology_recurse(trans, cur);
+ six_unlock_read(&cur->c.lock);
+ cur = NULL;
+
+ if (ret == DROP_THIS_NODE) {
+ bch2_btree_node_evict(trans, cur_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
+ dropped_children = true;
+ }
+
+ if (ret)
+ goto err;
+
+ have_child = true;
+ }
+
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+ if (mustfix_fsck_err_on(!have_child, c,
+ btree_node_topology_interior_node_empty,
+ "empty interior btree node at btree %s level %u\n"
+ " %s",
+ bch2_btree_id_str(b->c.btree_id),
+ b->c.level, buf.buf))
+ ret = DROP_THIS_NODE;
+err:
+fsck_err:
+ if (!IS_ERR_OR_NULL(prev))
+ six_unlock_read(&prev->c.lock);
+ if (!IS_ERR_OR_NULL(cur))
+ six_unlock_read(&cur->c.lock);
+
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_bkey_buf_exit(&prev_k, c);
+ bch2_bkey_buf_exit(&cur_k, c);
+
+ if (!ret && dropped_children)
+ goto again;
+
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_check_topology(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree *b;
+ unsigned i;
+ int ret = 0;
+
+ for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
+ struct btree_root *r = bch2_btree_id_root(c, i);
+
+ if (!r->alive)
+ continue;
+
+ b = r->b;
+ if (btree_node_fake(b))
+ continue;
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+ ret = bch2_btree_repair_topology_recurse(trans, b);
+ six_unlock_read(&b->c.lock);
+
+ if (ret == DROP_THIS_NODE) {
+ bch_err(c, "empty btree root - repair unimplemented");
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ }
+ }
+
+ bch2_trans_put(trans);
+
+ return ret;
+}
+
+static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id,
+ unsigned level, bool is_root,
+ struct bkey_s_c *k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k);
+ const union bch_extent_entry *entry_c;
+ struct extent_ptr_decoded p = { 0 };
+ bool do_update = false;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ /*
+ * XXX
+ * use check_bucket_ref here
+ */
+ bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);
+
+ if (!g->gen_valid &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, ptr_to_missing_alloc_key,
+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
+ if (!p.ptr.cached) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ } else {
+ do_update = true;
+ }
+ }
+
+ if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
+ if (!p.ptr.cached) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ g->data_type = 0;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
+ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ } else {
+ do_update = true;
+ }
+ }
+
+ if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
+ do_update = true;
+
+ if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
+ (c->opts.reconstruct_alloc ||
+ fsck_err(c, stale_dirty_ptr,
+ "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
+ do_update = true;
+
+ if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
+ continue;
+
+ if (fsck_err_on(bucket_data_type(g->data_type) &&
+ bucket_data_type(g->data_type) != data_type, c,
+ ptr_bucket_data_type_mismatch,
+ "bucket %u:%zu different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[g->data_type],
+ bch2_data_types[data_type],
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
+ if (data_type == BCH_DATA_btree) {
+ g->data_type = data_type;
+ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ } else {
+ do_update = true;
+ }
+ }
+
+ if (p.has_ec) {
+ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
+
+ if (fsck_err_on(!m || !m->alive, c,
+ ptr_to_missing_stripe,
+ "pointer to nonexistent stripe %llu\n"
+ "while marking %s",
+ (u64) p.ec.idx,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
+ do_update = true;
+
+ if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
+ ptr_to_incorrect_stripe,
+ "pointer does not match stripe %llu\n"
+ "while marking %s",
+ (u64) p.ec.idx,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
+ do_update = true;
+ }
+ }
+
+ if (do_update) {
+ struct bkey_ptrs ptrs;
+ union bch_extent_entry *entry;
+ struct bch_extent_ptr *ptr;
+ struct bkey_i *new;
+
+ if (is_root) {
+ bch_err(c, "cannot update btree roots yet");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
+ if (!new) {
+			ret = -BCH_ERR_ENOMEM_gc_repair_key;
+			bch_err_msg(c, ret, "allocating new key");
+ goto err;
+ }
+
+ bkey_reassemble(new, *k);
+
+ if (level) {
+ /*
+ * We don't want to drop btree node pointers - if the
+ * btree node isn't there anymore, the read path will
+ * sort it out:
+ */
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bucket *g = PTR_GC_BUCKET(ca, ptr);
+
+ ptr->gen = g->gen;
+ }
+ } else {
+ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bucket *g = PTR_GC_BUCKET(ca, ptr);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);
+
+ (ptr->cached &&
+ (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) ||
+ (!ptr->cached &&
+ gen_cmp(ptr->gen, g->gen) < 0) ||
+ gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
+ (g->data_type &&
+ g->data_type != data_type);
+ }));
+again:
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_extent_entry_for_each(ptrs, entry) {
+ if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
+ struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
+ entry->stripe_ptr.idx);
+ union bch_extent_entry *next_ptr;
+
+ bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
+ if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
+ goto found;
+ next_ptr = NULL;
+found:
+ if (!next_ptr) {
+ bch_err(c, "aieee, found stripe ptr with no data ptr");
+ continue;
+ }
+
+ if (!m || !m->alive ||
+ !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
+ &next_ptr->ptr,
+ m->sectors)) {
+ bch2_bkey_extent_entry_drop(new, entry);
+ goto again;
+ }
+ }
+ }
+ }
+
+ ret = bch2_journal_key_insert_take(c, btree_id, level, new);
+ if (ret) {
+ kfree(new);
+ goto err;
+ }
+
+ if (level)
+ bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);
+
+ if (0) {
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, *k);
+ bch_info(c, "updated %s", buf.buf);
+
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
+ bch_info(c, "new key %s", buf.buf);
+ }
+
+ *k = bkey_i_to_s_c(new);
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/* marking of btree keys/nodes: */
+
+static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
+ unsigned level, bool is_root,
+ struct bkey_s_c *k,
+ bool initial)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey deleted = KEY(0, 0, 0);
+ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
+ unsigned flags =
+ BTREE_TRIGGER_GC|
+ (initial ? BTREE_TRIGGER_NOATOMIC : 0);
+ int ret = 0;
+
+ deleted.p = k->k->p;
+
+ if (initial) {
+ BUG_ON(bch2_journal_seq_verify &&
+ k->k->version.lo > atomic64_read(&c->journal.seq));
+
+ ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
+ bkey_version_in_future,
+ "key version number higher than recorded: %llu > %llu",
+ k->k->version.lo,
+ atomic64_read(&c->key_version)))
+ atomic64_set(&c->key_version, k->k->version.lo);
+ }
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_mark_key(trans, btree_id, level, old, *k, flags));
+fsck_err:
+err:
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_node_iter iter;
+ struct bkey unpacked;
+ struct bkey_s_c k;
+ struct bkey_buf prev, cur;
+ int ret = 0;
+
+ if (!btree_node_type_needs_gc(btree_node_type(b)))
+ return 0;
+
+ bch2_btree_node_iter_init_from_start(&iter, b);
+ bch2_bkey_buf_init(&prev);
+ bch2_bkey_buf_init(&cur);
+ bkey_init(&prev.k->k);
+
+ while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
+ &k, initial);
+ if (ret)
+ break;
+
+ bch2_btree_node_iter_advance(&iter, b);
+
+ if (b->c.level) {
+ bch2_bkey_buf_reassemble(&cur, c, k);
+
+ ret = bch2_gc_check_topology(c, b, &prev, cur,
+ bch2_btree_node_iter_end(&iter));
+ if (ret)
+ break;
+ }
+ }
+
+ bch2_bkey_buf_exit(&cur, c);
+ bch2_bkey_buf_exit(&prev, c);
+ return ret;
+}
+
+static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
+ bool initial, bool metadata_only)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct btree *b;
+ unsigned depth = metadata_only ? 1 : 0;
+ int ret = 0;
+
+ gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
+
+ __for_each_btree_node(trans, iter, btree_id, POS_MIN,
+ 0, depth, BTREE_ITER_PREFETCH, b, ret) {
+ bch2_verify_btree_nr_keys(b);
+
+ gc_pos_set(c, gc_pos_btree_node(b));
+
+ ret = btree_gc_mark_node(trans, b, initial);
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ return ret;
+
+ mutex_lock(&c->btree_root_lock);
+ b = bch2_btree_id_root(c, btree_id)->b;
+ if (!btree_node_fake(b)) {
+ struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
+ true, &k, initial);
+ }
+ gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
+ mutex_unlock(&c->btree_root_lock);
+
+ return ret;
+}
+
+static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b,
+ unsigned target_depth)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_and_journal_iter iter;
+ struct bkey_s_c k;
+ struct bkey_buf cur, prev;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+ bch2_bkey_buf_init(&prev);
+ bch2_bkey_buf_init(&cur);
+ bkey_init(&prev.k->k);
+
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ BUG_ON(bpos_lt(k.k->p, b->data->min_key));
+ BUG_ON(bpos_gt(k.k->p, b->data->max_key));
+
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
+ false, &k, true);
+ if (ret)
+ goto fsck_err;
+
+ if (b->c.level) {
+ bch2_bkey_buf_reassemble(&cur, c, k);
+ k = bkey_i_to_s_c(cur.k);
+
+ bch2_btree_and_journal_iter_advance(&iter);
+
+ ret = bch2_gc_check_topology(c, b,
+ &prev, cur,
+ !bch2_btree_and_journal_iter_peek(&iter).k);
+ if (ret)
+ goto fsck_err;
+ } else {
+ bch2_btree_and_journal_iter_advance(&iter);
+ }
+ }
+
+ if (b->c.level > target_depth) {
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ struct btree *child;
+
+ bch2_bkey_buf_reassemble(&cur, c, k);
+ bch2_btree_and_journal_iter_advance(&iter);
+
+ child = bch2_btree_node_get_noiter(trans, cur.k,
+ b->c.btree_id, b->c.level - 1,
+ false);
+ ret = PTR_ERR_OR_ZERO(child);
+
+ if (ret == -EIO) {
+ bch2_topology_error(c);
+
+ if (__fsck_err(c,
+ FSCK_CAN_FIX|
+ FSCK_CAN_IGNORE|
+ FSCK_NO_RATELIMIT,
+ btree_node_read_error,
+ "Unreadable btree node at btree %s level %u:\n"
+ " %s",
+ bch2_btree_id_str(b->c.btree_id),
+ b->c.level - 1,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) &&
+ should_restart_for_topology_repair(c)) {
+ bch_info(c, "Halting mark and sweep to start topology repair pass");
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
+ goto fsck_err;
+ } else {
+				/*
+				 * Continue marking when we've opted not to
+				 * fix the error:
+				 */
+ ret = 0;
+ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+ continue;
+ }
+ } else if (ret) {
+ bch_err_msg(c, ret, "getting btree node");
+ break;
+ }
+
+ ret = bch2_gc_btree_init_recurse(trans, child,
+ target_depth);
+ six_unlock_read(&child->c.lock);
+
+ if (ret)
+ break;
+ }
+ }
+fsck_err:
+ bch2_bkey_buf_exit(&cur, c);
+ bch2_bkey_buf_exit(&prev, c);
+ bch2_btree_and_journal_iter_exit(&iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int bch2_gc_btree_init(struct btree_trans *trans,
+ enum btree_id btree_id,
+ bool metadata_only)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b;
+ unsigned target_depth = metadata_only ? 1 : 0;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ b = bch2_btree_id_root(c, btree_id)->b;
+
+ if (btree_node_fake(b))
+ return 0;
+
+ six_lock_read(&b->c.lock, NULL, NULL);
+ printbuf_reset(&buf);
+ bch2_bpos_to_text(&buf, b->data->min_key);
+ if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c,
+ btree_root_bad_min_key,
+ "btree root with incorrect min_key: %s", buf.buf)) {
+ bch_err(c, "repair unimplemented");
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto fsck_err;
+ }
+
+ printbuf_reset(&buf);
+ bch2_bpos_to_text(&buf, b->data->max_key);
+ if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c,
+ btree_root_bad_max_key,
+ "btree root with incorrect max_key: %s", buf.buf)) {
+ bch_err(c, "repair unimplemented");
+ ret = -BCH_ERR_fsck_repair_unimplemented;
+ goto fsck_err;
+ }
+
+ if (b->c.level >= target_depth)
+ ret = bch2_gc_btree_init_recurse(trans, b, target_depth);
+
+ if (!ret) {
+ struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true,
+ &k, true);
+ }
+fsck_err:
+ six_unlock_read(&b->c.lock);
+
+ if (ret < 0)
+ bch_err_fn(c, ret);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
+{
+ return (int) btree_id_to_gc_phase(l) -
+ (int) btree_id_to_gc_phase(r);
+}
+
+static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ enum btree_id ids[BTREE_ID_NR];
+ unsigned i;
+ int ret = 0;
+
+ for (i = 0; i < BTREE_ID_NR; i++)
+ ids[i] = i;
+ bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
+
+ for (i = 0; i < BTREE_ID_NR && !ret; i++)
+ ret = initial
+ ? bch2_gc_btree_init(trans, ids[i], metadata_only)
+ : bch2_gc_btree(trans, ids[i], initial, metadata_only);
+
+ for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) {
+ if (!bch2_btree_id_root(c, i)->alive)
+ continue;
+
+ ret = initial
+ ? bch2_gc_btree_init(trans, i, metadata_only)
+ : bch2_gc_btree(trans, i, initial, metadata_only);
+ }
+
+ if (ret < 0)
+ bch_err_fn(c, ret);
+
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
+ u64 start, u64 end,
+ enum bch_data_type type,
+ unsigned flags)
+{
+ u64 b = sector_to_bucket(ca, start);
+
+ do {
+ unsigned sectors =
+ min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
+
+ bch2_mark_metadata_bucket(c, ca, b, type, sectors,
+ gc_phase(GC_PHASE_SB), flags);
+ b++;
+ start += sectors;
+ } while (start < end);
+}
+
+static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
+ unsigned flags)
+{
+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+ unsigned i;
+ u64 b;
+
+ for (i = 0; i < layout->nr_superblocks; i++) {
+ u64 offset = le64_to_cpu(layout->sb_offset[i]);
+
+ if (offset == BCH_SB_SECTOR)
+ mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
+ BCH_DATA_sb, flags);
+
+ mark_metadata_sectors(c, ca, offset,
+ offset + (1 << layout->sb_max_size_bits),
+ BCH_DATA_sb, flags);
+ }
+
+ for (i = 0; i < ca->journal.nr; i++) {
+ b = ca->journal.buckets[i];
+ bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
+ ca->mi.bucket_size,
+ gc_phase(GC_PHASE_SB), flags);
+ }
+}
+
+static void bch2_mark_superblocks(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ mutex_lock(&c->sb_lock);
+ gc_pos_set(c, gc_phase(GC_PHASE_SB));
+
+ for_each_online_member(ca, c, i)
+ bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC);
+ mutex_unlock(&c->sb_lock);
+}
+
+#if 0
+/* Also see bch2_pending_btree_node_free_insert_done() */
+static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
+{
+ struct btree_update *as;
+ struct pending_btree_node_free *d;
+
+ mutex_lock(&c->btree_interior_update_lock);
+ gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
+
+ for_each_pending_btree_node_free(c, as, d)
+ if (d->index_update_done)
+ bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC);
+
+ mutex_unlock(&c->btree_interior_update_lock);
+}
+#endif
+
+static void bch2_gc_free(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ genradix_free(&c->reflink_gc_table);
+ genradix_free(&c->gc_stripes);
+
+ for_each_member_device(ca, c, i) {
+ kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
+ sizeof(struct bucket_array) +
+ ca->mi.nbuckets * sizeof(struct bucket));
+ ca->buckets_gc = NULL;
+
+ free_percpu(ca->usage_gc);
+ ca->usage_gc = NULL;
+ }
+
+ free_percpu(c->usage_gc);
+ c->usage_gc = NULL;
+}
+
+static int bch2_gc_done(struct bch_fs *c,
+ bool initial, bool metadata_only)
+{
+ struct bch_dev *ca = NULL;
+ struct printbuf buf = PRINTBUF;
+ bool verify = !metadata_only &&
+ !c->opts.reconstruct_alloc &&
+ (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
+ unsigned i, dev;
+ int ret = 0;
+
+ percpu_down_write(&c->mark_lock);
+
+#define copy_field(_err, _f, _msg, ...) \
+ if (dst->_f != src->_f && \
+ (!verify || \
+ fsck_err(c, _err, _msg ": got %llu, should be %llu" \
+ , ##__VA_ARGS__, dst->_f, src->_f))) \
+ dst->_f = src->_f
+#define copy_dev_field(_err, _f, _msg, ...) \
+ copy_field(_err, _f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
+#define copy_fs_field(_err, _f, _msg, ...) \
+ copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__)
+
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ bch2_fs_usage_acc_to_base(c, i);
+
+ for_each_member_device(ca, c, dev) {
+ struct bch_dev_usage *dst = ca->usage_base;
+ struct bch_dev_usage *src = (void *)
+ bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
+ dev_usage_u64s());
+
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ copy_dev_field(dev_usage_buckets_wrong,
+ d[i].buckets, "%s buckets", bch2_data_types[i]);
+ copy_dev_field(dev_usage_sectors_wrong,
+ d[i].sectors, "%s sectors", bch2_data_types[i]);
+ copy_dev_field(dev_usage_fragmented_wrong,
+ d[i].fragmented, "%s fragmented", bch2_data_types[i]);
+ }
+
+ copy_dev_field(dev_usage_buckets_ec_wrong,
+ buckets_ec, "buckets_ec");
+ }
+
+ {
+ unsigned nr = fs_usage_u64s(c);
+ struct bch_fs_usage *dst = c->usage_base;
+ struct bch_fs_usage *src = (void *)
+ bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
+
+ copy_fs_field(fs_usage_hidden_wrong,
+ hidden, "hidden");
+ copy_fs_field(fs_usage_btree_wrong,
+ btree, "btree");
+
+ if (!metadata_only) {
+ copy_fs_field(fs_usage_data_wrong,
+ data, "data");
+ copy_fs_field(fs_usage_cached_wrong,
+ cached, "cached");
+ copy_fs_field(fs_usage_reserved_wrong,
+ reserved, "reserved");
+ copy_fs_field(fs_usage_nr_inodes_wrong,
+				      nr_inodes, "nr_inodes");
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++)
+ copy_fs_field(fs_usage_persistent_reserved_wrong,
+ persistent_reserved[i],
+ "persistent_reserved[%i]", i);
+ }
+
+ for (i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry *e =
+ cpu_replicas_entry(&c->replicas, i);
+
+ if (metadata_only &&
+ (e->data_type == BCH_DATA_user ||
+ e->data_type == BCH_DATA_cached))
+ continue;
+
+ printbuf_reset(&buf);
+ bch2_replicas_entry_to_text(&buf, e);
+
+ copy_fs_field(fs_usage_replicas_wrong,
+ replicas[i], "%s", buf.buf);
+ }
+ }
+
+#undef copy_fs_field
+#undef copy_dev_field
+#undef copy_stripe_field
+#undef copy_field
+fsck_err:
+ if (ca)
+ percpu_ref_put(&ca->ref);
+ if (ret)
+ bch_err_fn(c, ret);
+
+ percpu_up_write(&c->mark_lock);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int bch2_gc_start(struct bch_fs *c)
+{
+ struct bch_dev *ca = NULL;
+ unsigned i;
+
+ BUG_ON(c->usage_gc);
+
+ c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
+ sizeof(u64), GFP_KERNEL);
+ if (!c->usage_gc) {
+ bch_err(c, "error allocating c->usage_gc");
+ return -BCH_ERR_ENOMEM_gc_start;
+ }
+
+ for_each_member_device(ca, c, i) {
+ BUG_ON(ca->usage_gc);
+
+ ca->usage_gc = alloc_percpu(struct bch_dev_usage);
+ if (!ca->usage_gc) {
+ bch_err(c, "error allocating ca->usage_gc");
+ percpu_ref_put(&ca->ref);
+ return -BCH_ERR_ENOMEM_gc_start;
+ }
+
+ this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets,
+ ca->mi.nbuckets - ca->mi.first_bucket);
+ }
+
+ return 0;
+}
+
+static int bch2_gc_reset(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_member_device(ca, c, i) {
+ free_percpu(ca->usage_gc);
+ ca->usage_gc = NULL;
+ }
+
+ free_percpu(c->usage_gc);
+ c->usage_gc = NULL;
+
+ return bch2_gc_start(c);
+}
+
+/* returns true if not equal */
+static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
+ struct bch_alloc_v4 r)
+{
+ return l.gen != r.gen ||
+ l.oldest_gen != r.oldest_gen ||
+ l.data_type != r.data_type ||
+ l.dirty_sectors != r.dirty_sectors ||
+ l.cached_sectors != r.cached_sectors ||
+ l.stripe_redundancy != r.stripe_redundancy ||
+ l.stripe != r.stripe;
+}
+
+static int bch2_alloc_write_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ bool metadata_only)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
+ struct bucket gc, *b;
+ struct bkey_i_alloc_v4 *a;
+ struct bch_alloc_v4 old_convert, new;
+ const struct bch_alloc_v4 *old;
+ enum bch_data_type type;
+ int ret;
+
+ if (bkey_ge(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)))
+ return 1;
+
+ old = bch2_alloc_to_v4(k, &old_convert);
+ new = *old;
+
+ percpu_down_read(&c->mark_lock);
+ b = gc_bucket(ca, iter->pos.offset);
+
+ /*
+ * b->data_type doesn't yet include need_discard & need_gc_gen states -
+ * fix that here:
+ */
+ type = __alloc_data_type(b->dirty_sectors,
+ b->cached_sectors,
+ b->stripe,
+ *old,
+ b->data_type);
+ if (b->data_type != type) {
+ struct bch_dev_usage *u;
+
+ preempt_disable();
+ u = this_cpu_ptr(ca->usage_gc);
+ u->d[b->data_type].buckets--;
+ b->data_type = type;
+ u->d[b->data_type].buckets++;
+ preempt_enable();
+ }
+
+ gc = *b;
+ percpu_up_read(&c->mark_lock);
+
+ if (metadata_only &&
+ gc.data_type != BCH_DATA_sb &&
+ gc.data_type != BCH_DATA_journal &&
+ gc.data_type != BCH_DATA_btree)
+ return 0;
+
+ if (gen_after(old->gen, gc.gen))
+ return 0;
+
+ if (c->opts.reconstruct_alloc ||
+ fsck_err_on(new.data_type != gc.data_type, c,
+ alloc_key_data_type_wrong,
+ "bucket %llu:%llu gen %u has wrong data_type"
+ ": got %s, should be %s",
+ iter->pos.inode, iter->pos.offset,
+ gc.gen,
+ bch2_data_types[new.data_type],
+ bch2_data_types[gc.data_type]))
+ new.data_type = gc.data_type;
+
+#define copy_bucket_field(_errtype, _f) \
+ if (c->opts.reconstruct_alloc || \
+ fsck_err_on(new._f != gc._f, c, _errtype, \
+ "bucket %llu:%llu gen %u data type %s has wrong " #_f \
+ ": got %u, should be %u", \
+ iter->pos.inode, iter->pos.offset, \
+ gc.gen, \
+ bch2_data_types[gc.data_type], \
+ new._f, gc._f)) \
+		new._f = gc._f
+
+ copy_bucket_field(alloc_key_gen_wrong,
+ gen);
+ copy_bucket_field(alloc_key_dirty_sectors_wrong,
+ dirty_sectors);
+ copy_bucket_field(alloc_key_cached_sectors_wrong,
+ cached_sectors);
+ copy_bucket_field(alloc_key_stripe_wrong,
+ stripe);
+ copy_bucket_field(alloc_key_stripe_redundancy_wrong,
+ stripe_redundancy);
+#undef copy_bucket_field
+
+ if (!bch2_alloc_v4_cmp(*old, new))
+ return 0;
+
+ a = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ret;
+
+ a->v = new;
+
+ /*
+ * The trigger normally makes sure this is set, but we're not running
+ * triggers:
+ */
+ if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
+ a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+
+ ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN);
+fsck_err:
+ return ret;
+}
+
+static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_dev *ca;
+ unsigned i;
+ int ret = 0;
+
+ for_each_member_device(ca, c, i) {
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, ca->mi.first_bucket),
+ BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW,
+ bch2_alloc_write_key(trans, &iter, k, metadata_only));
+
+ if (ret < 0) {
+ bch_err_fn(c, ret);
+ percpu_ref_put(&ca->ref);
+ break;
+ }
+ }
+
+ bch2_trans_put(trans);
+ return ret < 0 ? ret : 0;
+}
+
+static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
+{
+ struct bch_dev *ca;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bucket *g;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ unsigned i;
+ int ret;
+
+ for_each_member_device(ca, c, i) {
+ struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
+ ca->mi.nbuckets * sizeof(struct bucket),
+ GFP_KERNEL|__GFP_ZERO);
+ if (!buckets) {
+ percpu_ref_put(&ca->ref);
+ bch_err(c, "error allocating ca->buckets[gc]");
+ ret = -BCH_ERR_ENOMEM_gc_alloc_start;
+ goto err;
+ }
+
+ buckets->first_bucket = ca->mi.first_bucket;
+ buckets->nbuckets = ca->mi.nbuckets;
+ rcu_assign_pointer(ca->buckets_gc, buckets);
+ }
+
+ ret = for_each_btree_key2(trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ({
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ g = gc_bucket(ca, k.k->p.offset);
+
+ a = bch2_alloc_to_v4(k, &a_convert);
+
+ g->gen_valid = 1;
+ g->gen = a->gen;
+
+ if (metadata_only &&
+ (a->data_type == BCH_DATA_user ||
+ a->data_type == BCH_DATA_cached ||
+ a->data_type == BCH_DATA_parity)) {
+ g->data_type = a->data_type;
+ g->dirty_sectors = a->dirty_sectors;
+ g->cached_sectors = a->cached_sectors;
+ g->stripe = a->stripe;
+ g->stripe_redundancy = a->stripe_redundancy;
+ }
+
+ 0;
+ }));
+err:
+ bch2_trans_put(trans);
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_member_device(ca, c, i) {
+ struct bucket_array *buckets = gc_bucket_array(ca);
+ struct bucket *g;
+
+ for_each_bucket(g, buckets) {
+ if (metadata_only &&
+ (g->data_type == BCH_DATA_user ||
+ g->data_type == BCH_DATA_cached ||
+ g->data_type == BCH_DATA_parity))
+ continue;
+ g->data_type = 0;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
+ }
+ }
+}
+
+static int bch2_gc_write_reflink_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ size_t *idx)
+{
+ struct bch_fs *c = trans->c;
+ const __le64 *refcount = bkey_refcount_c(k);
+ struct printbuf buf = PRINTBUF;
+ struct reflink_gc *r;
+ int ret = 0;
+
+ if (!refcount)
+ return 0;
+
+ while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
+ r->offset < k.k->p.offset)
+ ++*idx;
+
+ if (!r ||
+ r->offset != k.k->p.offset ||
+ r->size != k.k->size) {
+ bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
+ return -EINVAL;
+ }
+
+ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
+ reflink_v_refcount_wrong,
+ "reflink key has wrong refcount:\n"
+ " %s\n"
+ " should be %u",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
+ r->refcount)) {
+ struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0);
+
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ if (!r->refcount)
+ new->k.type = KEY_TYPE_deleted;
+ else
+ *bkey_refcount(new) = cpu_to_le64(r->refcount);
+ }
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
+{
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ size_t idx = 0;
+ int ret = 0;
+
+ if (metadata_only)
+ return 0;
+
+ trans = bch2_trans_get(c);
+
+ ret = for_each_btree_key_commit(trans, iter,
+ BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_gc_write_reflink_key(trans, &iter, k, &idx));
+
+ c->reflink_gc_nr = 0;
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int bch2_gc_reflink_start(struct bch_fs *c,
+ bool metadata_only)
+{
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct reflink_gc *r;
+ int ret = 0;
+
+ if (metadata_only)
+ return 0;
+
+ trans = bch2_trans_get(c);
+ c->reflink_gc_nr = 0;
+
+ for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ const __le64 *refcount = bkey_refcount_c(k);
+
+ if (!refcount)
+ continue;
+
+ r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
+ GFP_KERNEL);
+ if (!r) {
+ ret = -BCH_ERR_ENOMEM_gc_reflink_start;
+ break;
+ }
+
+ r->offset = k.k->p.offset;
+ r->size = k.k->size;
+ r->refcount = 0;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only)
+{
+ struct genradix_iter iter;
+ struct reflink_gc *r;
+
+ genradix_for_each(&c->reflink_gc_table, iter, r)
+ r->refcount = 0;
+}
+
+static int bch2_gc_write_stripes_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ const struct bch_stripe *s;
+ struct gc_stripe *m;
+ bool bad = false;
+ unsigned i;
+ int ret = 0;
+
+ if (k.k->type != KEY_TYPE_stripe)
+ return 0;
+
+ s = bkey_s_c_to_stripe(k).v;
+ m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
+
+ for (i = 0; i < s->nr_blocks; i++) {
+ u32 old = stripe_blockcount_get(s, i);
+ u32 new = (m ? m->block_sectors[i] : 0);
+
+ if (old != new) {
+ prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
+ i, old, new);
+ bad = true;
+ }
+ }
+
+ if (bad)
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ if (fsck_err_on(bad, c, stripe_sector_count_wrong,
+ "%s", buf.buf)) {
+ struct bkey_i_stripe *new;
+
+ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(&new->k_i, k);
+
+ for (i = 0; i < new->v.nr_blocks; i++)
+ stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
+
+ ret = bch2_trans_update(trans, iter, &new->k_i, 0);
+ }
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
+{
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ if (metadata_only)
+ return 0;
+
+ trans = bch2_trans_get(c);
+
+ ret = for_each_btree_key_commit(trans, iter,
+ BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_gc_write_stripes_key(trans, &iter, k));
+
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
+{
+ genradix_free(&c->gc_stripes);
+}
+
+/**
+ * bch2_gc - walk _all_ references to buckets, and recompute them:
+ *
+ * @c: filesystem object
+ * @initial: are we in recovery?
+ * @metadata_only: are we just checking metadata references, or everything?
+ *
+ * Returns: 0 on success, or standard errcode on failure
+ *
+ * Order matters here:
+ * - Concurrent GC relies on the fact that we have a total ordering for
+ * everything that GC walks - see gc_will_visit_node(),
+ * gc_will_visit_root()
+ *
+ * - also, references move around in the course of index updates and
+ * various other crap: everything needs to agree on the ordering
+ * references are allowed to move around in - e.g., we're allowed to
+ * start with a reference owned by an open_bucket (the allocator) and
+ * move it to the btree, but not the reverse.
+ *
+ * This is necessary to ensure that gc doesn't miss references that
+ * move around - if references move backwards in the ordering GC
+ * uses, GC could skip past them
+ */
+int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
+{
+ unsigned iter = 0;
+ int ret;
+
+ lockdep_assert_held(&c->state_lock);
+
+ down_write(&c->gc_lock);
+
+ bch2_btree_interior_updates_flush(c);
+
+ ret = bch2_gc_start(c) ?:
+ bch2_gc_alloc_start(c, metadata_only) ?:
+ bch2_gc_reflink_start(c, metadata_only);
+ if (ret)
+ goto out;
+again:
+ gc_pos_set(c, gc_phase(GC_PHASE_START));
+
+ bch2_mark_superblocks(c);
+
+ ret = bch2_gc_btrees(c, initial, metadata_only);
+
+ if (ret)
+ goto out;
+
+#if 0
+ bch2_mark_pending_btree_node_frees(c);
+#endif
+ c->gc_count++;
+
+ if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
+ (!iter && bch2_test_restart_gc)) {
+ if (iter++ > 2) {
+ bch_info(c, "Unable to fix bucket gens, looping");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * XXX: make sure gens we fixed got saved
+ */
+ bch_info(c, "Second GC pass needed, restarting:");
+ clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
+
+ bch2_gc_stripes_reset(c, metadata_only);
+ bch2_gc_alloc_reset(c, metadata_only);
+ bch2_gc_reflink_reset(c, metadata_only);
+ ret = bch2_gc_reset(c);
+ if (ret)
+ goto out;
+
+ /* flush fsck errors, reset counters */
+ bch2_flush_fsck_errs(c);
+ goto again;
+ }
+out:
+ if (!ret) {
+ bch2_journal_block(&c->journal);
+
+ ret = bch2_gc_stripes_done(c, metadata_only) ?:
+ bch2_gc_reflink_done(c, metadata_only) ?:
+ bch2_gc_alloc_done(c, metadata_only) ?:
+ bch2_gc_done(c, initial, metadata_only);
+
+ bch2_journal_unblock(&c->journal);
+ }
+
+ percpu_down_write(&c->mark_lock);
+ /* Indicates that gc is no longer in progress: */
+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
+
+ bch2_gc_free(c);
+ percpu_up_write(&c->mark_lock);
+
+ up_write(&c->gc_lock);
+
+ /*
+ * At startup, allocations can happen directly instead of via the
+ * allocator thread - issue wakeup in case they blocked on gc_lock:
+ */
+ closure_wake_up(&c->freelist_wait);
+
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int gc_btree_gens_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+ struct bkey_i *u;
+ int ret;
+
+ percpu_down_read(&c->mark_lock);
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ if (ptr_stale(ca, ptr) > 16) {
+ percpu_up_read(&c->mark_lock);
+ goto update;
+ }
+ }
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
+
+ if (gen_after(*gen, ptr->gen))
+ *gen = ptr->gen;
+ }
+ percpu_up_read(&c->mark_lock);
+ return 0;
+update:
+ u = bch2_bkey_make_mut(trans, iter, &k, 0);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
+
+ bch2_extent_normalize(c, bkey_i_to_s(u));
+ return 0;
+}
+
+static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode);
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
+ struct bkey_i_alloc_v4 *a_mut;
+ int ret;
+
+ if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
+ return 0;
+
+ a_mut = bch2_alloc_to_v4_mut(trans, k);
+ ret = PTR_ERR_OR_ZERO(a_mut);
+ if (ret)
+ return ret;
+
+ a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
+ a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type);
+
+ return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
+}
+
+int bch2_gc_gens(struct bch_fs *c)
+{
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_dev *ca;
+ u64 b, start_time = local_clock();
+ unsigned i;
+ int ret;
+
+ /*
+ * Ideally we would be using state_lock and not gc_lock here, but that
+ * introduces a deadlock in the RO path - we currently take the state
+ * lock at the start of going RO, thus the gc thread may get stuck:
+ */
+ if (!mutex_trylock(&c->gc_gens_lock))
+ return 0;
+
+ trace_and_count(c, gc_gens_start, c);
+ down_read(&c->gc_lock);
+ trans = bch2_trans_get(c);
+
+ for_each_member_device(ca, c, i) {
+ struct bucket_gens *gens = bucket_gens(ca);
+
+ BUG_ON(ca->oldest_gen);
+
+ ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
+ if (!ca->oldest_gen) {
+ percpu_ref_put(&ca->ref);
+ ret = -BCH_ERR_ENOMEM_gc_gens;
+ goto err;
+ }
+
+ for (b = gens->first_bucket;
+ b < gens->nbuckets; b++)
+ ca->oldest_gen[b] = gens->b[b];
+ }
+
+ for (i = 0; i < BTREE_ID_NR; i++)
+ if (btree_type_has_ptrs(i)) {
+ c->gc_gens_btree = i;
+ c->gc_gens_pos = POS_MIN;
+
+ ret = for_each_btree_key_commit(trans, iter, i,
+ POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ k,
+ NULL, NULL,
+ BTREE_INSERT_NOFAIL,
+ gc_btree_gens_key(trans, &iter, k));
+ if (ret && !bch2_err_matches(ret, EROFS))
+ bch_err_fn(c, ret);
+ if (ret)
+ goto err;
+ }
+
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
+ POS_MIN,
+ BTREE_ITER_PREFETCH,
+ k,
+ NULL, NULL,
+ BTREE_INSERT_NOFAIL,
+ bch2_alloc_write_oldest_gen(trans, &iter, k));
+ if (ret && !bch2_err_matches(ret, EROFS))
+ bch_err_fn(c, ret);
+ if (ret)
+ goto err;
+
+ c->gc_gens_btree = 0;
+ c->gc_gens_pos = POS_MIN;
+
+ c->gc_count++;
+
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
+ trace_and_count(c, gc_gens_end, c);
+err:
+ for_each_member_device(ca, c, i) {
+ kvfree(ca->oldest_gen);
+ ca->oldest_gen = NULL;
+ }
+
+ bch2_trans_put(trans);
+ up_read(&c->gc_lock);
+ mutex_unlock(&c->gc_gens_lock);
+ return ret;
+}
+
+static int bch2_gc_thread(void *arg)
+{
+ struct bch_fs *c = arg;
+ struct io_clock *clock = &c->io_clock[WRITE];
+ unsigned long last = atomic64_read(&clock->now);
+ unsigned last_kick = atomic_read(&c->kick_gc);
+ int ret;
+
+ set_freezable();
+
+ while (1) {
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ if (atomic_read(&c->kick_gc) != last_kick)
+ break;
+
+ if (c->btree_gc_periodic) {
+ unsigned long next = last + c->capacity / 16;
+
+ if (atomic64_read(&clock->now) >= next)
+ break;
+
+ bch2_io_clock_schedule_timeout(clock, next);
+ } else {
+ schedule();
+ }
+
+ try_to_freeze();
+ }
+ __set_current_state(TASK_RUNNING);
+
+ last = atomic64_read(&clock->now);
+ last_kick = atomic_read(&c->kick_gc);
+
+ /*
+ * Full gc is currently incompatible with btree key cache:
+ */
+#if 0
+ ret = bch2_gc(c, false, false);
+#else
+ ret = bch2_gc_gens(c);
+#endif
+ if (ret < 0)
+ bch_err_fn(c, ret);
+
+ debug_check_no_locks_held();
+ }
+
+ return 0;
+}
+
+void bch2_gc_thread_stop(struct bch_fs *c)
+{
+ struct task_struct *p;
+
+ p = c->gc_thread;
+ c->gc_thread = NULL;
+
+ if (p) {
+ kthread_stop(p);
+ put_task_struct(p);
+ }
+}
+
+int bch2_gc_thread_start(struct bch_fs *c)
+{
+ struct task_struct *p;
+
+ if (c->gc_thread)
+ return 0;
+
+ p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
+ if (IS_ERR(p)) {
+ bch_err_fn(c, PTR_ERR(p));
+ return PTR_ERR(p);
+ }
+
+ get_task_struct(p);
+ c->gc_thread = p;
+ wake_up_process(p);
+ return 0;
+}
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
new file mode 100644
index 000000000000..607575f83a00
--- /dev/null
+++ b/fs/bcachefs/btree_gc.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_GC_H
+#define _BCACHEFS_BTREE_GC_H
+
+#include "bkey.h"
+#include "btree_types.h"
+
+int bch2_check_topology(struct bch_fs *);
+int bch2_gc(struct bch_fs *, bool, bool);
+int bch2_gc_gens(struct bch_fs *);
+void bch2_gc_thread_stop(struct bch_fs *);
+int bch2_gc_thread_start(struct bch_fs *);
+
+/*
+ * For concurrent mark and sweep (with other index updates), we define a total
+ * ordering of _all_ references GC walks:
+ *
+ * Note that some references will have the same GC position as others - e.g.
+ * everything within the same btree node; in those cases we're relying on
+ * whatever locking exists for where those references live, i.e. the write lock
+ * on a btree node.
+ *
+ * That locking is also required to ensure GC doesn't pass the updater in
+ * between the updater adding/removing the reference and updating the GC marks;
+ * without that, we would at best double count sometimes.
+ *
+ * That part is important - whenever calling bch2_mark_pointers(), a lock _must_
+ * be held that prevents GC from passing the position the updater is at.
+ *
+ * (What about the start of gc, when we're clearing all the marks? GC clears the
+ * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc
+ * position inside its cmpxchg loop, so crap magically works).
+ */
+
+/* Position of (the start of) a gc phase: */
+static inline struct gc_pos gc_phase(enum gc_phase phase)
+{
+ return (struct gc_pos) {
+ .phase = phase,
+ .pos = POS_MIN,
+ .level = 0,
+ };
+}
+
+static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
+{
+ return cmp_int(l.phase, r.phase) ?:
+ bpos_cmp(l.pos, r.pos) ?:
+ cmp_int(l.level, r.level);
+}
+
+static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
+{
+ switch (id) {
+#define x(name, v, ...) case BTREE_ID_##name: return GC_PHASE_BTREE_##name;
+ BCH_BTREE_IDS()
+#undef x
+ default:
+ BUG();
+ }
+}
+
+static inline struct gc_pos gc_pos_btree(enum btree_id id,
+ struct bpos pos, unsigned level)
+{
+ return (struct gc_pos) {
+ .phase = btree_id_to_gc_phase(id),
+ .pos = pos,
+ .level = level,
+ };
+}
+
+/*
+ * GC position of the pointers within a btree node: note, _not_ for &b->key
+ * itself, that lives in the parent node:
+ */
+static inline struct gc_pos gc_pos_btree_node(struct btree *b)
+{
+ return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level);
+}
+
+/*
+ * GC position of the pointer to a btree root: we don't use
+ * gc_pos_pointer_to_btree_node() here to avoid a potential race with
+ * btree_split() increasing the tree depth - the new root will have level > the
+ * old root and thus have a greater gc position than the old root, but that
+ * would be incorrect since once gc has marked the root it's not coming back.
+ */
+static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
+{
+ return gc_pos_btree(id, SPOS_MAX, BTREE_MAX_DEPTH);
+}
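+
+/*
+ * Illustrative example (btree id chosen arbitrarily): within a single btree,
+ * the root position sorts after every node position, since it uses
+ * (SPOS_MAX, BTREE_MAX_DEPTH):
+ *
+ *	struct gc_pos node = gc_pos_btree(BTREE_ID_extents, POS(1, 0), 0);
+ *	struct gc_pos root = gc_pos_btree_root(BTREE_ID_extents);
+ *
+ *	gc_pos_cmp(node, root) < 0
+ */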
+
+static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
+{
+ unsigned seq;
+ bool ret;
+
+ do {
+ seq = read_seqcount_begin(&c->gc_pos_lock);
+ ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
+ } while (read_seqcount_retry(&c->gc_pos_lock, seq));
+
+ return ret;
+}
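+
+/*
+ * Hypothetical caller sketch (the real callers live in the marking paths and
+ * are not shown here): an updater applying an index update at @pos only needs
+ * to update GC's in-memory marks itself if GC has already walked past @pos;
+ * otherwise GC will pick the update up when it gets there:
+ *
+ *	if (gc_visited(c, pos))
+ *		update_gc_marks_too();	// hypothetical helper
+ */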
+
+static inline void bch2_do_gc_gens(struct bch_fs *c)
+{
+ atomic_inc(&c->kick_gc);
+ if (c->gc_thread)
+ wake_up_process(c->gc_thread);
+}
+
+#endif /* _BCACHEFS_BTREE_GC_H */
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
new file mode 100644
index 000000000000..5a720f0cd5a6
--- /dev/null
+++ b/fs/bcachefs/btree_io.c
@@ -0,0 +1,2297 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "bkey_sort.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "io_write.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "recovery.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/sched/mm.h>
+
+void bch2_btree_node_io_unlock(struct btree *b)
+{
+ EBUG_ON(!btree_node_write_in_flight(b));
+
+ clear_btree_node_write_in_flight_inner(b);
+ clear_btree_node_write_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+}
+
+void bch2_btree_node_io_lock(struct btree *b)
+{
+ bch2_assert_btree_nodes_not_locked();
+
+ wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
+void __bch2_btree_node_wait_on_read(struct btree *b)
+{
+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
+void __bch2_btree_node_wait_on_write(struct btree *b)
+{
+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
+void bch2_btree_node_wait_on_read(struct btree *b)
+{
+ bch2_assert_btree_nodes_not_locked();
+
+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
+void bch2_btree_node_wait_on_write(struct btree *b)
+{
+ bch2_assert_btree_nodes_not_locked();
+
+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
+static void verify_no_dups(struct btree *b,
+ struct bkey_packed *start,
+ struct bkey_packed *end)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct bkey_packed *k, *p;
+
+ if (start == end)
+ return;
+
+ for (p = start, k = bkey_p_next(start);
+ k != end;
+ p = k, k = bkey_p_next(k)) {
+ struct bkey l = bkey_unpack_key(b, p);
+ struct bkey r = bkey_unpack_key(b, k);
+
+ BUG_ON(bpos_ge(l.p, bkey_start_pos(&r)));
+ }
+#endif
+}
+
+static void set_needs_whiteout(struct bset *i, int v)
+{
+ struct bkey_packed *k;
+
+ for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
+ k->needs_whiteout = v;
+}
+
+static void btree_bounce_free(struct bch_fs *c, size_t size,
+ bool used_mempool, void *p)
+{
+ if (used_mempool)
+ mempool_free(p, &c->btree_bounce_pool);
+ else
+ vpfree(p, size);
+}
+
+static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
+ bool *used_mempool)
+{
+ unsigned flags = memalloc_nofs_save();
+ void *p;
+
+ BUG_ON(size > btree_bytes(c));
+
+ *used_mempool = false;
+ p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
+ if (!p) {
+ *used_mempool = true;
+ p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
+ }
+ memalloc_nofs_restore(flags);
+ return p;
+}
+
+static void sort_bkey_ptrs(const struct btree *bt,
+ struct bkey_packed **ptrs, unsigned nr)
+{
+ unsigned n = nr, a = nr / 2, b, c, d;
+
+ if (!a)
+ return;
+
+ /* Heap sort: see lib/sort.c: */
+ while (1) {
+ if (a)
+ a--;
+ else if (--n)
+ swap(ptrs[0], ptrs[n]);
+ else
+ break;
+
+ for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
+ b = bch2_bkey_cmp_packed(bt,
+ ptrs[c],
+ ptrs[d]) >= 0 ? c : d;
+ if (d == n)
+ b = c;
+
+ while (b != a &&
+ bch2_bkey_cmp_packed(bt,
+ ptrs[a],
+ ptrs[b]) >= 0)
+ b = (b - 1) / 2;
+ c = b;
+ while (b != a) {
+ b = (b - 1) / 2;
+ swap(ptrs[b], ptrs[c]);
+ }
+ }
+}
+
+static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
+{
+ struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k;
+ bool used_mempool = false;
+ size_t bytes = b->whiteout_u64s * sizeof(u64);
+
+ if (!b->whiteout_u64s)
+ return;
+
+ new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);
+
+ ptrs = ptrs_end = ((void *) new_whiteouts + bytes);
+
+ for (k = unwritten_whiteouts_start(c, b);
+ k != unwritten_whiteouts_end(c, b);
+ k = bkey_p_next(k))
+ *--ptrs = k;
+
+ sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs);
+
+ k = new_whiteouts;
+
+ while (ptrs != ptrs_end) {
+ bkey_p_copy(k, *ptrs);
+ k = bkey_p_next(k);
+ ptrs++;
+ }
+
+ verify_no_dups(b, new_whiteouts,
+ (void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
+
+ memcpy_u64s(unwritten_whiteouts_start(c, b),
+ new_whiteouts, b->whiteout_u64s);
+
+ btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
+}
+
+static bool should_compact_bset(struct btree *b, struct bset_tree *t,
+ bool compacting, enum compact_mode mode)
+{
+ if (!bset_dead_u64s(b, t))
+ return false;
+
+ switch (mode) {
+ case COMPACT_LAZY:
+ return should_compact_bset_lazy(b, t) ||
+ (compacting && !bset_written(b, bset(b, t)));
+ case COMPACT_ALL:
+ return true;
+ default:
+ BUG();
+ }
+}
+
+static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
+{
+ struct bset_tree *t;
+ bool ret = false;
+
+ for_each_bset(b, t) {
+ struct bset *i = bset(b, t);
+ struct bkey_packed *k, *n, *out, *start, *end;
+ struct btree_node_entry *src = NULL, *dst = NULL;
+
+ if (t != b->set && !bset_written(b, i)) {
+ src = container_of(i, struct btree_node_entry, keys);
+ dst = max(write_block(b),
+ (void *) btree_bkey_last(b, t - 1));
+ }
+
+ if (src != dst)
+ ret = true;
+
+ if (!should_compact_bset(b, t, ret, mode)) {
+ if (src != dst) {
+ memmove(dst, src, sizeof(*src) +
+ le16_to_cpu(src->keys.u64s) *
+ sizeof(u64));
+ i = &dst->keys;
+ set_btree_bset(b, t, i);
+ }
+ continue;
+ }
+
+ start = btree_bkey_first(b, t);
+ end = btree_bkey_last(b, t);
+
+ if (src != dst) {
+ memmove(dst, src, sizeof(*src));
+ i = &dst->keys;
+ set_btree_bset(b, t, i);
+ }
+
+ out = i->start;
+
+ for (k = start; k != end; k = n) {
+ n = bkey_p_next(k);
+
+ if (!bkey_deleted(k)) {
+ bkey_p_copy(out, k);
+ out = bkey_p_next(out);
+ } else {
+ BUG_ON(k->needs_whiteout);
+ }
+ }
+
+ i->u64s = cpu_to_le16((u64 *) out - i->_data);
+ set_btree_bset_end(b, t);
+ bch2_bset_set_no_aux_tree(b, t);
+ ret = true;
+ }
+
+ bch2_verify_btree_nr_keys(b);
+
+ bch2_btree_build_aux_trees(b);
+
+ return ret;
+}
+
+bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
+ enum compact_mode mode)
+{
+ return bch2_drop_whiteouts(b, mode);
+}
+
+static void btree_node_sort(struct bch_fs *c, struct btree *b,
+ unsigned start_idx,
+ unsigned end_idx,
+ bool filter_whiteouts)
+{
+ struct btree_node *out;
+ struct sort_iter_stack sort_iter;
+ struct bset_tree *t;
+ struct bset *start_bset = bset(b, &b->set[start_idx]);
+ bool used_mempool = false;
+ u64 start_time, seq = 0;
+ unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1;
+ bool sorting_entire_node = start_idx == 0 &&
+ end_idx == b->nsets;
+
+ sort_iter_stack_init(&sort_iter, b);
+
+ for (t = b->set + start_idx;
+ t < b->set + end_idx;
+ t++) {
+ u64s += le16_to_cpu(bset(b, t)->u64s);
+ sort_iter_add(&sort_iter.iter,
+ btree_bkey_first(b, t),
+ btree_bkey_last(b, t));
+ }
+
+ bytes = sorting_entire_node
+ ? btree_bytes(c)
+ : __vstruct_bytes(struct btree_node, u64s);
+
+ out = btree_bounce_alloc(c, bytes, &used_mempool);
+
+ start_time = local_clock();
+
+ u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts);
+
+ out->keys.u64s = cpu_to_le16(u64s);
+
+ BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);
+
+ if (sorting_entire_node)
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
+ start_time);
+
+ /* Make sure we preserve bset journal_seq: */
+ for (t = b->set + start_idx; t < b->set + end_idx; t++)
+ seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
+ start_bset->journal_seq = cpu_to_le64(seq);
+
+ if (sorting_entire_node) {
+ u64s = le16_to_cpu(out->keys.u64s);
+
+ BUG_ON(bytes != btree_bytes(c));
+
+ /*
+ * Our temporary buffer is the same size as the btree node's
+ * buffer, we can just swap buffers instead of doing a big
+ * memcpy()
+ */
+ *out = *b->data;
+ out->keys.u64s = cpu_to_le16(u64s);
+ swap(out, b->data);
+ set_btree_bset(b, b->set, &b->data->keys);
+ } else {
+ start_bset->u64s = out->keys.u64s;
+ memcpy_u64s(start_bset->start,
+ out->keys.start,
+ le16_to_cpu(out->keys.u64s));
+ }
+
+ for (i = start_idx + 1; i < end_idx; i++)
+ b->nr.bset_u64s[start_idx] +=
+ b->nr.bset_u64s[i];
+
+ b->nsets -= shift;
+
+ for (i = start_idx + 1; i < b->nsets; i++) {
+ b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift];
+ b->set[i] = b->set[i + shift];
+ }
+
+ for (i = b->nsets; i < MAX_BSETS; i++)
+ b->nr.bset_u64s[i] = 0;
+
+ set_btree_bset_end(b, &b->set[start_idx]);
+ bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);
+
+ btree_bounce_free(c, bytes, used_mempool, out);
+
+ bch2_verify_btree_nr_keys(b);
+}
+
+void bch2_btree_sort_into(struct bch_fs *c,
+ struct btree *dst,
+ struct btree *src)
+{
+ struct btree_nr_keys nr;
+ struct btree_node_iter src_iter;
+ u64 start_time = local_clock();
+
+ BUG_ON(dst->nsets != 1);
+
+ bch2_bset_set_no_aux_tree(dst, dst->set);
+
+ bch2_btree_node_iter_init_from_start(&src_iter, src);
+
+ nr = bch2_sort_repack(btree_bset_first(dst),
+ src, &src_iter,
+ &dst->format,
+ true);
+
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
+ start_time);
+
+ set_btree_bset_end(dst, dst->set);
+
+ dst->nr.live_u64s += nr.live_u64s;
+ dst->nr.bset_u64s[0] += nr.bset_u64s[0];
+ dst->nr.packed_keys += nr.packed_keys;
+ dst->nr.unpacked_keys += nr.unpacked_keys;
+
+ bch2_verify_btree_nr_keys(dst);
+}
+
+/*
+ * We're about to add another bset to the btree node, so if there are currently
+ * too many bsets, sort some of them together:
+ */
+static bool btree_node_compact(struct bch_fs *c, struct btree *b)
+{
+ unsigned unwritten_idx;
+ bool ret = false;
+
+ for (unwritten_idx = 0;
+ unwritten_idx < b->nsets;
+ unwritten_idx++)
+ if (!bset_written(b, bset(b, &b->set[unwritten_idx])))
+ break;
+
+ if (b->nsets - unwritten_idx > 1) {
+ btree_node_sort(c, b, unwritten_idx,
+ b->nsets, false);
+ ret = true;
+ }
+
+ if (unwritten_idx > 1) {
+ btree_node_sort(c, b, 0, unwritten_idx, false);
+ ret = true;
+ }
+
+ return ret;
+}
+
+void bch2_btree_build_aux_trees(struct btree *b)
+{
+ struct bset_tree *t;
+
+ for_each_bset(b, t)
+ bch2_bset_build_aux_tree(b, t,
+ !bset_written(b, bset(b, t)) &&
+ t == bset_tree_last(b));
+}
+
+/*
+ * If we have MAX_BSETS (3) bsets, should we sort them all down to just one?
+ *
+ * The first bset is going to be of similar order to the size of the node, the
+ * last bset is bounded by btree_write_set_buffer(), which is set to keep the
+ * memmove on insert from being too expensive: the middle bset should, ideally,
+ * be the geometric mean of the first and the last.
+ *
+ * Returns true if the middle bset is greater than that geometric mean:
+ */
+static inline bool should_compact_all(struct bch_fs *c, struct btree *b)
+{
+ unsigned mid_u64s_bits =
+ (ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2;
+
+ return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits;
+}
+
+/*
+ * bch2_btree_init_next - initialize a new (unwritten) bset that can then be
+ * inserted into
+ *
+ * Safe to call if there already is an unwritten bset - will only add a new bset
+ * if @b doesn't already have one.
+ *
+ * If we had to sort (and thus invalidated iterators), the transaction's paths
+ * are reinitialized via bch2_trans_node_reinit_iter().
+ */
+void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_node_entry *bne;
+ bool reinit_iter = false;
+
+ EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]);
+ BUG_ON(bset_written(b, bset(b, &b->set[1])));
+ BUG_ON(btree_node_just_written(b));
+
+ if (b->nsets == MAX_BSETS &&
+ !btree_node_write_in_flight(b) &&
+ should_compact_all(c, b)) {
+ bch2_btree_node_write(c, b, SIX_LOCK_write,
+ BTREE_WRITE_init_next_bset);
+ reinit_iter = true;
+ }
+
+ if (b->nsets == MAX_BSETS &&
+ btree_node_compact(c, b))
+ reinit_iter = true;
+
+ BUG_ON(b->nsets >= MAX_BSETS);
+
+ bne = want_new_bset(c, b);
+ if (bne)
+ bch2_bset_init_next(c, b, bne);
+
+ bch2_btree_build_aux_trees(b);
+
+ if (reinit_iter)
+ bch2_trans_node_reinit_iter(trans, b);
+}
+
+static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
+ struct bch_dev *ca,
+ struct btree *b, struct bset *i,
+ unsigned offset, int write)
+{
+ prt_printf(out, bch2_log_msg(c, "%s"),
+ write == READ
+ ? "error validating btree node "
+ : "corrupt btree node before write ");
+ if (ca)
+ prt_printf(out, "on %s ", ca->name);
+ prt_printf(out, "at btree ");
+ bch2_btree_pos_to_text(out, c, b);
+
+ prt_printf(out, "\n node offset %u", b->written);
+ if (i)
+ prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
+ prt_str(out, ": ");
+}
+
+__printf(9, 10)
+static int __btree_err(int ret,
+ struct bch_fs *c,
+ struct bch_dev *ca,
+ struct btree *b,
+ struct bset *i,
+ int write,
+ bool have_retry,
+ enum bch_sb_error_id err_type,
+ const char *fmt, ...)
+{
+ struct printbuf out = PRINTBUF;
+ va_list args;
+
+ btree_err_msg(&out, c, ca, b, i, b->written, write);
+
+ va_start(args, fmt);
+ prt_vprintf(&out, fmt, args);
+ va_end(args);
+
+ if (write == WRITE) {
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
+ ret = c->opts.errors == BCH_ON_ERROR_continue
+ ? 0
+ : -BCH_ERR_fsck_errors_not_fixed;
+ goto out;
+ }
+
+ if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
+ ret = -BCH_ERR_btree_node_read_err_fixable;
+ if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
+ ret = -BCH_ERR_btree_node_read_err_bad_node;
+
+ if (ret != -BCH_ERR_btree_node_read_err_fixable)
+ bch2_sb_error_count(c, err_type);
+
+ switch (ret) {
+ case -BCH_ERR_btree_node_read_err_fixable:
+ ret = bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf);
+ if (ret != -BCH_ERR_fsck_fix &&
+ ret != -BCH_ERR_fsck_ignore)
+ goto fsck_err;
+ ret = -BCH_ERR_fsck_fix;
+ break;
+ case -BCH_ERR_btree_node_read_err_want_retry:
+ case -BCH_ERR_btree_node_read_err_must_retry:
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
+ break;
+ case -BCH_ERR_btree_node_read_err_bad_node:
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
+ bch2_topology_error(c);
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO;
+ break;
+ case -BCH_ERR_btree_node_read_err_incompatible:
+ bch2_print_string_as_lines(KERN_ERR, out.buf);
+ ret = -BCH_ERR_fsck_errors_not_fixed;
+ break;
+ default:
+ BUG();
+ }
+out:
+fsck_err:
+ printbuf_exit(&out);
+ return ret;
+}
+
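+/*
+ * btree_err() wraps __btree_err(): if the error is fixable (and was fixed or
+ * ignored), it sets *saw_error and evaluates to true; otherwise it stashes the
+ * error in @ret and jumps to the caller's fsck_err label. btree_err_on() only
+ * fires when @cond is true.
+ */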
+#define btree_err(type, c, ca, b, i, _err_type, msg, ...) \
+({ \
+ int _ret = __btree_err(type, c, ca, b, i, write, have_retry, \
+ BCH_FSCK_ERR_##_err_type, \
+ msg, ##__VA_ARGS__); \
+ \
+ if (_ret != -BCH_ERR_fsck_fix) { \
+ ret = _ret; \
+ goto fsck_err; \
+ } \
+ \
+ *saw_error = true; \
+})
+
+#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false)
+
+/*
+ * When btree topology repair changes the start or end of a node, that might
+ * mean we have to drop keys that are no longer inside the node:
+ */
+__cold
+void bch2_btree_node_drop_keys_outside_node(struct btree *b)
+{
+ struct bset_tree *t;
+
+ for_each_bset(b, t) {
+ struct bset *i = bset(b, t);
+ struct bkey_packed *k;
+
+ for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
+ if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
+ break;
+
+ if (k != i->start) {
+ unsigned shift = (u64 *) k - (u64 *) i->start;
+
+ memmove_u64s_down(i->start, k,
+ (u64 *) vstruct_end(i) - (u64 *) k);
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
+ set_btree_bset_end(b, t);
+ }
+
+ for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
+ if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0)
+ break;
+
+ if (k != vstruct_last(i)) {
+ i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
+ set_btree_bset_end(b, t);
+ }
+ }
+
+ /*
+ * Always rebuild search trees: eytzinger search tree nodes directly
+ * depend on the values of min/max key:
+ */
+ bch2_bset_set_no_aux_tree(b, b->set);
+ bch2_btree_build_aux_trees(b);
+
+ struct bkey_s_c k;
+ struct bkey unpacked;
+ struct btree_node_iter iter;
+ for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
+ BUG_ON(bpos_lt(k.k->p, b->data->min_key));
+ BUG_ON(bpos_gt(k.k->p, b->data->max_key));
+ }
+}
+
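+/*
+ * Validate a bset's header fields, and for the first bset in a node also the
+ * btree node header (sequence number, btree id, level, min/max keys, format).
+ */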
+static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
+ struct btree *b, struct bset *i,
+ unsigned offset, unsigned sectors,
+ int write, bool have_retry, bool *saw_error)
+{
+ unsigned version = le16_to_cpu(i->version);
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ int ret = 0;
+
+ btree_err_on(!bch2_version_compatible(version),
+ -BCH_ERR_btree_node_read_err_incompatible,
+ c, ca, b, i,
+ btree_node_unsupported_version,
+ "unsupported bset version %u.%u",
+ BCH_VERSION_MAJOR(version),
+ BCH_VERSION_MINOR(version));
+
+ if (btree_err_on(version < c->sb.version_min,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bset_older_than_sb_min,
+ "bset version %u older than superblock version_min %u",
+ version, c->sb.version_min)) {
+ mutex_lock(&c->sb_lock);
+ c->disk_sb.sb->version_min = cpu_to_le16(version);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+
+ if (btree_err_on(BCH_VERSION_MAJOR(version) >
+ BCH_VERSION_MAJOR(c->sb.version),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bset_newer_than_sb,
+ "bset version %u newer than superblock version %u",
+ version, c->sb.version)) {
+ mutex_lock(&c->sb_lock);
+ c->disk_sb.sb->version = cpu_to_le16(version);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+
+ btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
+ -BCH_ERR_btree_node_read_err_incompatible,
+ c, ca, b, i,
+ btree_node_unsupported_version,
+ "BSET_SEPARATE_WHITEOUTS no longer supported");
+
+ if (btree_err_on(offset + sectors > btree_sectors(c),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i,
+ bset_past_end_of_btree_node,
+ "bset past end of btree node")) {
+ i->u64s = 0;
+ ret = 0;
+ goto out;
+ }
+
+ btree_err_on(offset && !i->u64s,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i,
+ bset_empty,
+ "empty bset");
+
+ btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i,
+ bset_wrong_sector_offset,
+ "bset at wrong sector offset");
+
+ if (!offset) {
+ struct btree_node *bn =
+ container_of(i, struct btree_node, keys);
+ /* These indicate that we read the wrong btree node: */
+
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bch_btree_ptr_v2 *bp =
+ &bkey_i_to_btree_ptr_v2(&b->key)->v;
+
+ /* XXX endianness */
+ btree_err_on(bp->seq != bn->keys.seq,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ bset_bad_seq,
+ "incorrect sequence number (wrong btree node)");
+ }
+
+ btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, i,
+ btree_node_bad_btree,
+ "incorrect btree id");
+
+ btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, i,
+ btree_node_bad_level,
+ "incorrect level");
+
+ if (!write)
+ compat_btree_node(b->c.level, b->c.btree_id, version,
+ BSET_BIG_ENDIAN(i), write, bn);
+
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bch_btree_ptr_v2 *bp =
+ &bkey_i_to_btree_ptr_v2(&b->key)->v;
+
+ if (BTREE_PTR_RANGE_UPDATED(bp)) {
+ b->data->min_key = bp->min_key;
+ b->data->max_key = b->key.k.p;
+ }
+
+ btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ btree_node_bad_min_key,
+ "incorrect min_key: got %s should be %s",
+ (printbuf_reset(&buf1),
+ bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
+ (printbuf_reset(&buf2),
+ bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf));
+ }
+
+ btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, i,
+ btree_node_bad_max_key,
+ "incorrect max key %s",
+ (printbuf_reset(&buf1),
+ bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
+
+ if (write)
+ compat_btree_node(b->c.level, b->c.btree_id, version,
+ BSET_BIG_ENDIAN(i), write, bn);
+
+ btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
+ -BCH_ERR_btree_node_read_err_bad_node,
+ c, ca, b, i,
+ btree_node_bad_format,
+ "invalid bkey format: %s\n %s", buf1.buf,
+ (printbuf_reset(&buf2),
+ bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
+ printbuf_reset(&buf1);
+
+ compat_bformat(b->c.level, b->c.btree_id, version,
+ BSET_BIG_ENDIAN(i), write,
+ &bn->format);
+ }
+out:
+fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ return ret;
+}
+
+static int bset_key_invalid(struct bch_fs *c, struct btree *b,
+ struct bkey_s_c k,
+ bool updated_range, int rw,
+ struct printbuf *err)
+{
+ return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?:
+ (!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?:
+ (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
+}
+
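+/*
+ * Validate the keys in bset @i, applying endianness/version compatibility
+ * fixups and dropping any keys that are malformed, invalid or out of order.
+ */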
+static int validate_bset_keys(struct bch_fs *c, struct btree *b,
+ struct bset *i, int write,
+ bool have_retry, bool *saw_error)
+{
+ unsigned version = le16_to_cpu(i->version);
+ struct bkey_packed *k, *prev = NULL;
+ struct printbuf buf = PRINTBUF;
+ bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
+ int ret = 0;
+
+ for (k = i->start;
+ k != vstruct_last(i);) {
+ struct bkey_s u;
+ struct bkey tmp;
+
+ if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bkey_past_bset_end,
+ "key extends past end of bset")) {
+ i->u64s = cpu_to_le16((u64 *) k - i->_data);
+ break;
+ }
+
+ if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bkey_bad_format,
+ "invalid bkey format %u", k->format)) {
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+ memmove_u64s_down(k, bkey_p_next(k),
+ (u64 *) vstruct_end(i) - (u64 *) k);
+ continue;
+ }
+
+ /* XXX: validate k->u64s */
+ if (!write)
+ bch2_bkey_compat(b->c.level, b->c.btree_id, version,
+ BSET_BIG_ENDIAN(i), write,
+ &b->format, k);
+
+ u = __bkey_disassemble(b, k, &tmp);
+
+ printbuf_reset(&buf);
+ if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) {
+ printbuf_reset(&buf);
+ bset_key_invalid(c, b, u.s_c, updated_range, write, &buf);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, u.s_c);
+
+ btree_err(-BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bad_bkey,
+ "invalid bkey: %s", buf.buf);
+
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+ memmove_u64s_down(k, bkey_p_next(k),
+ (u64 *) vstruct_end(i) - (u64 *) k);
+ continue;
+ }
+
+ if (write)
+ bch2_bkey_compat(b->c.level, b->c.btree_id, version,
+ BSET_BIG_ENDIAN(i), write,
+ &b->format, k);
+
+ if (prev && bkey_iter_cmp(b, prev, k) > 0) {
+ struct bkey up = bkey_unpack_key(b, prev);
+
+ printbuf_reset(&buf);
+ prt_printf(&buf, "keys out of order: ");
+ bch2_bkey_to_text(&buf, &up);
+ prt_printf(&buf, " > ");
+ bch2_bkey_to_text(&buf, u.k);
+
+ bch2_dump_bset(c, b, i, 0);
+
+ if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bkey_out_of_order,
+ "%s", buf.buf)) {
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+ memmove_u64s_down(k, bkey_p_next(k),
+ (u64 *) vstruct_end(i) - (u64 *) k);
+ continue;
+ }
+ }
+
+ prev = k;
+ k = bkey_p_next(k);
+ }
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
+ struct btree *b, bool have_retry, bool *saw_error)
+{
+ struct btree_node_entry *bne;
+ struct sort_iter *iter;
+ struct btree_node *sorted;
+ struct bkey_packed *k;
+ struct bch_extent_ptr *ptr;
+ struct bset *i;
+ bool used_mempool, blacklisted;
+ bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
+ unsigned u64s;
+ unsigned ptr_written = btree_ptr_sectors_written(&b->key);
+ struct printbuf buf = PRINTBUF;
+ int ret = 0, retry_read = 0, write = READ;
+
+ b->version_ondisk = U16_MAX;
+ /* We might get called multiple times on read retry: */
+ b->written = 0;
+
+ iter = mempool_alloc(&c->fill_iter, GFP_NOFS);
+ sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);
+
+ if (bch2_meta_read_fault("btree"))
+ btree_err(-BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ btree_node_fault_injected,
+ "dynamic fault");
+
+ btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ btree_node_bad_magic,
+ "bad magic: want %llx, got %llx",
+ bset_magic(c), le64_to_cpu(b->data->magic));
+
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bch_btree_ptr_v2 *bp =
+ &bkey_i_to_btree_ptr_v2(&b->key)->v;
+
+ btree_err_on(b->data->keys.seq != bp->seq,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ btree_node_bad_seq,
+ "got wrong btree node (seq %llx want %llx)",
+ b->data->keys.seq, bp->seq);
+ } else {
+ btree_err_on(!b->data->keys.seq,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ btree_node_bad_seq,
+ "bad btree header: seq 0");
+ }
+
+ while (b->written < (ptr_written ?: btree_sectors(c))) {
+ unsigned sectors;
+ struct nonce nonce;
+ bool first = !b->written;
+ bool csum_bad;
+
+ if (!b->written) {
+ i = &b->data->keys;
+
+ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i,
+ bset_unknown_csum,
+ "unknown checksum type %llu", BSET_CSUM_TYPE(i));
+
+ nonce = btree_nonce(i, b->written << 9);
+
+ csum_bad = bch2_crc_cmp(b->data->csum,
+ csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data));
+ if (csum_bad)
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+
+ btree_err_on(csum_bad,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i,
+ bset_bad_csum,
+ "invalid checksum");
+
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "error decrypting btree node: %i", ret))
+ goto fsck_err;
+
+ btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
+ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
+ -BCH_ERR_btree_node_read_err_incompatible,
+ c, NULL, b, NULL,
+ btree_node_unsupported_version,
+ "btree node does not have NEW_EXTENT_OVERWRITE set");
+
+ sectors = vstruct_sectors(b->data, c->block_bits);
+ } else {
+ bne = write_block(b);
+ i = &bne->keys;
+
+ if (i->seq != b->data->keys.seq)
+ break;
+
+ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i,
+ bset_unknown_csum,
+ "unknown checksum type %llu", BSET_CSUM_TYPE(i));
+
+ nonce = btree_nonce(i, b->written << 9);
+ csum_bad = bch2_crc_cmp(bne->csum,
+ csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne));
+ if (csum_bad)
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+
+ btree_err_on(csum_bad,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i,
+ bset_bad_csum,
+ "invalid checksum");
+
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "error decrypting btree node: %i\n", ret))
+ goto fsck_err;
+
+ sectors = vstruct_sectors(bne, c->block_bits);
+ }
+
+ b->version_ondisk = min(b->version_ondisk,
+ le16_to_cpu(i->version));
+
+ ret = validate_bset(c, ca, b, i, b->written, sectors,
+ READ, have_retry, saw_error);
+ if (ret)
+ goto fsck_err;
+
+ if (!b->written)
+ btree_node_set_format(b, b->data->format);
+
+ ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error);
+ if (ret)
+ goto fsck_err;
+
+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
+
+ blacklisted = bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(i->journal_seq),
+ true);
+
+ btree_err_on(blacklisted && first,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i,
+ bset_blacklisted_journal_seq,
+ "first btree node bset has blacklisted journal seq (%llu)",
+ le64_to_cpu(i->journal_seq));
+
+ btree_err_on(blacklisted && ptr_written,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i,
+ first_bset_blacklisted_journal_seq,
+ "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
+ le64_to_cpu(i->journal_seq),
+ b->written, b->written + sectors, ptr_written);
+
+ b->written += sectors;
+
+ if (blacklisted && !first)
+ continue;
+
+ sort_iter_add(iter,
+ vstruct_idx(i, 0),
+ vstruct_last(i));
+ }
+
+ if (ptr_written) {
+ btree_err_on(b->written < ptr_written,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, NULL,
+ btree_node_data_missing,
+ "btree node data missing: expected %u sectors, found %u",
+ ptr_written, b->written);
+ } else {
+ for (bne = write_block(b);
+ bset_byte_offset(b, bne) < btree_bytes(c);
+ bne = (void *) bne + block_bytes(c))
+ btree_err_on(bne->keys.seq == b->data->keys.seq &&
+ !bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(bne->keys.journal_seq),
+ true),
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, NULL,
+ btree_node_bset_after_end,
+ "found bset signature after last bset");
+ }
+
+ sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
+ sorted->keys.u64s = 0;
+
+ set_btree_bset(b, b->set, &b->data->keys);
+
+ b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
+
+ u64s = le16_to_cpu(sorted->keys.u64s);
+ *sorted = *b->data;
+ sorted->keys.u64s = cpu_to_le16(u64s);
+ swap(sorted, b->data);
+ set_btree_bset(b, b->set, &b->data->keys);
+ b->nsets = 1;
+
+ BUG_ON(b->nr.live_u64s != u64s);
+
+ btree_bounce_free(c, btree_bytes(c), used_mempool, sorted);
+
+ if (updated_range)
+ bch2_btree_node_drop_keys_outside_node(b);
+
+ i = &b->data->keys;
+ for (k = i->start; k != vstruct_last(i);) {
+ struct bkey tmp;
+ struct bkey_s u = __bkey_disassemble(b, k, &tmp);
+
+ printbuf_reset(&buf);
+
+ if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) ||
+ (bch2_inject_invalid_keys &&
+ !bversion_cmp(u.k->version, MAX_VERSION))) {
+ printbuf_reset(&buf);
+
+ prt_printf(&buf, "invalid bkey: ");
+ bch2_bkey_val_invalid(c, u.s_c, READ, &buf);
+ prt_printf(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, u.s_c);
+
+ btree_err(-BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bad_bkey,
+ "%s", buf.buf);
+
+ btree_keys_account_key_drop(&b->nr, 0, k);
+
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+ memmove_u64s_down(k, bkey_p_next(k),
+ (u64 *) vstruct_end(i) - (u64 *) k);
+ set_btree_bset_end(b, b->set);
+ continue;
+ }
+
+ if (u.k->type == KEY_TYPE_btree_ptr_v2) {
+ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);
+
+ bp.v->mem_ptr = 0;
+ }
+
+ k = bkey_p_next(k);
+ }
+
+ bch2_bset_build_aux_tree(b, b->set, false);
+
+ set_needs_whiteout(btree_bset_first(b), true);
+
+ btree_node_reset_sib_u64s(b);
+
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+ struct bch_dev *ca2 = bch_dev_bkey_exists(c, ptr->dev);
+
+ if (ca2->mi.state != BCH_MEMBER_STATE_rw)
+ set_btree_node_need_rewrite(b);
+ }
+
+ if (!ptr_written)
+ set_btree_node_need_rewrite(b);
+out:
+ mempool_free(iter, &c->fill_iter);
+ printbuf_exit(&buf);
+ return retry_read;
+fsck_err:
+ if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
+ ret == -BCH_ERR_btree_node_read_err_must_retry)
+ retry_read = 1;
+ else
+ set_btree_node_read_error(b);
+ goto out;
+}
+
+static void btree_node_read_work(struct work_struct *work)
+{
+ struct btree_read_bio *rb =
+ container_of(work, struct btree_read_bio, work);
+ struct bch_fs *c = rb->c;
+ struct btree *b = rb->b;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ struct bio *bio = &rb->bio;
+ struct bch_io_failures failed = { .nr = 0 };
+ struct printbuf buf = PRINTBUF;
+ bool saw_error = false;
+ bool retry = false;
+ bool can_retry;
+
+ goto start;
+ while (1) {
+ retry = true;
+ bch_info(c, "retrying read");
+ ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ rb->have_ioref = bch2_dev_get_ioref(ca, READ);
+ bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
+ bio->bi_iter.bi_sector = rb->pick.ptr.offset;
+ bio->bi_iter.bi_size = btree_bytes(c);
+
+ if (rb->have_ioref) {
+ bio_set_dev(bio, ca->disk_sb.bdev);
+ submit_bio_wait(bio);
+ } else {
+ bio->bi_status = BLK_STS_REMOVED;
+ }
+start:
+ printbuf_reset(&buf);
+ bch2_btree_pos_to_text(&buf, c, b);
+ bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
+ "btree read error %s for %s",
+ bch2_blk_status_to_str(bio->bi_status), buf.buf);
+ if (rb->have_ioref)
+ percpu_ref_put(&ca->io_ref);
+ rb->have_ioref = false;
+
+ bch2_mark_io_failure(&failed, &rb->pick);
+
+ can_retry = bch2_bkey_pick_read_device(c,
+ bkey_i_to_s_c(&b->key),
+ &failed, &rb->pick) > 0;
+
+ if (!bio->bi_status &&
+ !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
+ if (retry)
+ bch_info(c, "retry success");
+ break;
+ }
+
+ saw_error = true;
+
+ if (!can_retry) {
+ set_btree_node_read_error(b);
+ break;
+ }
+ }
+
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
+ rb->start_time);
+ bio_put(&rb->bio);
+
+ if (saw_error && !btree_node_read_error(b)) {
+ printbuf_reset(&buf);
+ bch2_bpos_to_text(&buf, b->key.k.p);
+ bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
+ __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);
+
+ bch2_btree_node_rewrite_async(c, b);
+ }
+
+ printbuf_exit(&buf);
+ clear_btree_node_read_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+}
+
+static void btree_node_read_endio(struct bio *bio)
+{
+ struct btree_read_bio *rb =
+ container_of(bio, struct btree_read_bio, bio);
+ struct bch_fs *c = rb->c;
+
+ if (rb->have_ioref) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+
+ bch2_latency_acct(ca, rb->start_time, READ);
+ }
+
+ queue_work(c->io_complete_wq, &rb->work);
+}
+
+struct btree_node_read_all {
+ struct closure cl;
+ struct bch_fs *c;
+ struct btree *b;
+ unsigned nr;
+ void *buf[BCH_REPLICAS_MAX];
+ struct bio *bio[BCH_REPLICAS_MAX];
+ blk_status_t err[BCH_REPLICAS_MAX];
+};
+
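+/*
+ * Walk the on-disk btree node data and return how many sectors of valid bsets
+ * it contains (0 if the header magic doesn't match).
+ */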
+static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
+{
+ struct btree_node *bn = data;
+ struct btree_node_entry *bne;
+ unsigned offset = 0;
+
+ if (le64_to_cpu(bn->magic) != bset_magic(c))
+ return 0;
+
+ while (offset < btree_sectors(c)) {
+ if (!offset) {
+ offset += vstruct_sectors(bn, c->block_bits);
+ } else {
+ bne = data + (offset << 9);
+ if (bne->keys.seq != bn->keys.seq)
+ break;
+ offset += vstruct_sectors(bne, c->block_bits);
+ }
+ }
+
+ return offset;
+}
+
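+/*
+ * Check whether a bset with a matching sequence number exists past @offset -
+ * i.e. whether there is data beyond where this replica says the node ends.
+ */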
+static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
+{
+ struct btree_node *bn = data;
+ struct btree_node_entry *bne;
+
+ if (!offset)
+ return false;
+
+ while (offset < btree_sectors(c)) {
+ bne = data + (offset << 9);
+ if (bne->keys.seq == bn->keys.seq)
+ return true;
+ offset++;
+ }
+
+ return false;
+}
+
+static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
+{
+ closure_type(ra, struct btree_node_read_all, cl);
+ struct bch_fs *c = ra->c;
+ struct btree *b = ra->b;
+ struct printbuf buf = PRINTBUF;
+ bool dump_bset_maps = false;
+ bool have_retry = false;
+ int ret = 0, best = -1, write = READ;
+ unsigned i, written = 0, written2 = 0;
+ __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
+ ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
+ bool _saw_error = false, *saw_error = &_saw_error;
+
+ for (i = 0; i < ra->nr; i++) {
+ struct btree_node *bn = ra->buf[i];
+
+ if (ra->err[i])
+ continue;
+
+ if (le64_to_cpu(bn->magic) != bset_magic(c) ||
+ (seq && seq != bn->keys.seq))
+ continue;
+
+ if (best < 0) {
+ best = i;
+ written = btree_node_sectors_written(c, bn);
+ continue;
+ }
+
+ written2 = btree_node_sectors_written(c, ra->buf[i]);
+ if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, NULL,
+ btree_node_replicas_sectors_written_mismatch,
+ "btree node sectors written mismatch: %u != %u",
+ written, written2) ||
+ btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, NULL,
+ btree_node_bset_after_end,
+ "found bset signature after last bset") ||
+ btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, NULL,
+ btree_node_replicas_data_mismatch,
+ "btree node replicas content mismatch"))
+ dump_bset_maps = true;
+
+ if (written2 > written) {
+ written = written2;
+ best = i;
+ }
+ }
+fsck_err:
+ if (dump_bset_maps) {
+ for (i = 0; i < ra->nr; i++) {
+ struct btree_node *bn = ra->buf[i];
+ struct btree_node_entry *bne = NULL;
+ unsigned offset = 0, sectors;
+ bool gap = false;
+
+ if (ra->err[i])
+ continue;
+
+ printbuf_reset(&buf);
+
+ while (offset < btree_sectors(c)) {
+ if (!offset) {
+ sectors = vstruct_sectors(bn, c->block_bits);
+ } else {
+ bne = ra->buf[i] + (offset << 9);
+ if (bne->keys.seq != bn->keys.seq)
+ break;
+ sectors = vstruct_sectors(bne, c->block_bits);
+ }
+
+ prt_printf(&buf, " %u-%u", offset, offset + sectors);
+ if (bne && bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(bne->keys.journal_seq), false))
+ prt_printf(&buf, "*");
+ offset += sectors;
+ }
+
+ while (offset < btree_sectors(c)) {
+ bne = ra->buf[i] + (offset << 9);
+ if (bne->keys.seq == bn->keys.seq) {
+ if (!gap)
+ prt_printf(&buf, " GAP");
+ gap = true;
+
+ sectors = vstruct_sectors(bne, c->block_bits);
+ prt_printf(&buf, " %u-%u", offset, offset + sectors);
+ if (bch2_journal_seq_is_blacklisted(c,
+ le64_to_cpu(bne->keys.journal_seq), false))
+ prt_printf(&buf, "*");
+ }
+ offset++;
+ }
+
+ bch_err(c, "replica %u:%s", i, buf.buf);
+ }
+ }
+
+ if (best >= 0) {
+ memcpy(b->data, ra->buf[best], btree_bytes(c));
+ ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
+ } else {
+ ret = -1;
+ }
+
+ if (ret)
+ set_btree_node_read_error(b);
+ else if (*saw_error)
+ bch2_btree_node_rewrite_async(c, b);
+
+ for (i = 0; i < ra->nr; i++) {
+ mempool_free(ra->buf[i], &c->btree_bounce_pool);
+ bio_put(ra->bio[i]);
+ }
+
+ closure_debug_destroy(&ra->cl);
+ kfree(ra);
+ printbuf_exit(&buf);
+
+ clear_btree_node_read_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+}
+
+static void btree_node_read_all_replicas_endio(struct bio *bio)
+{
+ struct btree_read_bio *rb =
+ container_of(bio, struct btree_read_bio, bio);
+ struct bch_fs *c = rb->c;
+ struct btree_node_read_all *ra = rb->ra;
+
+ if (rb->have_ioref) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+
+ bch2_latency_acct(ca, rb->start_time, READ);
+ }
+
+ ra->err[rb->idx] = bio->bi_status;
+ closure_put(&ra->cl);
+}
+
+/*
+ * XXX This allocates multiple times from the same mempools, and can deadlock
+ * under sufficient memory pressure (but is only a debug path)
+ */
+static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync)
+{
+ struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded pick;
+ struct btree_node_read_all *ra;
+ unsigned i;
+
+ ra = kzalloc(sizeof(*ra), GFP_NOFS);
+ if (!ra)
+ return -BCH_ERR_ENOMEM_btree_node_read_all_replicas;
+
+ closure_init(&ra->cl, NULL);
+ ra->c = c;
+ ra->b = b;
+ ra->nr = bch2_bkey_nr_ptrs(k);
+
+ for (i = 0; i < ra->nr; i++) {
+ ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
+ ra->bio[i] = bio_alloc_bioset(NULL,
+ buf_pages(ra->buf[i], btree_bytes(c)),
+ REQ_OP_READ|REQ_SYNC|REQ_META,
+ GFP_NOFS,
+ &c->btree_bio);
+ }
+
+ i = 0;
+ bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ struct btree_read_bio *rb =
+ container_of(ra->bio[i], struct btree_read_bio, bio);
+ rb->c = c;
+ rb->b = b;
+ rb->ra = ra;
+ rb->start_time = local_clock();
+ rb->have_ioref = bch2_dev_get_ioref(ca, READ);
+ rb->idx = i;
+ rb->pick = pick;
+ rb->bio.bi_iter.bi_sector = pick.ptr.offset;
+ rb->bio.bi_end_io = btree_node_read_all_replicas_endio;
+ bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c));
+
+ if (rb->have_ioref) {
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
+ bio_sectors(&rb->bio));
+ bio_set_dev(&rb->bio, ca->disk_sb.bdev);
+
+ closure_get(&ra->cl);
+ submit_bio(&rb->bio);
+ } else {
+ ra->err[i] = BLK_STS_REMOVED;
+ }
+
+ i++;
+ }
+
+ if (sync) {
+ closure_sync(&ra->cl);
+ btree_node_read_all_replicas_done(&ra->cl.work);
+ } else {
+ continue_at(&ra->cl, btree_node_read_all_replicas_done,
+ c->io_complete_wq);
+ }
+
+ return 0;
+}
+
+void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
+ bool sync)
+{
+ struct extent_ptr_decoded pick;
+ struct btree_read_bio *rb;
+ struct bch_dev *ca;
+ struct bio *bio;
+ int ret;
+
+ trace_and_count(c, btree_node_read, c, b);
+
+ if (bch2_verify_all_btree_replicas &&
+ !btree_node_read_all_replicas(c, b, sync))
+ return;
+
+ ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
+ NULL, &pick);
+
+ if (ret <= 0) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "btree node read error: no device to read from\n at ");
+ bch2_btree_pos_to_text(&buf, c, b);
+ bch_err(c, "%s", buf.buf);
+
+ if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
+ bch2_fatal_error(c);
+
+ set_btree_node_read_error(b);
+ clear_btree_node_read_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+ printbuf_exit(&buf);
+ return;
+ }
+
+ ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+ bio = bio_alloc_bioset(NULL,
+ buf_pages(b->data, btree_bytes(c)),
+ REQ_OP_READ|REQ_SYNC|REQ_META,
+ GFP_NOFS,
+ &c->btree_bio);
+ rb = container_of(bio, struct btree_read_bio, bio);
+ rb->c = c;
+ rb->b = b;
+ rb->ra = NULL;
+ rb->start_time = local_clock();
+ rb->have_ioref = bch2_dev_get_ioref(ca, READ);
+ rb->pick = pick;
+ INIT_WORK(&rb->work, btree_node_read_work);
+ bio->bi_iter.bi_sector = pick.ptr.offset;
+ bio->bi_end_io = btree_node_read_endio;
+ bch2_bio_map(bio, b->data, btree_bytes(c));
+
+ if (rb->have_ioref) {
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
+ bio_sectors(bio));
+ bio_set_dev(bio, ca->disk_sb.bdev);
+
+ if (sync) {
+ submit_bio_wait(bio);
+
+ btree_node_read_work(&rb->work);
+ } else {
+ submit_bio(bio);
+ }
+ } else {
+ bio->bi_status = BLK_STS_REMOVED;
+
+ if (sync)
+ btree_node_read_work(&rb->work);
+ else
+ queue_work(c->io_complete_wq, &rb->work);
+ }
+}
+
+static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
+ const struct bkey_i *k, unsigned level)
+{
+ struct bch_fs *c = trans->c;
+ struct closure cl;
+ struct btree *b;
+ int ret;
+
+ closure_init_stack(&cl);
+
+ do {
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ closure_sync(&cl);
+ } while (ret);
+
+ b = bch2_btree_node_mem_alloc(trans, level != 0);
+ bch2_btree_cache_cannibalize_unlock(c);
+
+ BUG_ON(IS_ERR(b));
+
+ bkey_copy(&b->key, k);
+ BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
+
+ set_btree_node_read_in_flight(b);
+
+ bch2_btree_node_read(c, b, true);
+
+ if (btree_node_read_error(b)) {
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ mutex_lock(&c->btree_cache.lock);
+ list_move(&b->list, &c->btree_cache.freeable);
+ mutex_unlock(&c->btree_cache.lock);
+
+ ret = -EIO;
+ goto err;
+ }
+
+ bch2_btree_set_root_for_read(c, b);
+err:
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+
+ return ret;
+}
+
+int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
+ const struct bkey_i *k, unsigned level)
+{
+ return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
+}
+
+static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
+ struct btree_write *w)
+{
+ unsigned long old, new, v = READ_ONCE(b->will_make_reachable);
+
+ do {
+ old = new = v;
+ if (!(old & 1))
+ break;
+
+ new &= ~1UL;
+ } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old);
+
+ if (old & 1)
+ closure_put(&((struct btree_update *) new)->cl);
+
+ bch2_journal_pin_drop(&c->journal, &w->journal);
+}
+
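+/*
+ * Finish a btree node write: if the node was redirtied and still needs a write
+ * (and writes aren't blocked), immediately start the next one; otherwise clear
+ * the write_in_flight flags and wake up waiters.
+ */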
+static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
+{
+ struct btree_write *w = btree_prev_write(b);
+ unsigned long old, new, v;
+ unsigned type = 0;
+
+ bch2_btree_complete_write(c, b, w);
+
+ v = READ_ONCE(b->flags);
+ do {
+ old = new = v;
+
+ if ((old & (1U << BTREE_NODE_dirty)) &&
+ (old & (1U << BTREE_NODE_need_write)) &&
+ !(old & (1U << BTREE_NODE_never_write)) &&
+ !(old & (1U << BTREE_NODE_write_blocked)) &&
+ !(old & (1U << BTREE_NODE_will_make_reachable))) {
+ new &= ~(1U << BTREE_NODE_dirty);
+ new &= ~(1U << BTREE_NODE_need_write);
+ new |= (1U << BTREE_NODE_write_in_flight);
+ new |= (1U << BTREE_NODE_write_in_flight_inner);
+ new |= (1U << BTREE_NODE_just_written);
+ new ^= (1U << BTREE_NODE_write_idx);
+
+ type = new & BTREE_WRITE_TYPE_MASK;
+ new &= ~BTREE_WRITE_TYPE_MASK;
+ } else {
+ new &= ~(1U << BTREE_NODE_write_in_flight);
+ new &= ~(1U << BTREE_NODE_write_in_flight_inner);
+ }
+ } while ((v = cmpxchg(&b->flags, old, new)) != old);
+
+ if (new & (1U << BTREE_NODE_write_in_flight))
+ __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
+ else
+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+}
+
+static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+ __btree_node_write_done(c, b);
+ six_unlock_read(&b->c.lock);
+
+ bch2_trans_put(trans);
+}
+
+static void btree_node_write_work(struct work_struct *work)
+{
+ struct btree_write_bio *wbio =
+ container_of(work, struct btree_write_bio, work);
+ struct bch_fs *c = wbio->wbio.c;
+ struct btree *b = wbio->wbio.bio.bi_private;
+ struct bch_extent_ptr *ptr;
+ int ret = 0;
+
+ btree_bounce_free(c,
+ wbio->data_bytes,
+ wbio->wbio.used_mempool,
+ wbio->data);
+
+ bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
+ bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
+
+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key)))
+ goto err;
+
+ if (wbio->wbio.first_btree_write) {
+ if (wbio->wbio.failed.nr) {
+
+ }
+ } else {
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
+ BCH_WATERMARK_reclaim|
+ BTREE_INSERT_JOURNAL_RECLAIM|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_NOCHECK_RW,
+ !wbio->wbio.failed.nr));
+ if (ret)
+ goto err;
+ }
+out:
+ bio_put(&wbio->wbio.bio);
+ btree_node_write_done(c, b);
+ return;
+err:
+ set_btree_node_noevict(b);
+ if (!bch2_err_matches(ret, EROFS))
+ bch2_fs_fatal_error(c, "fatal error writing btree node: %s", bch2_err_str(ret));
+ goto out;
+}
+
+static void btree_node_write_endio(struct bio *bio)
+{
+ struct bch_write_bio *wbio = to_wbio(bio);
+ struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
+ struct bch_write_bio *orig = parent ?: wbio;
+ struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio);
+ struct bch_fs *c = wbio->c;
+ struct btree *b = wbio->bio.bi_private;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
+ unsigned long flags;
+
+ if (wbio->have_ioref)
+ bch2_latency_acct(ca, wbio->submit_time, WRITE);
+
+ if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ "btree write error: %s",
+ bch2_blk_status_to_str(bio->bi_status)) ||
+ bch2_meta_write_fault("btree")) {
+ spin_lock_irqsave(&c->btree_write_error_lock, flags);
+ bch2_dev_list_add_dev(&orig->failed, wbio->dev);
+ spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+ }
+
+ if (wbio->have_ioref)
+ percpu_ref_put(&ca->io_ref);
+
+ if (parent) {
+ bio_put(bio);
+ bio_endio(&parent->bio);
+ return;
+ }
+
+ clear_btree_node_write_in_flight_inner(b);
+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
+ INIT_WORK(&wb->work, btree_node_write_work);
+ queue_work(c->btree_io_complete_wq, &wb->work);
+}
+
+static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
+ struct bset *i, unsigned sectors)
+{
+ struct printbuf buf = PRINTBUF;
+ bool saw_error;
+ int ret;
+
+ ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key),
+ BKEY_TYPE_btree, WRITE, &buf);
+
+ if (ret)
+ bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf);
+ printbuf_exit(&buf);
+ if (ret)
+ return ret;
+
+ ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
+ validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
+ if (ret) {
+ bch2_inconsistent_error(c);
+ dump_stack();
+ }
+
+ return ret;
+}
+
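+/*
+ * Final submission step: bsets after the first are written past the start of
+ * the node, so the pointers in the key are offset by the write's sector offset
+ * before the bio is submitted.
+ */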
+static void btree_write_submit(struct work_struct *work)
+{
+ struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
+ struct bch_extent_ptr *ptr;
+ BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+
+ bkey_copy(&tmp.k, &wbio->key);
+
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr)
+ ptr->offset += wbio->sector_offset;
+
+ bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree,
+ &tmp.k, false);
+}
+
+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
+{
+ struct btree_write_bio *wbio;
+ struct bset_tree *t;
+ struct bset *i;
+ struct btree_node *bn = NULL;
+ struct btree_node_entry *bne = NULL;
+ struct sort_iter_stack sort_iter;
+ struct nonce nonce;
+ unsigned bytes_to_write, sectors_to_write, bytes, u64s;
+ u64 seq = 0;
+ bool used_mempool;
+ unsigned long old, new;
+ bool validate_before_checksum = false;
+ enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
+ void *data;
+ int ret;
+
+ if (flags & BTREE_WRITE_ALREADY_STARTED)
+ goto do_write;
+
+ /*
+ * We may only have a read lock on the btree node - the dirty bit is our
+ * "lock" against racing with other threads that may be trying to start
+ * a write, we do a write iff we clear the dirty bit. Since setting the
+ * dirty bit requires a write lock, we can't race with other threads
+ * redirtying it:
+ */
+ do {
+ old = new = READ_ONCE(b->flags);
+
+ if (!(old & (1 << BTREE_NODE_dirty)))
+ return;
+
+ if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
+ !(old & (1 << BTREE_NODE_need_write)))
+ return;
+
+ if (old &
+ ((1 << BTREE_NODE_never_write)|
+ (1 << BTREE_NODE_write_blocked)))
+ return;
+
+ if (b->written &&
+ (old & (1 << BTREE_NODE_will_make_reachable)))
+ return;
+
+ if (old & (1 << BTREE_NODE_write_in_flight))
+ return;
+
+ if (flags & BTREE_WRITE_ONLY_IF_NEED)
+ type = new & BTREE_WRITE_TYPE_MASK;
+ new &= ~BTREE_WRITE_TYPE_MASK;
+
+ new &= ~(1 << BTREE_NODE_dirty);
+ new &= ~(1 << BTREE_NODE_need_write);
+ new |= (1 << BTREE_NODE_write_in_flight);
+ new |= (1 << BTREE_NODE_write_in_flight_inner);
+ new |= (1 << BTREE_NODE_just_written);
+ new ^= (1 << BTREE_NODE_write_idx);
+ } while (cmpxchg_acquire(&b->flags, old, new) != old);
+
+ if (new & (1U << BTREE_NODE_need_write))
+ return;
+do_write:
+ BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
+
+ atomic_dec(&c->btree_cache.dirty);
+
+ BUG_ON(btree_node_fake(b));
+ BUG_ON((b->will_make_reachable != 0) != !b->written);
+
+ BUG_ON(b->written >= btree_sectors(c));
+ BUG_ON(b->written & (block_sectors(c) - 1));
+ BUG_ON(bset_written(b, btree_bset_last(b)));
+ BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
+ BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
+
+ bch2_sort_whiteouts(c, b);
+
+ sort_iter_stack_init(&sort_iter, b);
+
+ bytes = !b->written
+ ? sizeof(struct btree_node)
+ : sizeof(struct btree_node_entry);
+
+ bytes += b->whiteout_u64s * sizeof(u64);
+
+ for_each_bset(b, t) {
+ i = bset(b, t);
+
+ if (bset_written(b, i))
+ continue;
+
+ bytes += le16_to_cpu(i->u64s) * sizeof(u64);
+ sort_iter_add(&sort_iter.iter,
+ btree_bkey_first(b, t),
+ btree_bkey_last(b, t));
+ seq = max(seq, le64_to_cpu(i->journal_seq));
+ }
+
+ BUG_ON(b->written && !seq);
+
+ /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
+ bytes += 8;
+
+ /* buffer must be a multiple of the block size */
+ bytes = round_up(bytes, block_bytes(c));
+
+ data = btree_bounce_alloc(c, bytes, &used_mempool);
+
+ if (!b->written) {
+ bn = data;
+ *bn = *b->data;
+ i = &bn->keys;
+ } else {
+ bne = data;
+ bne->keys = b->data->keys;
+ i = &bne->keys;
+ }
+
+ i->journal_seq = cpu_to_le64(seq);
+ i->u64s = 0;
+
+ sort_iter_add(&sort_iter.iter,
+ unwritten_whiteouts_start(c, b),
+ unwritten_whiteouts_end(c, b));
+ SET_BSET_SEPARATE_WHITEOUTS(i, false);
+
+ b->whiteout_u64s = 0;
+
+ u64s = bch2_sort_keys(i->start, &sort_iter.iter, false);
+ le16_add_cpu(&i->u64s, u64s);
+
+ BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
+
+ set_needs_whiteout(i, false);
+
+ /* do we have data to write? */
+ if (b->written && !i->u64s)
+ goto nowrite;
+
+ bytes_to_write = vstruct_end(i) - data;
+ sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
+
+ if (!b->written &&
+ b->key.k.type == KEY_TYPE_btree_ptr_v2)
+ BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write);
+
+ memset(data + bytes_to_write, 0,
+ (sectors_to_write << 9) - bytes_to_write);
+
+ BUG_ON(b->written + sectors_to_write > btree_sectors(c));
+ BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
+ BUG_ON(i->seq != b->data->keys.seq);
+
+ i->version = cpu_to_le16(c->sb.version);
+ SET_BSET_OFFSET(i, b->written);
+ SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
+
+ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
+ validate_before_checksum = true;
+
+ /* validate_bset will be modifying: */
+ if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
+ validate_before_checksum = true;
+
+ /* if we're going to be encrypting, check metadata validity first: */
+ if (validate_before_checksum &&
+ validate_bset_for_write(c, b, i, sectors_to_write))
+ goto err;
+
+ ret = bset_encrypt(c, i, b->written << 9);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "error encrypting btree node: %i\n", ret))
+ goto err;
+
+ nonce = btree_nonce(i, b->written << 9);
+
+ if (bn)
+ bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
+ else
+ bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+
+ /* if we're not encrypting, check metadata after checksumming: */
+ if (!validate_before_checksum &&
+ validate_bset_for_write(c, b, i, sectors_to_write))
+ goto err;
+
+ /*
+ * We handle btree write errors by immediately halting the journal -
+ * after we've done that, we can't issue any subsequent btree writes
+ * because they might have pointers to new nodes that failed to write.
+ *
+ * Furthermore, there's no point in doing any more btree writes because
+ * with the journal stopped, we're never going to update the journal to
+ * reflect that those writes were done and the data flushed from the
+ * journal:
+ *
+ * Also on journal error, the pending write may have updates that were
+ * never journalled (interior nodes, see btree_update_nodes_written()) -
+ * it's critical that we don't do the write in that case otherwise we
+ * will have updates visible that weren't in the journal:
+ *
+ * Make sure to update b->written so bch2_btree_init_next() doesn't
+ * break:
+ */
+ if (bch2_journal_error(&c->journal) ||
+ c->opts.nochanges)
+ goto err;
+
+ trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);
+
+ wbio = container_of(bio_alloc_bioset(NULL,
+ buf_pages(data, sectors_to_write << 9),
+ REQ_OP_WRITE|REQ_META,
+ GFP_NOFS,
+ &c->btree_bio),
+ struct btree_write_bio, wbio.bio);
+ wbio_init(&wbio->wbio.bio);
+ wbio->data = data;
+ wbio->data_bytes = bytes;
+ wbio->sector_offset = b->written;
+ wbio->wbio.c = c;
+ wbio->wbio.used_mempool = used_mempool;
+ wbio->wbio.first_btree_write = !b->written;
+ wbio->wbio.bio.bi_end_io = btree_node_write_endio;
+ wbio->wbio.bio.bi_private = b;
+
+ bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
+
+ bkey_copy(&wbio->key, &b->key);
+
+ b->written += sectors_to_write;
+
+ if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
+ bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
+ cpu_to_le16(b->written);
+
+ atomic64_inc(&c->btree_write_stats[type].nr);
+ atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
+
+ INIT_WORK(&wbio->work, btree_write_submit);
+ queue_work(c->io_complete_wq, &wbio->work);
+ return;
+err:
+ set_btree_node_noevict(b);
+ b->written += sectors_to_write;
+nowrite:
+ btree_bounce_free(c, bytes, used_mempool, data);
+ __btree_node_write_done(c, b);
+}
+
+/*
+ * Work that must be done with write lock held:
+ */
+bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
+{
+ bool invalidated_iter = false;
+ struct btree_node_entry *bne;
+ struct bset_tree *t;
+
+ if (!btree_node_just_written(b))
+ return false;
+
+ BUG_ON(b->whiteout_u64s);
+
+ clear_btree_node_just_written(b);
+
+ /*
+ * Note: immediately after write, bset_written() doesn't work - the
+ * amount of data we had to write after compaction might have been
+ * smaller than the offset of the last bset.
+ *
+ * However, we know that all bsets have been written here, as long as
+ * we're still holding the write lock:
+ */
+
+ /*
+ * XXX: decide if we really want to unconditionally sort down to a
+ * single bset:
+ */
+ if (b->nsets > 1) {
+ btree_node_sort(c, b, 0, b->nsets, true);
+ invalidated_iter = true;
+ } else {
+ invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
+ }
+
+ for_each_bset(b, t)
+ set_needs_whiteout(bset(b, t), true);
+
+ bch2_btree_verify(c, b);
+
+ /*
+ * If later we don't unconditionally sort down to a single bset, we have
+ * to ensure this is still true:
+ */
+ BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));
+
+ bne = want_new_bset(c, b);
+ if (bne)
+ bch2_bset_init_next(c, b, bne);
+
+ bch2_btree_build_aux_trees(b);
+
+ return invalidated_iter;
+}
+
+/*
+ * Use this one if the node is intent locked:
+ */
+void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
+ enum six_lock_type lock_type_held,
+ unsigned flags)
+{
+ if (lock_type_held == SIX_LOCK_intent ||
+ (lock_type_held == SIX_LOCK_read &&
+ six_lock_tryupgrade(&b->c.lock))) {
+ __bch2_btree_node_write(c, b, flags);
+
+ /* don't cycle lock unnecessarily: */
+ if (btree_node_just_written(b) &&
+ six_trylock_write(&b->c.lock)) {
+ bch2_btree_post_write_cleanup(c, b);
+ six_unlock_write(&b->c.lock);
+ }
+
+ if (lock_type_held == SIX_LOCK_read)
+ six_lock_downgrade(&b->c.lock);
+ } else {
+ __bch2_btree_node_write(c, b, flags);
+ if (lock_type_held == SIX_LOCK_write &&
+ btree_node_just_written(b))
+ bch2_btree_post_write_cleanup(c, b);
+ }
+}
+
+static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
+{
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct btree *b;
+ unsigned i;
+ bool ret = false;
+restart:
+ rcu_read_lock();
+ for_each_cached_btree(b, c, tbl, i, pos)
+ if (test_bit(flag, &b->flags)) {
+ rcu_read_unlock();
+ wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
+ ret = true;
+ goto restart;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+bool bch2_btree_flush_all_reads(struct bch_fs *c)
+{
+ return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
+}
+
+bool bch2_btree_flush_all_writes(struct bch_fs *c)
+{
+ return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
+}
+
+static const char * const bch2_btree_write_types[] = {
+#define x(t, n) [n] = #t,
+ BCH_BTREE_WRITE_TYPES()
+ NULL
+};
+
+void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ printbuf_tabstop_push(out, 20);
+ printbuf_tabstop_push(out, 10);
+
+ prt_tab(out);
+ prt_str(out, "nr");
+ prt_tab(out);
+ prt_str(out, "size");
+ prt_newline(out);
+
+ for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
+ u64 nr = atomic64_read(&c->btree_write_stats[i].nr);
+ u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes);
+
+ prt_printf(out, "%s:", bch2_btree_write_types[i]);
+ prt_tab(out);
+ prt_u64(out, nr);
+ prt_tab(out);
+ prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
+ prt_newline(out);
+ }
+}
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
new file mode 100644
index 000000000000..e0d7fa5b1dfb
--- /dev/null
+++ b/fs/bcachefs/btree_io.h
@@ -0,0 +1,225 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_IO_H
+#define _BCACHEFS_BTREE_IO_H
+
+#include "bkey_methods.h"
+#include "bset.h"
+#include "btree_locking.h"
+#include "checksum.h"
+#include "extents.h"
+#include "io_write_types.h"
+
+struct bch_fs;
+struct btree_write;
+struct btree;
+struct btree_iter;
+struct btree_node_read_all;
+
+static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
+{
+ if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
+ atomic_inc(&c->btree_cache.dirty);
+}
+
+static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
+{
+ if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
+ atomic_dec(&c->btree_cache.dirty);
+}
+
+static inline unsigned btree_ptr_sectors_written(struct bkey_i *k)
+{
+ return k->k.type == KEY_TYPE_btree_ptr_v2
+ ? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written)
+ : 0;
+}
+
+struct btree_read_bio {
+ struct bch_fs *c;
+ struct btree *b;
+ struct btree_node_read_all *ra;
+ u64 start_time;
+ unsigned have_ioref:1;
+ unsigned idx:7;
+ struct extent_ptr_decoded pick;
+ struct work_struct work;
+ struct bio bio;
+};
+
+struct btree_write_bio {
+ struct work_struct work;
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+ void *data;
+ unsigned data_bytes;
+ unsigned sector_offset;
+ struct bch_write_bio wbio;
+};
+
+void bch2_btree_node_io_unlock(struct btree *);
+void bch2_btree_node_io_lock(struct btree *);
+void __bch2_btree_node_wait_on_read(struct btree *);
+void __bch2_btree_node_wait_on_write(struct btree *);
+void bch2_btree_node_wait_on_read(struct btree *);
+void bch2_btree_node_wait_on_write(struct btree *);
+
+enum compact_mode {
+ COMPACT_LAZY,
+ COMPACT_ALL,
+};
+
+bool bch2_compact_whiteouts(struct bch_fs *, struct btree *,
+ enum compact_mode);
+
+static inline bool should_compact_bset_lazy(struct btree *b,
+ struct bset_tree *t)
+{
+ unsigned total_u64s = bset_u64s(t);
+ unsigned dead_u64s = bset_dead_u64s(b, t);
+
+ return dead_u64s > 64 && dead_u64s * 3 > total_u64s;
+}
+
+static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
+{
+ struct bset_tree *t;
+
+ for_each_bset(b, t)
+ if (should_compact_bset_lazy(b, t))
+ return bch2_compact_whiteouts(c, b, COMPACT_LAZY);
+
+ return false;
+}
+
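+/*
+ * Nonce for checksumming/encrypting a bset, derived from the bset's sequence
+ * numbers and its sector offset within the node.
+ */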
+static inline struct nonce btree_nonce(struct bset *i, unsigned offset)
+{
+ return (struct nonce) {{
+ [0] = cpu_to_le32(offset),
+ [1] = ((__le32 *) &i->seq)[0],
+ [2] = ((__le32 *) &i->seq)[1],
+ [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
+ }};
+}
+
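+/*
+ * Encrypt or decrypt (the cipher is a stream cipher, so the operation is the
+ * same) the bset at @offset; for the first bset, the btree node header fields
+ * from @flags onwards are included as well.
+ */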
+static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
+{
+ struct nonce nonce = btree_nonce(i, offset);
+ int ret;
+
+ if (!offset) {
+ struct btree_node *bn = container_of(i, struct btree_node, keys);
+ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
+
+ ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
+ &bn->flags, bytes);
+ if (ret)
+ return ret;
+
+ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
+ }
+
+ return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
+ vstruct_end(i) - (void *) i->_data);
+}
+
+void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
+
+void bch2_btree_node_drop_keys_outside_node(struct btree *);
+
+void bch2_btree_build_aux_trees(struct btree *);
+void bch2_btree_init_next(struct btree_trans *, struct btree *);
+
+int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
+ struct btree *, bool, bool *);
+void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
+int bch2_btree_root_read(struct bch_fs *, enum btree_id,
+ const struct bkey_i *, unsigned);
+
+bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
+
+enum btree_write_flags {
+ __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS,
+ __BTREE_WRITE_ALREADY_STARTED,
+};
+#define BTREE_WRITE_ONLY_IF_NEED BIT(__BTREE_WRITE_ONLY_IF_NEED)
+#define BTREE_WRITE_ALREADY_STARTED BIT(__BTREE_WRITE_ALREADY_STARTED)
+
+void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
+void bch2_btree_node_write(struct bch_fs *, struct btree *,
+ enum six_lock_type, unsigned);
+
+static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
+ enum six_lock_type lock_held)
+{
+ bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
+}
+
+bool bch2_btree_flush_all_reads(struct bch_fs *);
+bool bch2_btree_flush_all_writes(struct bch_fs *);
+
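+/*
+ * Compatibility helpers for btree nodes written by older metadata versions:
+ * translate key formats, positions and node headers between the old on-disk
+ * layout and the current one.
+ */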
+static inline void compat_bformat(unsigned level, enum btree_id btree_id,
+ unsigned version, unsigned big_endian,
+ int write, struct bkey_format *f)
+{
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id == BTREE_ID_inodes) {
+ swap(f->bits_per_field[BKEY_FIELD_INODE],
+ f->bits_per_field[BKEY_FIELD_OFFSET]);
+ swap(f->field_offset[BKEY_FIELD_INODE],
+ f->field_offset[BKEY_FIELD_OFFSET]);
+ }
+
+ if (version < bcachefs_metadata_version_snapshot &&
+ (level || btree_type_has_snapshots(btree_id))) {
+ u64 max_packed =
+ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
+
+ f->field_offset[BKEY_FIELD_SNAPSHOT] = write
+ ? 0
+ : cpu_to_le64(U32_MAX - max_packed);
+ }
+}
+
+static inline void compat_bpos(unsigned level, enum btree_id btree_id,
+ unsigned version, unsigned big_endian,
+ int write, struct bpos *p)
+{
+ if (big_endian != CPU_BIG_ENDIAN)
+ bch2_bpos_swab(p);
+
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id == BTREE_ID_inodes)
+ swap(p->inode, p->offset);
+}
+
+static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
+ unsigned version, unsigned big_endian,
+ int write,
+ struct btree_node *bn)
+{
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id_is_extents(btree_id) &&
+ !bpos_eq(bn->min_key, POS_MIN) &&
+ write)
+ bn->min_key = bpos_nosnap_predecessor(bn->min_key);
+
+ if (version < bcachefs_metadata_version_snapshot &&
+ write)
+ bn->max_key.snapshot = 0;
+
+ compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key);
+ compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key);
+
+ if (version < bcachefs_metadata_version_snapshot &&
+ !write)
+ bn->max_key.snapshot = U32_MAX;
+
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id_is_extents(btree_id) &&
+ !bpos_eq(bn->min_key, POS_MIN) &&
+ !write)
+ bn->min_key = bpos_nosnap_successor(bn->min_key);
+}
+
+void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_IO_H */
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
new file mode 100644
index 000000000000..da594e006769
--- /dev/null
+++ b/fs/bcachefs/btree_iter.c
@@ -0,0 +1,3261 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "bkey_buf.h"
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "btree_journal_iter.h"
+#include "btree_key_cache.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "journal.h"
+#include "replicas.h"
+#include "snapshot.h"
+#include "trace.h"
+
+#include <linux/random.h>
+#include <linux/prefetch.h>
+
+static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
+static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
+ struct btree_path *);
+
+static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
+{
+#ifdef TRACK_PATH_ALLOCATED
+ return iter->ip_allocated;
+#else
+ return 0;
+#endif
+}
+
+static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
+
+static inline int __btree_path_cmp(const struct btree_path *l,
+ enum btree_id r_btree_id,
+ bool r_cached,
+ struct bpos r_pos,
+ unsigned r_level)
+{
+ /*
+ * Must match lock ordering as defined by __bch2_btree_node_lock:
+ */
+ return cmp_int(l->btree_id, r_btree_id) ?:
+ cmp_int((int) l->cached, (int) r_cached) ?:
+ bpos_cmp(l->pos, r_pos) ?:
+ -cmp_int(l->level, r_level);
+}
+
+static inline int btree_path_cmp(const struct btree_path *l,
+ const struct btree_path *r)
+{
+ return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
+}
+
+static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
+{
+ /* Are we iterating over keys in all snapshots? */
+ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+ p = bpos_successor(p);
+ } else {
+ p = bpos_nosnap_successor(p);
+ p.snapshot = iter->snapshot;
+ }
+
+ return p;
+}
+
+static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
+{
+ /* Are we iterating over keys in all snapshots? */
+ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+ p = bpos_predecessor(p);
+ } else {
+ p = bpos_nosnap_predecessor(p);
+ p.snapshot = iter->snapshot;
+ }
+
+ return p;
+}
+
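+/*
+ * Position to start searching from: extents are indexed by their end position,
+ * so for extent iterators we search from just after iter->pos:
+ */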
+static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
+{
+ struct bpos pos = iter->pos;
+
+ if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ !bkey_eq(pos, POS_MAX))
+ pos = bkey_successor(iter, pos);
+ return pos;
+}
+
+static inline bool btree_path_pos_before_node(struct btree_path *path,
+ struct btree *b)
+{
+ return bpos_lt(path->pos, b->data->min_key);
+}
+
+static inline bool btree_path_pos_after_node(struct btree_path *path,
+ struct btree *b)
+{
+ return bpos_gt(path->pos, b->key.k.p);
+}
+
+static inline bool btree_path_pos_in_node(struct btree_path *path,
+ struct btree *b)
+{
+ return path->btree_id == b->c.btree_id &&
+ !btree_path_pos_before_node(path, b) &&
+ !btree_path_pos_after_node(path, b);
+}
+
+/* Btree iterator: */
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+static void bch2_btree_path_verify_cached(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ struct bkey_cached *ck;
+ bool locked = btree_node_locked(path, 0);
+
+ if (!bch2_btree_node_relock(trans, path, 0))
+ return;
+
+ ck = (void *) path->l[0].b;
+ BUG_ON(ck->key.btree_id != path->btree_id ||
+ !bkey_eq(ck->key.pos, path->pos));
+
+ if (!locked)
+ btree_node_unlock(trans, path, 0);
+}
+
+static void bch2_btree_path_verify_level(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+ struct btree_path_level *l;
+ struct btree_node_iter tmp;
+ bool locked;
+ struct bkey_packed *p, *k;
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ struct printbuf buf3 = PRINTBUF;
+ const char *msg;
+
+ if (!bch2_debug_check_iterators)
+ return;
+
+ l = &path->l[level];
+ tmp = l->iter;
+ locked = btree_node_locked(path, level);
+
+ if (path->cached) {
+ if (!level)
+ bch2_btree_path_verify_cached(trans, path);
+ return;
+ }
+
+ if (!btree_path_node(path, level))
+ return;
+
+ if (!bch2_btree_node_relock_notrace(trans, path, level))
+ return;
+
+ BUG_ON(!btree_path_pos_in_node(path, l->b));
+
+ bch2_btree_node_iter_verify(&l->iter, l->b);
+
+ /*
+ * For interior nodes, the iterator will have skipped past deleted keys:
+ */
+ p = level
+ ? bch2_btree_node_iter_prev(&tmp, l->b)
+ : bch2_btree_node_iter_prev_all(&tmp, l->b);
+ k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
+
+ if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
+ msg = "before";
+ goto err;
+ }
+
+ if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
+ msg = "after";
+ goto err;
+ }
+
+ if (!locked)
+ btree_node_unlock(trans, path, level);
+ return;
+err:
+ bch2_bpos_to_text(&buf1, path->pos);
+
+ if (p) {
+ struct bkey uk = bkey_unpack_key(l->b, p);
+
+ bch2_bkey_to_text(&buf2, &uk);
+ } else {
+ prt_printf(&buf2, "(none)");
+ }
+
+ if (k) {
+ struct bkey uk = bkey_unpack_key(l->b, k);
+
+ bch2_bkey_to_text(&buf3, &uk);
+ } else {
+ prt_printf(&buf3, "(none)");
+ }
+
+ panic("path should be %s key at level %u:\n"
+ "path pos %s\n"
+ "prev key %s\n"
+ "cur key %s\n",
+ msg, level, buf1.buf, buf2.buf, buf3.buf);
+}
+
+static void bch2_btree_path_verify(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ struct bch_fs *c = trans->c;
+ unsigned i;
+
+ EBUG_ON(path->btree_id >= BTREE_ID_NR);
+
+ for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
+ if (!path->l[i].b) {
+ BUG_ON(!path->cached &&
+ bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
+ break;
+ }
+
+ bch2_btree_path_verify_level(trans, path, i);
+ }
+
+ bch2_btree_path_verify_locks(path);
+}
+
+void bch2_trans_verify_paths(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ bch2_btree_path_verify(trans, path);
+}
+
+static void bch2_btree_iter_verify(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+
+ BUG_ON(iter->btree_id >= BTREE_ID_NR);
+
+ BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
+
+ BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+
+ BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
+ (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ !btree_type_has_snapshot_field(iter->btree_id));
+
+ if (iter->update_path)
+ bch2_btree_path_verify(trans, iter->update_path);
+ bch2_btree_path_verify(trans, iter->path);
+}
+
+static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
+{
+ BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ !iter->pos.snapshot);
+
+ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ iter->pos.snapshot != iter->snapshot);
+
+ BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
+ bkey_gt(iter->pos, iter->k.p));
+}
+
+static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
+{
+ struct btree_trans *trans = iter->trans;
+ struct btree_iter copy;
+ struct bkey_s_c prev;
+ int ret = 0;
+
+ if (!bch2_debug_check_iterators)
+ return 0;
+
+ if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
+ return 0;
+
+ if (bkey_err(k) || !k.k)
+ return 0;
+
+ BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
+ iter->snapshot,
+ k.k->p.snapshot));
+
+ bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
+ BTREE_ITER_NOPRESERVE|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ prev = bch2_btree_iter_prev(&copy);
+ if (!prev.k)
+ goto out;
+
+ ret = bkey_err(prev);
+ if (ret)
+ goto out;
+
+ if (bkey_eq(prev.k->p, k.k->p) &&
+ bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
+ prev.k->p.snapshot) > 0) {
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+
+ bch2_bkey_to_text(&buf1, k.k);
+ bch2_bkey_to_text(&buf2, prev.k);
+
+ panic("iter snap %u\n"
+ "k %s\n"
+ "prev %s\n",
+ iter->snapshot,
+ buf1.buf, buf2.buf);
+ }
+out:
+ bch2_trans_iter_exit(trans, &copy);
+ return ret;
+}
+
+void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
+ struct bpos pos, bool key_cache)
+{
+ struct btree_path *path;
+ unsigned idx;
+ struct printbuf buf = PRINTBUF;
+
+ btree_trans_sort_paths(trans);
+
+ trans_for_each_path_inorder(trans, path, idx) {
+ int cmp = cmp_int(path->btree_id, id) ?:
+ cmp_int(path->cached, key_cache);
+
+ if (cmp > 0)
+ break;
+ if (cmp < 0)
+ continue;
+
+ if (!btree_node_locked(path, 0) ||
+ !path->should_be_locked)
+ continue;
+
+ if (!key_cache) {
+ if (bkey_ge(pos, path->l[0].b->data->min_key) &&
+ bkey_le(pos, path->l[0].b->key.k.p))
+ return;
+ } else {
+ if (bkey_eq(pos, path->pos))
+ return;
+ }
+ }
+
+ bch2_dump_trans_paths_updates(trans);
+ bch2_bpos_to_text(&buf, pos);
+
+ panic("not locked: %s %s%s\n",
+ bch2_btree_id_str(id), buf.buf,
+ key_cache ? " cached" : "");
+}
+
+#else
+
+static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
+ struct btree_path *path, unsigned l) {}
+static inline void bch2_btree_path_verify(struct btree_trans *trans,
+ struct btree_path *path) {}
+static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
+static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
+static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
+
+#endif
+
+/* Btree path: fixups after btree updates */
+
+static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
+ struct btree *b,
+ struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ struct btree_node_iter_set *set;
+
+ btree_node_iter_for_each(iter, set)
+ if (set->end == t->end_offset) {
+ set->k = __btree_node_key_to_offset(b, k);
+ bch2_btree_node_iter_sort(iter, b);
+ return;
+ }
+
+ bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
+}
+
+static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
+ struct btree *b,
+ struct bkey_packed *where)
+{
+ struct btree_path_level *l = &path->l[b->c.level];
+
+ if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
+ return;
+
+ if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
+ bch2_btree_node_iter_advance(&l->iter, l->b);
+}
+
+void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
+ struct btree *b,
+ struct bkey_packed *where)
+{
+ struct btree_path *path;
+
+ trans_for_each_path_with_node(trans, b, path) {
+ __bch2_btree_path_fix_key_modified(path, b, where);
+ bch2_btree_path_verify_level(trans, path, b->c.level);
+ }
+}
+
+static void __bch2_btree_node_iter_fix(struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bset_tree *t,
+ struct bkey_packed *where,
+ unsigned clobber_u64s,
+ unsigned new_u64s)
+{
+ const struct bkey_packed *end = btree_bkey_last(b, t);
+ struct btree_node_iter_set *set;
+ unsigned offset = __btree_node_key_to_offset(b, where);
+ int shift = new_u64s - clobber_u64s;
+ unsigned old_end = t->end_offset - shift;
+ unsigned orig_iter_pos = node_iter->data[0].k;
+ bool iter_current_key_modified =
+ orig_iter_pos >= offset &&
+ orig_iter_pos <= offset + clobber_u64s;
+
+ btree_node_iter_for_each(node_iter, set)
+ if (set->end == old_end)
+ goto found;
+
+ /* didn't find the bset in the iterator - might have to re-add it: */
+ if (new_u64s &&
+ bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
+ bch2_btree_node_iter_push(node_iter, b, where, end);
+ goto fixup_done;
+ } else {
+ /* Iterator is after key that changed */
+ return;
+ }
+found:
+ set->end = t->end_offset;
+
+ /* Iterator hasn't gotten to the key that changed yet: */
+ if (set->k < offset)
+ return;
+
+ if (new_u64s &&
+ bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
+ set->k = offset;
+ } else if (set->k < offset + clobber_u64s) {
+ set->k = offset + new_u64s;
+ if (set->k == set->end)
+ bch2_btree_node_iter_set_drop(node_iter, set);
+ } else {
+ /* Iterator is after key that changed */
+ set->k = (int) set->k + shift;
+ return;
+ }
+
+ bch2_btree_node_iter_sort(node_iter, b);
+fixup_done:
+ if (node_iter->data[0].k != orig_iter_pos)
+ iter_current_key_modified = true;
+
+ /*
+ * When a new key is added, and the node iterator now points to that
+ * key, the iterator might have skipped past deleted keys that should
+ * come after the key the iterator now points to. We have to rewind to
+ * before those deleted keys - otherwise
+ * bch2_btree_node_iter_prev_all() breaks:
+ */
+ if (!bch2_btree_node_iter_end(node_iter) &&
+ iter_current_key_modified &&
+ b->c.level) {
+ struct bkey_packed *k, *k2, *p;
+
+ k = bch2_btree_node_iter_peek_all(node_iter, b);
+
+ for_each_bset(b, t) {
+ bool set_pos = false;
+
+ if (node_iter->data[0].end == t->end_offset)
+ continue;
+
+ k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
+
+ while ((p = bch2_bkey_prev_all(b, t, k2)) &&
+ bkey_iter_cmp(b, k, p) < 0) {
+ k2 = p;
+ set_pos = true;
+ }
+
+ if (set_pos)
+ btree_node_iter_set_set_pos(node_iter,
+ b, t, k2);
+ }
+ }
+}
+
+void bch2_btree_node_iter_fix(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bkey_packed *where,
+ unsigned clobber_u64s,
+ unsigned new_u64s)
+{
+ struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
+ struct btree_path *linked;
+
+ if (node_iter != &path->l[b->c.level].iter) {
+ __bch2_btree_node_iter_fix(path, b, node_iter, t,
+ where, clobber_u64s, new_u64s);
+
+ if (bch2_debug_check_iterators)
+ bch2_btree_node_iter_verify(node_iter, b);
+ }
+
+ trans_for_each_path_with_node(trans, b, linked) {
+ __bch2_btree_node_iter_fix(linked, b,
+ &linked->l[b->c.level].iter, t,
+ where, clobber_u64s, new_u64s);
+ bch2_btree_path_verify_level(trans, linked, b->c.level);
+ }
+}
+
+/* Btree path level: pointer to a particular btree node and node iter */
+
+static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
+ struct btree_path_level *l,
+ struct bkey *u,
+ struct bkey_packed *k)
+{
+ if (unlikely(!k)) {
+ /*
+ * signal to bch2_btree_iter_peek_slot() that we're currently at
+ * a hole
+ */
+ u->type = KEY_TYPE_deleted;
+ return bkey_s_c_null;
+ }
+
+ return bkey_disassemble(l->b, k, u);
+}
+
+static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
+ struct btree_path_level *l,
+ struct bkey *u)
+{
+ return __btree_iter_unpack(c, l, u,
+ bch2_btree_node_iter_peek_all(&l->iter, l->b));
+}
+
+static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_path_level *l,
+ struct bkey *u)
+{
+ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
+ bch2_btree_node_iter_peek(&l->iter, l->b));
+
+ path->pos = k.k ? k.k->p : l->b->key.k.p;
+ trans->paths_sorted = false;
+ bch2_btree_path_verify_level(trans, path, l - path->l);
+ return k;
+}
+
+static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_path_level *l,
+ struct bkey *u)
+{
+ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
+ bch2_btree_node_iter_prev(&l->iter, l->b));
+
+ path->pos = k.k ? k.k->p : l->b->data->min_key;
+ trans->paths_sorted = false;
+ bch2_btree_path_verify_level(trans, path, l - path->l);
+ return k;
+}
+
+static inline bool btree_path_advance_to_pos(struct btree_path *path,
+ struct btree_path_level *l,
+ int max_advance)
+{
+ struct bkey_packed *k;
+ int nr_advanced = 0;
+
+ while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
+ bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
+ if (max_advance > 0 && nr_advanced >= max_advance)
+ return false;
+
+ bch2_btree_node_iter_advance(&l->iter, l->b);
+ nr_advanced++;
+ }
+
+ return true;
+}
+
+static inline void __btree_path_level_init(struct btree_path *path,
+ unsigned level)
+{
+ struct btree_path_level *l = &path->l[level];
+
+ bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
+
+ /*
+ * Iterators to interior nodes should always be pointed at the first
+ * non-whiteout:
+ */
+ if (level)
+ bch2_btree_node_iter_peek(&l->iter, l->b);
+}
+
+void bch2_btree_path_level_init(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
+{
+ BUG_ON(path->cached);
+
+ EBUG_ON(!btree_path_pos_in_node(path, b));
+
+ path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
+ path->l[b->c.level].b = b;
+ __btree_path_level_init(path, b->c.level);
+}
+
+/* Btree path: fixups after btree node updates: */
+
+static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i;
+
+ trans_for_each_update(trans, i)
+ if (!i->cached &&
+ i->level == b->c.level &&
+ i->btree_id == b->c.btree_id &&
+ bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
+ bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
+ i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v;
+
+ if (unlikely(trans->journal_replay_not_finished)) {
+ struct bkey_i *j_k =
+ bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
+ i->k->k.p);
+
+ if (j_k) {
+ i->old_k = j_k->k;
+ i->old_v = &j_k->v;
+ }
+ }
+ }
+}
+
+/*
+ * A btree node is being replaced - update the iterator to point to the new
+ * node:
+ */
+void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ if (path->uptodate == BTREE_ITER_UPTODATE &&
+ !path->cached &&
+ btree_path_pos_in_node(path, b)) {
+ enum btree_node_locked_type t =
+ btree_lock_want(path, b->c.level);
+
+ if (t != BTREE_NODE_UNLOCKED) {
+ btree_node_unlock(trans, path, b->c.level);
+ six_lock_increment(&b->c.lock, (enum six_lock_type) t);
+ mark_btree_node_locked(trans, path, b->c.level, t);
+ }
+
+ bch2_btree_path_level_init(trans, path, b);
+ }
+
+ bch2_trans_revalidate_updates_in_node(trans, b);
+}
+
+/*
+ * A btree node has been modified in such a way as to invalidate iterators - fix
+ * them:
+ */
+void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
+{
+ struct btree_path *path;
+
+ trans_for_each_path_with_node(trans, b, path)
+ __btree_path_level_init(path, b->c.level);
+
+ bch2_trans_revalidate_updates_in_node(trans, b);
+}
+
+/* Btree path: traverse, set_pos: */
+
+static inline int btree_path_lock_root(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned depth_want,
+ unsigned long trace_ip)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b;
+ enum six_lock_type lock_type;
+ unsigned i;
+ int ret;
+
+ EBUG_ON(path->nodes_locked);
+
+ while (1) {
+ b = READ_ONCE(*rootp);
+ path->level = READ_ONCE(b->c.level);
+
+ if (unlikely(path->level < depth_want)) {
+ /*
+ * the root is at a lower depth than the depth we want:
+ * either we've reached the end of the btree, or we're
+ * walking nodes at or above some depth and there are no
+ * nodes at or above that depth
+ */
+ path->level = depth_want;
+ for (i = path->level; i < BTREE_MAX_DEPTH; i++)
+ path->l[i].b = NULL;
+ return 1;
+ }
+
+ lock_type = __btree_lock_want(path, path->level);
+ ret = btree_node_lock(trans, path, &b->c,
+ path->level, lock_type, trace_ip);
+ if (unlikely(ret)) {
+ if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
+ continue;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ret;
+ BUG();
+ }
+
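+ /* recheck that the root didn't change while we were waiting on the lock: */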
+ if (likely(b == READ_ONCE(*rootp) &&
+ b->c.level == path->level &&
+ !race_fault())) {
+ for (i = 0; i < path->level; i++)
+ path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
+ path->l[path->level].b = b;
+ for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
+ path->l[i].b = NULL;
+
+ mark_btree_node_locked(trans, path, path->level,
+ (enum btree_node_locked_type) lock_type);
+ bch2_btree_path_level_init(trans, path, b);
+ return 0;
+ }
+
+ six_unlock_type(&b->c.lock, lock_type);
+ }
+}
+
+noinline
+static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path_level *l = path_l(path);
+ struct btree_node_iter node_iter = l->iter;
+ struct bkey_packed *k;
+ struct bkey_buf tmp;
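+ /* prefetch more aggressively before the filesystem has finished starting up: */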
+ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+ ? (path->level > 1 ? 0 : 2)
+ : (path->level > 1 ? 1 : 16);
+ bool was_locked = btree_node_locked(path, path->level);
+ int ret = 0;
+
+ bch2_bkey_buf_init(&tmp);
+
+ while (nr-- && !ret) {
+ if (!bch2_btree_node_relock(trans, path, path->level))
+ break;
+
+ bch2_btree_node_iter_advance(&node_iter, l->b);
+ k = bch2_btree_node_iter_peek(&node_iter, l->b);
+ if (!k)
+ break;
+
+ bch2_bkey_buf_unpack(&tmp, c, l->b, k);
+ ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
+ path->level - 1);
+ }
+
+ if (!was_locked)
+ btree_node_unlock(trans, path, path->level);
+
+ bch2_bkey_buf_exit(&tmp, c);
+ return ret;
+}
+
+static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
+ struct btree_and_journal_iter *jiter)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ struct bkey_buf tmp;
+ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+ ? (path->level > 1 ? 0 : 2)
+ : (path->level > 1 ? 1 : 16);
+ bool was_locked = btree_node_locked(path, path->level);
+ int ret = 0;
+
+ bch2_bkey_buf_init(&tmp);
+
+ while (nr-- && !ret) {
+ if (!bch2_btree_node_relock(trans, path, path->level))
+ break;
+
+ bch2_btree_and_journal_iter_advance(jiter);
+ k = bch2_btree_and_journal_iter_peek(jiter);
+ if (!k.k)
+ break;
+
+ bch2_bkey_buf_reassemble(&tmp, c, k);
+ ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
+ path->level - 1);
+ }
+
+ if (!was_locked)
+ btree_node_unlock(trans, path, path->level);
+
+ bch2_bkey_buf_exit(&tmp, c);
+ return ret;
+}
+
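+/*
+ * Cache the in-memory address of a child node in the parent's btree_ptr_v2
+ * key, so future traversals can find it without a btree cache lookup:
+ */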
+static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned plevel, struct btree *b)
+{
+ struct btree_path_level *l = &path->l[plevel];
+ bool locked = btree_node_locked(path, plevel);
+ struct bkey_packed *k;
+ struct bch_btree_ptr_v2 *bp;
+
+ if (!bch2_btree_node_relock(trans, path, plevel))
+ return;
+
+ k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
+ BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
+
+ bp = (void *) bkeyp_val(&l->b->format, k);
+ bp->mem_ptr = (unsigned long)b;
+
+ if (!locked)
+ btree_node_unlock(trans, path, plevel);
+}
+
+static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned flags,
+ struct bkey_buf *out)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path_level *l = path_l(path);
+ struct btree_and_journal_iter jiter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
+
+ k = bch2_btree_and_journal_iter_peek(&jiter);
+
+ bch2_bkey_buf_reassemble(out, c, k);
+
+ if (flags & BTREE_ITER_PREFETCH)
+ ret = btree_path_prefetch_j(trans, path, &jiter);
+
+ bch2_btree_and_journal_iter_exit(&jiter);
+ return ret;
+}
+
+static __always_inline int btree_path_down(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned flags,
+ unsigned long trace_ip)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path_level *l = path_l(path);
+ struct btree *b;
+ unsigned level = path->level - 1;
+ enum six_lock_type lock_type = __btree_lock_want(path, level);
+ struct bkey_buf tmp;
+ int ret;
+
+ EBUG_ON(!btree_node_locked(path, path->level));
+
+ bch2_bkey_buf_init(&tmp);
+
+ if (unlikely(trans->journal_replay_not_finished)) {
+ ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
+ if (ret)
+ goto err;
+ } else {
+ bch2_bkey_buf_unpack(&tmp, c, l->b,
+ bch2_btree_node_iter_peek(&l->iter, l->b));
+
+ if (flags & BTREE_ITER_PREFETCH) {
+ ret = btree_path_prefetch(trans, path);
+ if (ret)
+ goto err;
+ }
+ }
+
+ b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (unlikely(ret))
+ goto err;
+
+ if (likely(!trans->journal_replay_not_finished &&
+ tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
+ unlikely(b != btree_node_mem_ptr(tmp.k)))
+ btree_node_mem_ptr_set(trans, path, level + 1, b);
+
+ if (btree_node_read_locked(path, level + 1))
+ btree_node_unlock(trans, path, level + 1);
+
+ mark_btree_node_locked(trans, path, level,
+ (enum btree_node_locked_type) lock_type);
+ path->level = level;
+ bch2_btree_path_level_init(trans, path, b);
+
+ bch2_btree_path_verify_locks(path);
+err:
+ bch2_bkey_buf_exit(&tmp, c);
+ return ret;
+}
+
+
+static int bch2_btree_path_traverse_all(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
+ unsigned long trace_ip = _RET_IP_;
+ int i, ret = 0;
+
+ if (trans->in_traverse_all)
+ return -BCH_ERR_transaction_restart_in_traverse_all;
+
+ trans->in_traverse_all = true;
+retry_all:
+ trans->restarted = 0;
+ trans->last_restarted_ip = 0;
+
+ trans_for_each_path(trans, path)
+ path->should_be_locked = false;
+
+ btree_trans_sort_paths(trans);
+
+ bch2_trans_unlock(trans);
+ cond_resched();
+
+ if (unlikely(trans->memory_allocation_failure)) {
+ struct closure cl;
+
+ closure_init_stack(&cl);
+
+ do {
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ closure_sync(&cl);
+ } while (ret);
+ }
+
+ /* Now, redo traversals in correct order: */
+ i = 0;
+ while (i < trans->nr_sorted) {
+ path = trans->paths + trans->sorted[i];
+
+ /*
+ * Traversing a path can cause another path to be added at about
+ * the same position:
+ */
+ if (path->uptodate) {
+ __btree_path_get(path, false);
+ ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_);
+ __btree_path_put(path, false);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret, ENOMEM))
+ goto retry_all;
+ if (ret)
+ goto err;
+ } else {
+ i++;
+ }
+ }
+
+ /*
+ * We used to assert that all paths had been traversed here
+ * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
+ * path->should_be_locked is not set yet, we might have unlocked and
+ * then failed to relock a path - that's fine.
+ */
+err:
+ bch2_btree_cache_cannibalize_unlock(c);
+
+ trans->in_traverse_all = false;
+
+ trace_and_count(c, trans_traverse_all, trans, trace_ip);
+ return ret;
+}
+
+static inline bool btree_path_check_pos_in_node(struct btree_path *path,
+ unsigned l, int check_pos)
+{
+ if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
+ return false;
+ if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
+ return false;
+ return true;
+}
+
+static inline bool btree_path_good_node(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned l, int check_pos)
+{
+ return is_btree_node(path, l) &&
+ bch2_btree_node_relock(trans, path, l) &&
+ btree_path_check_pos_in_node(path, l, check_pos);
+}
+
+static void btree_path_set_level_down(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_level)
+{
+ unsigned l;
+
+ path->level = new_level;
+
+ for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
+ if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
+ btree_node_unlock(trans, path, l);
+
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ bch2_btree_path_verify(trans, path);
+}
+
+static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
+ struct btree_path *path,
+ int check_pos)
+{
+ unsigned i, l = path->level;
+again:
+ while (btree_path_node(path, l) &&
+ !btree_path_good_node(trans, path, l, check_pos))
+ __btree_path_set_level_up(trans, path, l++);
+
+ /* If we need intent locks, take them too: */
+ for (i = l + 1;
+ i < path->locks_want && btree_path_node(path, i);
+ i++)
+ if (!bch2_btree_node_relock(trans, path, i)) {
+ while (l <= i)
+ __btree_path_set_level_up(trans, path, l++);
+ goto again;
+ }
+
+ return l;
+}
+
+static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
+ struct btree_path *path,
+ int check_pos)
+{
+ return likely(btree_node_locked(path, path->level) &&
+ btree_path_check_pos_in_node(path, path->level, check_pos))
+ ? path->level
+ : __btree_path_up_until_good_node(trans, path, check_pos);
+}
+
+/*
+ * This is the main state machine for walking down the btree - walks down to a
+ * specified depth
+ *
+ * Returns 0 on success, -EIO on error (error reading in a btree node).
+ *
+ * On error, caller (peek_node()/peek_key()) must return NULL; the error is
+ * stashed in the iterator and returned from bch2_trans_exit().
+ */
+int bch2_btree_path_traverse_one(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned flags,
+ unsigned long trace_ip)
+{
+ unsigned depth_want = path->level;
+ int ret = -((int) trans->restarted);
+
+ if (unlikely(ret))
+ goto out;
+
+ if (unlikely(!trans->srcu_held))
+ bch2_trans_srcu_lock(trans);
+
+ /*
+ * Ensure we obey path->should_be_locked: if it's set, we can't unlock
+ * and re-traverse the path without a transaction restart:
+ */
+ if (path->should_be_locked) {
+ ret = bch2_btree_path_relock(trans, path, trace_ip);
+ goto out;
+ }
+
+ if (path->cached) {
+ ret = bch2_btree_path_traverse_cached(trans, path, flags);
+ goto out;
+ }
+
+ if (unlikely(path->level >= BTREE_MAX_DEPTH))
+ goto out;
+
+ path->level = btree_path_up_until_good_node(trans, path, 0);
+
+ EBUG_ON(btree_path_node(path, path->level) &&
+ !btree_node_locked(path, path->level));
+
+ /*
+ * Note: path->nodes[path->level] may be temporarily NULL here - that
+ * would indicate to other code that we got to the end of the btree,
+ * here it indicates that relocking the root failed - it's critical that
+ * btree_path_lock_root() comes next and that it can't fail
+ */
+ while (path->level > depth_want) {
+ ret = btree_path_node(path, path->level)
+ ? btree_path_down(trans, path, flags, trace_ip)
+ : btree_path_lock_root(trans, path, depth_want, trace_ip);
+ if (unlikely(ret)) {
+ if (ret == 1) {
+ /*
+ * No nodes at this level - got to the end of
+ * the btree:
+ */
+ ret = 0;
+ goto out;
+ }
+
+ __bch2_btree_path_unlock(trans, path);
+ path->level = depth_want;
+ path->l[path->level].b = ERR_PTR(ret);
+ goto out;
+ }
+ }
+
+ path->uptodate = BTREE_ITER_UPTODATE;
+out:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
+ panic("ret %s (%i) trans->restarted %s (%i)\n",
+ bch2_err_str(ret), ret,
+ bch2_err_str(trans->restarted), trans->restarted);
+ bch2_btree_path_verify(trans, path);
+ return ret;
+}
+
+static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
+ struct btree_path *src)
+{
+ unsigned i, offset = offsetof(struct btree_path, pos);
+
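+ /* only copy the fields from ->pos onwards, preserving dst's own identity fields (idx, refcounts): */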
+ memcpy((void *) dst + offset,
+ (void *) src + offset,
+ sizeof(struct btree_path) - offset);
+
+ for (i = 0; i < BTREE_MAX_DEPTH; i++) {
+ unsigned t = btree_node_locked_type(dst, i);
+
+ if (t != BTREE_NODE_UNLOCKED)
+ six_lock_increment(&dst->l[i].b->c.lock, t);
+ }
+}
+
+static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
+ bool intent)
+{
+ struct btree_path *new = btree_path_alloc(trans, src);
+
+ btree_path_copy(trans, new, src);
+ __btree_path_get(new, intent);
+ return new;
+}
+
+__flatten
+struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans,
+ struct btree_path *path, bool intent,
+ unsigned long ip)
+{
+ __btree_path_put(path, intent);
+ path = btree_path_clone(trans, path, intent);
+ path->preserve = false;
+ return path;
+}
+
+struct btree_path * __must_check
+__bch2_btree_path_set_pos(struct btree_trans *trans,
+ struct btree_path *path, struct bpos new_pos,
+ bool intent, unsigned long ip, int cmp)
+{
+ unsigned level = path->level;
+
+ bch2_trans_verify_not_in_restart(trans);
+ EBUG_ON(!path->ref);
+
+ path = bch2_btree_path_make_mut(trans, path, intent, ip);
+
+ path->pos = new_pos;
+ trans->paths_sorted = false;
+
+ if (unlikely(path->cached)) {
+ btree_node_unlock(trans, path, 0);
+ path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ goto out;
+ }
+
+ level = btree_path_up_until_good_node(trans, path, cmp);
+
+ if (btree_path_node(path, level)) {
+ struct btree_path_level *l = &path->l[level];
+
+ BUG_ON(!btree_node_locked(path, level));
+ /*
+ * We might have to skip over many keys, or just a few: try
+ * advancing the node iterator, and if we have to skip over too
+ * many keys just reinit it (or if we're rewinding, since that
+ * is expensive).
+ */
+ if (cmp < 0 ||
+ !btree_path_advance_to_pos(path, l, 8))
+ bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
+
+ /*
+ * Iterators to interior nodes should always be pointed at the first
+ * non-whiteout:
+ */
+ if (unlikely(level))
+ bch2_btree_node_iter_peek(&l->iter, l->b);
+ }
+
+ if (unlikely(level != path->level)) {
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ __bch2_btree_path_unlock(trans, path);
+ }
+out:
+ bch2_btree_path_verify(trans, path);
+ return path;
+}
+
+/* Btree path: main interface: */
+
+static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
+{
+ struct btree_path *sib;
+
+ sib = prev_btree_path(trans, path);
+ if (sib && !btree_path_cmp(sib, path))
+ return sib;
+
+ sib = next_btree_path(trans, path);
+ if (sib && !btree_path_cmp(sib, path))
+ return sib;
+
+ return NULL;
+}
+
+static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
+{
+ struct btree_path *sib;
+
+ sib = prev_btree_path(trans, path);
+ if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
+ return sib;
+
+ sib = next_btree_path(trans, path);
+ if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
+ return sib;
+
+ return NULL;
+}
+
+static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
+{
+ __bch2_btree_path_unlock(trans, path);
+ btree_path_list_remove(trans, path);
+ trans->paths_allocated &= ~(1ULL << path->idx);
+}
+
+void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
+{
+ struct btree_path *dup;
+
+ EBUG_ON(trans->paths + path->idx != path);
+ EBUG_ON(!path->ref);
+
+ if (!__btree_path_put(path, intent))
+ return;
+
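+ /*
+ * Only free this path if another path can take over its role, or if it's
+ * neither preserved nor pointing at a valid btree node:
+ */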
+ dup = path->preserve
+ ? have_path_at_pos(trans, path)
+ : have_node_at_pos(trans, path);
+
+ if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
+ return;
+
+ if (path->should_be_locked &&
+ !trans->restarted &&
+ (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
+ return;
+
+ if (dup) {
+ dup->preserve |= path->preserve;
+ dup->should_be_locked |= path->should_be_locked;
+ }
+
+ __bch2_path_free(trans, path);
+}
+
+static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path,
+ bool intent)
+{
+ EBUG_ON(trans->paths + path->idx != path);
+ EBUG_ON(!path->ref);
+
+ if (!__btree_path_put(path, intent))
+ return;
+
+ __bch2_path_free(trans, path);
+}
+
+void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
+{
+ panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
+ trans->restart_count, restart_count,
+ (void *) trans->last_begin_ip);
+}
+
+void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
+{
+ panic("in transaction restart: %s, last restarted by %pS\n",
+ bch2_err_str(trans->restarted),
+ (void *) trans->last_restarted_ip);
+}
+
+noinline __cold
+void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
+{
+ struct btree_insert_entry *i;
+ struct btree_write_buffered_key *wb;
+
+ prt_printf(buf, "transaction updates for %s journal seq %llu",
+ trans->fn, trans->journal_res.seq);
+ prt_newline(buf);
+ printbuf_indent_add(buf, 2);
+
+ trans_for_each_update(trans, i) {
+ struct bkey_s_c old = { &i->old_k, i->old_v };
+
+ prt_printf(buf, "update: btree=%s cached=%u %pS",
+ bch2_btree_id_str(i->btree_id),
+ i->cached,
+ (void *) i->ip_allocated);
+ prt_newline(buf);
+
+ prt_printf(buf, " old ");
+ bch2_bkey_val_to_text(buf, trans->c, old);
+ prt_newline(buf);
+
+ prt_printf(buf, " new ");
+ bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
+ prt_newline(buf);
+ }
+
+ trans_for_each_wb_update(trans, wb) {
+ prt_printf(buf, "update: btree=%s wb=1 %pS",
+ bch2_btree_id_str(wb->btree),
+ (void *) i->ip_allocated);
+ prt_newline(buf);
+
+ prt_printf(buf, " new ");
+ bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k));
+ prt_newline(buf);
+ }
+
+ printbuf_indent_sub(buf, 2);
+}
+
+noinline __cold
+void bch2_dump_trans_updates(struct btree_trans *trans)
+{
+ struct printbuf buf = PRINTBUF;
+
+ bch2_trans_updates_to_text(&buf, trans);
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+}
+
+noinline __cold
+void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
+{
+ prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
+ path->idx, path->ref, path->intent_ref,
+ path->preserve ? 'P' : ' ',
+ path->should_be_locked ? 'S' : ' ',
+ bch2_btree_id_str(path->btree_id),
+ path->level);
+ bch2_bpos_to_text(out, path->pos);
+
+ prt_printf(out, " locks %u", path->nodes_locked);
+#ifdef TRACK_PATH_ALLOCATED
+ prt_printf(out, " %pS", (void *) path->ip_allocated);
+#endif
+ prt_newline(out);
+}
+
+static noinline __cold
+void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
+ bool nosort)
+{
+ struct btree_path *path;
+ unsigned idx;
+
+ if (!nosort)
+ btree_trans_sort_paths(trans);
+
+ trans_for_each_path_inorder(trans, path, idx)
+ bch2_btree_path_to_text(out, path);
+}
+
+noinline __cold
+void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
+{
+ __bch2_trans_paths_to_text(out, trans, false);
+}
+
+static noinline __cold
+void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
+{
+ struct printbuf buf = PRINTBUF;
+
+ __bch2_trans_paths_to_text(&buf, trans, nosort);
+ bch2_trans_updates_to_text(&buf, trans);
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+}
+
+noinline __cold
+void bch2_dump_trans_paths_updates(struct btree_trans *trans)
+{
+ __bch2_dump_trans_paths_updates(trans, false);
+}
+
+noinline __cold
+static void bch2_trans_update_max_paths(struct btree_trans *trans)
+{
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+ struct printbuf buf = PRINTBUF;
+
+ if (!s)
+ return;
+
+ bch2_trans_paths_to_text(&buf, trans);
+
+ if (!buf.allocation_failure) {
+ mutex_lock(&s->lock);
+ if (s->nr_max_paths < hweight64(trans->paths_allocated)) {
+ s->nr_max_paths = trans->nr_max_paths =
+ hweight64(trans->paths_allocated);
+ swap(s->max_paths_text, buf.buf);
+ }
+ mutex_unlock(&s->lock);
+ }
+
+ printbuf_exit(&buf);
+
+ trans->nr_max_paths = hweight64(trans->paths_allocated);
+}
+
+static noinline void btree_path_overflow(struct btree_trans *trans)
+{
+ bch2_dump_trans_paths_updates(trans);
+ panic("trans path overflow\n");
+}
+
+static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
+ struct btree_path *pos)
+{
+ struct btree_path *path;
+ unsigned idx;
+
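+ /* check whether all BTREE_ITER_MAX path slots are already allocated: */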
+ if (unlikely(trans->paths_allocated ==
+ ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
+ btree_path_overflow(trans);
+
+ idx = __ffs64(~trans->paths_allocated);
+
+ /*
+ * Do this before marking the new path as allocated, since it won't be
+ * initialized yet:
+ */
+ if (unlikely(idx > trans->nr_max_paths))
+ bch2_trans_update_max_paths(trans);
+
+ trans->paths_allocated |= 1ULL << idx;
+
+ path = &trans->paths[idx];
+ path->idx = idx;
+ path->ref = 0;
+ path->intent_ref = 0;
+ path->nodes_locked = 0;
+ path->alloc_seq++;
+
+ btree_path_list_add(trans, pos, path);
+ trans->paths_sorted = false;
+ return path;
+}
+
+struct btree_path *bch2_path_get(struct btree_trans *trans,
+ enum btree_id btree_id, struct bpos pos,
+ unsigned locks_want, unsigned level,
+ unsigned flags, unsigned long ip)
+{
+ struct btree_path *path, *path_pos = NULL;
+ bool cached = flags & BTREE_ITER_CACHED;
+ bool intent = flags & BTREE_ITER_INTENT;
+ int i;
+
+ bch2_trans_verify_not_in_restart(trans);
+ bch2_trans_verify_locks(trans);
+
+ btree_trans_sort_paths(trans);
+
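+ /* find the last existing path at or before the requested position: */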
+ trans_for_each_path_inorder(trans, path, i) {
+ if (__btree_path_cmp(path,
+ btree_id,
+ cached,
+ pos,
+ level) > 0)
+ break;
+
+ path_pos = path;
+ }
+
+ if (path_pos &&
+ path_pos->cached == cached &&
+ path_pos->btree_id == btree_id &&
+ path_pos->level == level) {
+ __btree_path_get(path_pos, intent);
+ path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
+ } else {
+ path = btree_path_alloc(trans, path_pos);
+ path_pos = NULL;
+
+ __btree_path_get(path, intent);
+ path->pos = pos;
+ path->btree_id = btree_id;
+ path->cached = cached;
+ path->uptodate = BTREE_ITER_NEED_TRAVERSE;
+ path->should_be_locked = false;
+ path->level = level;
+ path->locks_want = locks_want;
+ path->nodes_locked = 0;
+ for (i = 0; i < ARRAY_SIZE(path->l); i++)
+ path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+#ifdef TRACK_PATH_ALLOCATED
+ path->ip_allocated = ip;
+#endif
+ trans->paths_sorted = false;
+ }
+
+ if (!(flags & BTREE_ITER_NOPRESERVE))
+ path->preserve = true;
+
+ if (path->intent_ref)
+ locks_want = max(locks_want, level + 1);
+
+ /*
+ * If the path has locks_want greater than requested, we don't downgrade
+ * it here: when a transaction restart was caused by a btree node split
+ * needing to upgrade locks, we may be putting and re-getting this
+ * iterator and want to keep the stronger locks.
+ * Downgrading iterators only happens via bch2_trans_downgrade(), after
+ * a successful transaction commit.
+ */
+
+ locks_want = min(locks_want, BTREE_MAX_DEPTH);
+ if (locks_want > path->locks_want)
+ bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
+
+ return path;
+}
+
+struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
+{
+
+ struct btree_path_level *l = path_l(path);
+ struct bkey_packed *_k;
+ struct bkey_s_c k;
+
+ if (unlikely(!l->b))
+ return bkey_s_c_null;
+
+ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
+ EBUG_ON(!btree_node_locked(path, path->level));
+
+ if (!path->cached) {
+ _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
+ k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
+
+ EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos));
+
+ if (!k.k || !bpos_eq(path->pos, k.k->p))
+ goto hole;
+ } else {
+ struct bkey_cached *ck = (void *) path->l[0].b;
+
+ EBUG_ON(ck &&
+ (path->btree_id != ck->key.btree_id ||
+ !bkey_eq(path->pos, ck->key.pos)));
+ if (!ck || !ck->valid)
+ return bkey_s_c_null;
+
+ *u = ck->k->k;
+ k = bkey_i_to_s_c(ck->k);
+ }
+
+ return k;
+hole:
+ bkey_init(u);
+ u->p = path->pos;
+ return (struct bkey_s_c) { u, NULL };
+}
+
+/* Btree iterators: */
+
+int __must_check
+__bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+ return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+}
+
+int __must_check
+bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+ int ret;
+
+ iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
+ btree_iter_search_key(iter),
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+ if (ret)
+ return ret;
+
+ btree_path_set_should_be_locked(iter->path);
+ return 0;
+}
+
+/* Iterate across nodes (leaf and interior nodes) */
+
+struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+ struct btree *b = NULL;
+ int ret;
+
+ EBUG_ON(iter->path->cached);
+ bch2_btree_iter_verify(iter);
+
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (ret)
+ goto err;
+
+ b = btree_path_node(iter->path, iter->path->level);
+ if (!b)
+ goto out;
+
+ BUG_ON(bpos_lt(b->key.k.p, iter->pos));
+
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos = b->key.k.p;
+
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+ btree_path_set_should_be_locked(iter->path);
+out:
+ bch2_btree_iter_verify_entry_exit(iter);
+ bch2_btree_iter_verify(iter);
+
+ return b;
+err:
+ b = ERR_PTR(ret);
+ goto out;
+}
+
+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
+{
+ struct btree *b;
+
+ while (b = bch2_btree_iter_peek_node(iter),
+ bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
+ bch2_trans_begin(iter->trans);
+
+ return b;
+}
+
+struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+ struct btree_path *path = iter->path;
+ struct btree *b = NULL;
+ int ret;
+
+ bch2_trans_verify_not_in_restart(trans);
+ EBUG_ON(iter->path->cached);
+ bch2_btree_iter_verify(iter);
+
+ /* already at end? */
+ if (!btree_path_node(path, path->level))
+ return NULL;
+
+ /* got to end? */
+ if (!btree_path_node(path, path->level + 1)) {
+ btree_path_set_level_up(trans, path);
+ return NULL;
+ }
+
+ if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
+ __bch2_btree_path_unlock(trans, path);
+ path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
+ path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+ goto err;
+ }
+
+ b = btree_path_node(path, path->level + 1);
+
+ if (bpos_eq(iter->pos, b->key.k.p)) {
+ __btree_path_set_level_up(trans, path, path->level++);
+ } else {
+ /*
+ * Haven't gotten to the end of the parent node: go back down to
+ * the next child node
+ */
+ path = iter->path =
+ bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ btree_path_set_level_down(trans, path, iter->min_depth);
+
+ ret = bch2_btree_path_traverse(trans, path, iter->flags);
+ if (ret)
+ goto err;
+
+ b = path->l[path->level].b;
+ }
+
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos = b->key.k.p;
+
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+ btree_path_set_should_be_locked(iter->path);
+ BUG_ON(iter->path->uptodate);
+out:
+ bch2_btree_iter_verify_entry_exit(iter);
+ bch2_btree_iter_verify(iter);
+
+ return b;
+err:
+ b = ERR_PTR(ret);
+ goto out;
+}
+
+/* Iterate across keys (in leaf nodes only) */
+
+inline bool bch2_btree_iter_advance(struct btree_iter *iter)
+{
+ if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
+ struct bpos pos = iter->k.p;
+ bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ ? bpos_eq(pos, SPOS_MAX)
+ : bkey_eq(pos, SPOS_MAX));
+
+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ pos = bkey_successor(iter, pos);
+ bch2_btree_iter_set_pos(iter, pos);
+ return ret;
+ } else {
+ if (!btree_path_node(iter->path, iter->path->level))
+ return true;
+
+ iter->advanced = true;
+ return false;
+ }
+}
+
+inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
+{
+ struct bpos pos = bkey_start_pos(&iter->k);
+ bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ ? bpos_eq(pos, POS_MIN)
+ : bkey_eq(pos, POS_MIN));
+
+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ pos = bkey_predecessor(iter, pos);
+ bch2_btree_iter_set_pos(iter, pos);
+ return ret;
+}
+
+static noinline
+struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter)
+{
+ struct btree_insert_entry *i;
+ struct bkey_i *ret = NULL;
+
+ trans_for_each_update(iter->trans, i) {
+ if (i->btree_id < iter->btree_id)
+ continue;
+ if (i->btree_id > iter->btree_id)
+ break;
+ if (bpos_lt(i->k->k.p, iter->path->pos))
+ continue;
+ if (i->key_cache_already_flushed)
+ continue;
+ if (!ret || bpos_lt(i->k->k.p, ret->k.p))
+ ret = i->k;
+ }
+
+ return ret;
+}
+
+static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter)
+{
+ return iter->flags & BTREE_ITER_WITH_UPDATES
+ ? __bch2_btree_trans_peek_updates(iter)
+ : NULL;
+}
+
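+/*
+ * Peek from the in-memory journal keys - keys from the journal that haven't
+ * been replayed into the btree yet:
+ */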
+static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos end_pos)
+{
+ struct bkey_i *k;
+
+ if (bpos_lt(iter->path->pos, iter->journal_pos))
+ iter->journal_idx = 0;
+
+ k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
+ iter->path->level,
+ iter->path->pos,
+ end_pos,
+ &iter->journal_idx);
+
+ iter->journal_pos = k ? k->k.p : end_pos;
+ return k;
+}
+
+static noinline
+struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos);
+
+ if (k) {
+ iter->k = k->k;
+ return bkey_i_to_s_c(k);
+ } else {
+ return bkey_s_c_null;
+ }
+}
+
+static noinline
+struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bkey_i *next_journal =
+ bch2_btree_journal_peek(trans, iter,
+ k.k ? k.k->p : path_l(iter->path)->b->key.k.p);
+
+ if (next_journal) {
+ iter->k = next_journal->k;
+ k = bkey_i_to_s_c(next_journal);
+ }
+
+ return k;
+}
+
+/*
+ * Checks the btree key cache for a key at @pos and returns it if present, or
+ * bkey_s_c_null:
+ */
+static noinline
+struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bch_fs *c = trans->c;
+ struct bkey u;
+ struct bkey_s_c k;
+ int ret;
+
+ if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) &&
+ bpos_eq(iter->pos, pos))
+ return bkey_s_c_null;
+
+ if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
+ return bkey_s_c_null;
+
+ if (!iter->key_cache_path)
+ iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
+ iter->flags & BTREE_ITER_INTENT, 0,
+ iter->flags|BTREE_ITER_CACHED|
+ BTREE_ITER_CACHED_NOFILL,
+ _THIS_IP_);
+
+ iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
+ iter->flags|BTREE_ITER_CACHED) ?:
+ bch2_btree_path_relock(trans, iter->path, _THIS_IP_);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
+
+ btree_path_set_should_be_locked(iter->key_cache_path);
+
+ k = bch2_btree_path_peek_slot(iter->key_cache_path, &u);
+ if (k.k && !bkey_err(k)) {
+ iter->k = u;
+ k.k = &iter->k;
+ }
+ return k;
+}
+
+static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bkey_i *next_update;
+ struct bkey_s_c k, k2;
+ int ret;
+
+ EBUG_ON(iter->path->cached);
+ bch2_btree_iter_verify(iter);
+
+ while (1) {
+ struct btree_path_level *l;
+
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (unlikely(ret)) {
+ /* ensure that iter->k is consistent with iter->pos: */
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ k = bkey_s_c_err(ret);
+ goto out;
+ }
+
+ l = path_l(iter->path);
+
+ if (unlikely(!l->b)) {
+ /* No btree nodes at requested level: */
+ bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ k = bkey_s_c_null;
+ goto out;
+ }
+
+ btree_path_set_should_be_locked(iter->path);
+
+ k = btree_path_level_peek_all(trans->c, l, &iter->k);
+
+ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+ k.k &&
+ (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+ k = k2;
+ ret = bkey_err(k);
+ if (ret) {
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ goto out;
+ }
+ }
+
+ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
+ k = btree_trans_peek_journal(trans, iter, k);
+
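+ /* a pending transaction update at an equal or earlier position overrides the btree key: */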
+ next_update = btree_trans_peek_updates(iter);
+
+ if (next_update &&
+ bpos_le(next_update->k.p,
+ k.k ? k.k->p : l->b->key.k.p)) {
+ iter->k = next_update->k;
+ k = bkey_i_to_s_c(next_update);
+ }
+
+ if (k.k && bkey_deleted(k.k)) {
+ /*
+ * If we've got a whiteout, and it's after the search
+ * key, advance the search key to the whiteout instead
+ * of just after the whiteout - it might be a btree
+ * whiteout, with a real key at the same position, since
+ * in the btree, deleted keys sort before non-deleted keys.
+ */
+ search_key = !bpos_eq(search_key, k.k->p)
+ ? k.k->p
+ : bpos_successor(k.k->p);
+ continue;
+ }
+
+ if (likely(k.k)) {
+ break;
+ } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
+ /* Advance to next leaf node: */
+ search_key = bpos_successor(l->b->key.k.p);
+ } else {
+ /* End of btree: */
+ bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ k = bkey_s_c_null;
+ goto out;
+ }
+ }
+out:
+ bch2_btree_iter_verify(iter);
+
+ return k;
+}
+
+/**
+ * bch2_btree_iter_peek_upto() - returns first key greater than or equal to
+ * iterator's current position
+ * @iter: iterator to peek from
+ * @end: search limit: returns keys less than or equal to @end
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
+ */
+struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bpos search_key = btree_iter_search_key(iter);
+ struct bkey_s_c k;
+ struct bpos iter_pos;
+ int ret;
+
+ EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
+ EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));
+
+ if (iter->update_path) {
+ bch2_path_put_nokeep(trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->update_path = NULL;
+ }
+
+ bch2_btree_iter_verify_entry_exit(iter);
+
+ while (1) {
+ k = __bch2_btree_iter_peek(iter, search_key);
+ if (unlikely(!k.k))
+ goto end;
+ if (unlikely(bkey_err(k)))
+ goto out_no_locked;
+
+ /*
+ * We need to check against @end before FILTER_SNAPSHOTS because
+ * if we get to a different inode than the one requested we might be
+ * seeing keys for a different snapshot tree that will all be
+ * filtered out.
+ *
+ * But we can't do the full check here, because bkey_start_pos()
+ * isn't monotonically increasing before FILTER_SNAPSHOTS, and
+ * that's what we check against in extents mode:
+ */
+ if (k.k->p.inode > end.inode)
+ goto end;
+
+ if (iter->update_path &&
+ !bkey_eq(iter->update_path->pos, k.k->p)) {
+ bch2_path_put_nokeep(trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->update_path = NULL;
+ }
+
+ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ (iter->flags & BTREE_ITER_INTENT) &&
+ !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ !iter->update_path) {
+ struct bpos pos = k.k->p;
+
+ if (pos.snapshot < iter->snapshot) {
+ search_key = bpos_successor(k.k->p);
+ continue;
+ }
+
+ pos.snapshot = iter->snapshot;
+
+ /*
+ * advance, same as on exit for iter->path, but only up
+ * to snapshot
+ */
+ __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
+ iter->update_path = iter->path;
+
+ iter->update_path = bch2_btree_path_set_pos(trans,
+ iter->update_path, pos,
+ iter->flags & BTREE_ITER_INTENT,
+ _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+ }
+
+ /*
+ * We can never have a key in a leaf node at POS_MAX, so
+ * we don't have to check these successor() calls:
+ */
+ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ !bch2_snapshot_is_ancestor(trans->c,
+ iter->snapshot,
+ k.k->p.snapshot)) {
+ search_key = bpos_successor(k.k->p);
+ continue;
+ }
+
+ if (bkey_whiteout(k.k) &&
+ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+ search_key = bkey_successor(iter, k.k->p);
+ continue;
+ }
+
+ /*
+ * iter->pos should be monotonically increasing, and always be
+ * equal to the key we just returned - except extents can
+ * straddle iter->pos:
+ */
+ if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
+ iter_pos = k.k->p;
+ else
+ iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
+
+ if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
+ ? bkey_gt(iter_pos, end)
+ : bkey_ge(iter_pos, end)))
+ goto end;
+
+ break;
+ }
+
+ iter->pos = iter_pos;
+
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ btree_path_set_should_be_locked(iter->path);
+out_no_locked:
+ if (iter->update_path) {
+ ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_);
+ if (unlikely(ret))
+ k = bkey_s_c_err(ret);
+ else
+ btree_path_set_should_be_locked(iter->update_path);
+ }
+
+ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+ iter->pos.snapshot = iter->snapshot;
+
+ ret = bch2_btree_iter_verify_ret(iter, k);
+ if (unlikely(ret)) {
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ k = bkey_s_c_err(ret);
+ }
+
+ bch2_btree_iter_verify_entry_exit(iter);
+
+ return k;
+end:
+ bch2_btree_iter_set_pos(iter, end);
+ k = bkey_s_c_null;
+ goto out_no_locked;
+}
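+
+/*
+ * Example (illustrative sketch, not part of this interface): walk every key
+ * in [start, end] with the low level calls above. Real callers normally use
+ * the for_each_btree_key_upto*() macros from btree_iter.h, which also handle
+ * transaction restarts; example_scan_upto() is a hypothetical name.
+ */
+static int __maybe_unused example_scan_upto(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bpos start, struct bpos end)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, btree, start, 0);
+
+ while (1) {
+ k = bch2_btree_iter_peek_upto(&iter, end);
+ ret = bkey_err(k);
+ if (ret || !k.k)
+ break;
+
+ /* inspect k here */
+
+ if (!bch2_btree_iter_advance(&iter))
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}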
+
+/**
+ * bch2_btree_iter_peek_all_levels() - returns the first key greater than or
+ * equal to iterator's current position, returning keys from every level of the
+ * btree. For keys at different levels of the btree that compare equal, the key
+ * from the lower level (leaf) is returned first.
+ * @iter: iterator to peek from
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
+ */
+struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bkey_s_c k;
+ int ret;
+
+ EBUG_ON(iter->path->cached);
+ bch2_btree_iter_verify(iter);
+ BUG_ON(iter->path->level < iter->min_depth);
+ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+ EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
+
+ while (1) {
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (unlikely(ret)) {
+ /* ensure that iter->k is consistent with iter->pos: */
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+
+ /* Already at end? */
+ if (!btree_path_node(iter->path, iter->path->level)) {
+ k = bkey_s_c_null;
+ goto out_no_locked;
+ }
+
+ k = btree_path_level_peek_all(trans->c,
+ &iter->path->l[iter->path->level], &iter->k);
+
+ /* Check if we should go up to the parent node: */
+ if (!k.k ||
+ (iter->advanced &&
+ bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) {
+ iter->pos = path_l(iter->path)->b->key.k.p;
+ btree_path_set_level_up(trans, iter->path);
+ iter->advanced = false;
+ continue;
+ }
+
+ /*
+ * Check if we should go back down to a leaf:
+ * If we're not in a leaf node, we only return the current key
+ * if it exactly matches iter->pos - otherwise we first have to
+ * go back to the leaf:
+ */
+ if (iter->path->level != iter->min_depth &&
+ (iter->advanced ||
+ !k.k ||
+ !bpos_eq(iter->pos, k.k->p))) {
+ btree_path_set_level_down(trans, iter->path, iter->min_depth);
+ iter->pos = bpos_successor(iter->pos);
+ iter->advanced = false;
+ continue;
+ }
+
+ /* Check if we should go to the next key: */
+ if (iter->path->level == iter->min_depth &&
+ iter->advanced &&
+ k.k &&
+ bpos_eq(iter->pos, k.k->p)) {
+ iter->pos = bpos_successor(iter->pos);
+ iter->advanced = false;
+ continue;
+ }
+
+ if (iter->advanced &&
+ iter->path->level == iter->min_depth &&
+ !bpos_eq(k.k->p, iter->pos))
+ iter->advanced = false;
+
+ BUG_ON(iter->advanced);
+ BUG_ON(!k.k);
+ break;
+ }
+
+ iter->pos = k.k->p;
+ btree_path_set_should_be_locked(iter->path);
+out_no_locked:
+ bch2_btree_iter_verify(iter);
+
+ return k;
+}
+
+/**
+ * bch2_btree_iter_next() - returns first key greater than iterator's current
+ * position
+ * @iter: iterator to peek from
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
+ */
+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
+{
+ if (!bch2_btree_iter_advance(iter))
+ return bkey_s_c_null;
+
+ return bch2_btree_iter_peek(iter);
+}
+
+/**
+ * bch2_btree_iter_peek_prev() - returns first key less than or equal to
+ * iterator's current position
+ * @iter: iterator to peek from
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
+ */
+struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bpos search_key = iter->pos;
+ struct btree_path *saved_path = NULL;
+ struct bkey_s_c k;
+ struct bkey saved_k;
+ const struct bch_val *saved_v;
+ int ret;
+
+ EBUG_ON(iter->path->cached || iter->path->level);
+ EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
+
+ if (iter->flags & BTREE_ITER_WITH_JOURNAL)
+ return bkey_s_c_err(-EIO);
+
+ bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify_entry_exit(iter);
+
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ search_key.snapshot = U32_MAX;
+
+ while (1) {
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (unlikely(ret)) {
+ /* ensure that iter->k is consistent with iter->pos: */
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+
+ k = btree_path_level_peek(trans, iter->path,
+ &iter->path->l[0], &iter->k);
+ if (!k.k ||
+ ((iter->flags & BTREE_ITER_IS_EXTENTS)
+ ? bpos_ge(bkey_start_pos(k.k), search_key)
+ : bpos_gt(k.k->p, search_key)))
+ k = btree_path_level_prev(trans, iter->path,
+ &iter->path->l[0], &iter->k);
+
+ if (likely(k.k)) {
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
+ if (k.k->p.snapshot == iter->snapshot)
+ goto got_key;
+
+ /*
+ * If we have a saved candidate, and we're no
+ * longer at the same _key_ (not pos), return
+ * that candidate
+ */
+ if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
+ bch2_path_put_nokeep(trans, iter->path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->path = saved_path;
+ saved_path = NULL;
+ iter->k = saved_k;
+ k.v = saved_v;
+ goto got_key;
+ }
+
+ if (bch2_snapshot_is_ancestor(iter->trans->c,
+ iter->snapshot,
+ k.k->p.snapshot)) {
+ if (saved_path)
+ bch2_path_put_nokeep(trans, saved_path,
+ iter->flags & BTREE_ITER_INTENT);
+ saved_path = btree_path_clone(trans, iter->path,
+ iter->flags & BTREE_ITER_INTENT);
+ saved_k = *k.k;
+ saved_v = k.v;
+ }
+
+ search_key = bpos_predecessor(k.k->p);
+ continue;
+ }
+got_key:
+ if (bkey_whiteout(k.k) &&
+ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+ search_key = bkey_predecessor(iter, k.k->p);
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ search_key.snapshot = U32_MAX;
+ continue;
+ }
+
+ break;
+ } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) {
+ /* Advance to previous leaf node: */
+ search_key = bpos_predecessor(iter->path->l[0].b->data->min_key);
+ } else {
+ /* Start of btree: */
+ bch2_btree_iter_set_pos(iter, POS_MIN);
+ k = bkey_s_c_null;
+ goto out_no_locked;
+ }
+ }
+
+ EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos));
+
+ /* Extents can straddle iter->pos: */
+ if (bkey_lt(k.k->p, iter->pos))
+ iter->pos = k.k->p;
+
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ iter->pos.snapshot = iter->snapshot;
+
+ btree_path_set_should_be_locked(iter->path);
+out_no_locked:
+ if (saved_path)
+ bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
+
+ bch2_btree_iter_verify_entry_exit(iter);
+ bch2_btree_iter_verify(iter);
+
+ return k;
+}
+
+/**
+ * bch2_btree_iter_prev() - returns first key less than iterator's current
+ * position
+ * @iter: iterator to peek from
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
+ */
+struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
+{
+ if (!bch2_btree_iter_rewind(iter))
+ return bkey_s_c_null;
+
+ return bch2_btree_iter_peek_prev(iter);
+}
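+
+/*
+ * Example (illustrative sketch, not part of this interface): walk keys in
+ * reverse from @start towards the start of the btree with the calls above;
+ * restart handling is omitted and example_scan_reverse() is a hypothetical
+ * name - see for_each_btree_key_reverse() in btree_iter.h for the real
+ * pattern.
+ */
+static int __maybe_unused example_scan_reverse(struct btree_trans *trans,
+ enum btree_id btree, struct bpos start)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, btree, start, 0);
+
+ while (1) {
+ k = bch2_btree_iter_peek_prev(&iter);
+ ret = bkey_err(k);
+ if (ret || !k.k)
+ break;
+
+ /* inspect k here */
+
+ if (!bch2_btree_iter_rewind(&iter))
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}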
+
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bpos search_key;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify_entry_exit(iter);
+ EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
+ EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
+
+ /* extents can't span inode numbers: */
+ if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
+ if (iter->pos.inode == KEY_INODE_MAX)
+ return bkey_s_c_null;
+
+ bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
+ }
+
+ search_key = btree_iter_search_key(iter);
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+
+ if ((iter->flags & BTREE_ITER_CACHED) ||
+ !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
+ struct bkey_i *next_update;
+
+ if ((next_update = btree_trans_peek_updates(iter)) &&
+ bpos_eq(next_update->k.p, iter->pos)) {
+ iter->k = next_update->k;
+ k = bkey_i_to_s_c(next_update);
+ goto out;
+ }
+
+ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
+ (k = btree_trans_peek_slot_journal(trans, iter)).k)
+ goto out;
+
+ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+ (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
+ if (!bkey_err(k))
+ iter->k = *k.k;
+ /* We're not returning a key from iter->path: */
+ goto out_no_locked;
+ }
+
+ k = bch2_btree_path_peek_slot(iter->path, &iter->k);
+ if (unlikely(!k.k))
+ goto out_no_locked;
+ } else {
+ struct bpos next;
+ struct bpos end = iter->pos;
+
+ if (iter->flags & BTREE_ITER_IS_EXTENTS)
+ end.offset = U64_MAX;
+
+ EBUG_ON(iter->path->level);
+
+ if (iter->flags & BTREE_ITER_INTENT) {
+ struct btree_iter iter2;
+
+ bch2_trans_copy_iter(&iter2, iter);
+ k = bch2_btree_iter_peek_upto(&iter2, end);
+
+ if (k.k && !bkey_err(k)) {
+ iter->k = iter2.k;
+ k.k = &iter->k;
+ }
+ bch2_trans_iter_exit(trans, &iter2);
+ } else {
+ struct bpos pos = iter->pos;
+
+ k = bch2_btree_iter_peek_upto(iter, end);
+ if (unlikely(bkey_err(k)))
+ bch2_btree_iter_set_pos(iter, pos);
+ else
+ iter->pos = pos;
+ }
+
+ if (unlikely(bkey_err(k)))
+ goto out_no_locked;
+
+ next = k.k ? bkey_start_pos(k.k) : POS_MAX;
+
+ if (bkey_lt(iter->pos, next)) {
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos;
+
+ if (iter->flags & BTREE_ITER_IS_EXTENTS) {
+ bch2_key_resize(&iter->k,
+ min_t(u64, KEY_SIZE_MAX,
+ (next.inode == iter->pos.inode
+ ? next.offset
+ : KEY_OFFSET_MAX) -
+ iter->pos.offset));
+ EBUG_ON(!iter->k.size);
+ }
+
+ k = (struct bkey_s_c) { &iter->k, NULL };
+ }
+ }
+out:
+ btree_path_set_should_be_locked(iter->path);
+out_no_locked:
+ bch2_btree_iter_verify_entry_exit(iter);
+ bch2_btree_iter_verify(iter);
+ ret = bch2_btree_iter_verify_ret(iter, k);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
+
+ return k;
+}
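+
+/*
+ * Example (illustrative sketch, not part of this interface): point lookup of
+ * the slot at @pos; check bkey_err() and k.k before using the key.
+ * example_lookup_slot() is a hypothetical name; bch2_bkey_get_iter() in
+ * btree_iter.h wraps this pattern.
+ */
+static int __maybe_unused example_lookup_slot(struct btree_trans *trans,
+ enum btree_id btree, struct bpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, btree, pos, 0);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (!ret && k.k) {
+ /* use k here, while the iterator holds its locks */
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}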
+
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
+{
+ if (!bch2_btree_iter_advance(iter))
+ return bkey_s_c_null;
+
+ return bch2_btree_iter_peek_slot(iter);
+}
+
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
+{
+ if (!bch2_btree_iter_rewind(iter))
+ return bkey_s_c_null;
+
+ return bch2_btree_iter_peek_slot(iter);
+}
+
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
+{
+ struct bkey_s_c k;
+
+ while (btree_trans_too_many_iters(iter->trans) ||
+ (k = bch2_btree_iter_peek_type(iter, iter->flags),
+ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
+ bch2_trans_begin(iter->trans);
+
+ return k;
+}
+
+/* new transactional stuff: */
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
+{
+ struct btree_path *path;
+ unsigned i;
+
+ BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated));
+
+ trans_for_each_path(trans, path) {
+ BUG_ON(path->sorted_idx >= trans->nr_sorted);
+ BUG_ON(trans->sorted[path->sorted_idx] != path->idx);
+ }
+
+ for (i = 0; i < trans->nr_sorted; i++) {
+ unsigned idx = trans->sorted[i];
+
+ EBUG_ON(!(trans->paths_allocated & (1ULL << idx)));
+ BUG_ON(trans->paths[idx].sorted_idx != i);
+ }
+}
+
+static void btree_trans_verify_sorted(struct btree_trans *trans)
+{
+ struct btree_path *path, *prev = NULL;
+ unsigned i;
+
+ if (!bch2_debug_check_iterators)
+ return;
+
+ trans_for_each_path_inorder(trans, path, i) {
+ if (prev && btree_path_cmp(prev, path) > 0) {
+ __bch2_dump_trans_paths_updates(trans, true);
+ panic("trans paths out of order!\n");
+ }
+ prev = path;
+ }
+}
+#else
+static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {}
+static inline void btree_trans_verify_sorted(struct btree_trans *trans) {}
+#endif
+
+void __bch2_btree_trans_sort_paths(struct btree_trans *trans)
+{
+ int i, l = 0, r = trans->nr_sorted, inc = 1;
+ bool swapped;
+
+ btree_trans_verify_sorted_refs(trans);
+
+ if (trans->paths_sorted)
+ goto out;
+
+ /*
+ * Cocktail shaker sort: this is efficient because iterators will be
+ * mostly sorted.
+ */
+ do {
+ swapped = false;
+
+ for (i = inc > 0 ? l : r - 2;
+ i + 1 < r && i >= l;
+ i += inc) {
+ if (btree_path_cmp(trans->paths + trans->sorted[i],
+ trans->paths + trans->sorted[i + 1]) > 0) {
+ swap(trans->sorted[i], trans->sorted[i + 1]);
+ trans->paths[trans->sorted[i]].sorted_idx = i;
+ trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1;
+ swapped = true;
+ }
+ }
+
+ if (inc > 0)
+ --r;
+ else
+ l++;
+ inc = -inc;
+ } while (swapped);
+
+ trans->paths_sorted = true;
+out:
+ btree_trans_verify_sorted(trans);
+}
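+
+/*
+ * Standalone illustration of the cocktail shaker sort used above (sketch
+ * only, on a plain int array): alternate forward and backward bubble passes,
+ * shrinking the unsorted window from both ends, and stop once a full pass
+ * makes no swaps.
+ */
+static void __maybe_unused example_cocktail_sort(int *a, int nr)
+{
+ int i, l = 0, r = nr, inc = 1;
+ bool swapped;
+
+ do {
+ swapped = false;
+
+ for (i = inc > 0 ? l : r - 2;
+ i + 1 < r && i >= l;
+ i += inc)
+ if (a[i] > a[i + 1]) {
+ swap(a[i], a[i + 1]);
+ swapped = true;
+ }
+
+ if (inc > 0)
+ --r;
+ else
+ l++;
+ inc = -inc;
+ } while (swapped);
+}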
+
+static inline void btree_path_list_remove(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ unsigned i;
+
+ EBUG_ON(path->sorted_idx >= trans->nr_sorted);
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ trans->nr_sorted--;
+ memmove_u64s_down_small(trans->sorted + path->sorted_idx,
+ trans->sorted + path->sorted_idx + 1,
+ DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
+#else
+ array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
+#endif
+ for (i = path->sorted_idx; i < trans->nr_sorted; i++)
+ trans->paths[trans->sorted[i]].sorted_idx = i;
+
+ path->sorted_idx = U8_MAX;
+}
+
+static inline void btree_path_list_add(struct btree_trans *trans,
+ struct btree_path *pos,
+ struct btree_path *path)
+{
+ unsigned i;
+
+ path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted;
+
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
+ trans->sorted + path->sorted_idx,
+ DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
+ trans->nr_sorted++;
+ trans->sorted[path->sorted_idx] = path->idx;
+#else
+ array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
+#endif
+
+ for (i = path->sorted_idx; i < trans->nr_sorted; i++)
+ trans->paths[trans->sorted[i]].sorted_idx = i;
+
+ btree_trans_verify_sorted_refs(trans);
+}
+
+void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
+{
+ if (iter->update_path)
+ bch2_path_put_nokeep(trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
+ if (iter->path)
+ bch2_path_put(trans, iter->path,
+ iter->flags & BTREE_ITER_INTENT);
+ if (iter->key_cache_path)
+ bch2_path_put(trans, iter->key_cache_path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->path = NULL;
+ iter->update_path = NULL;
+ iter->key_cache_path = NULL;
+}
+
+void bch2_trans_iter_init_outlined(struct btree_trans *trans,
+ struct btree_iter *iter,
+ enum btree_id btree_id, struct bpos pos,
+ unsigned flags)
+{
+ bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
+ bch2_btree_iter_flags(trans, btree_id, flags),
+ _RET_IP_);
+}
+
+void bch2_trans_node_iter_init(struct btree_trans *trans,
+ struct btree_iter *iter,
+ enum btree_id btree_id,
+ struct bpos pos,
+ unsigned locks_want,
+ unsigned depth,
+ unsigned flags)
+{
+ flags |= BTREE_ITER_NOT_EXTENTS;
+ flags |= __BTREE_ITER_ALL_SNAPSHOTS;
+ flags |= BTREE_ITER_ALL_SNAPSHOTS;
+
+ bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
+ __bch2_btree_iter_flags(trans, btree_id, flags),
+ _RET_IP_);
+
+ iter->min_depth = depth;
+
+ BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH));
+ BUG_ON(iter->path->level != depth);
+ BUG_ON(iter->min_depth != depth);
+}
+
+void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
+{
+ *dst = *src;
+ if (src->path)
+ __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
+ if (src->update_path)
+ __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
+ dst->key_cache_path = NULL;
+}
+
+void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+{
+ unsigned new_top = trans->mem_top + size;
+ size_t old_bytes = trans->mem_bytes;
+ size_t new_bytes = roundup_pow_of_two(new_top);
+ int ret;
+ void *new_mem;
+ void *p;
+
+ trans->mem_max = max(trans->mem_max, new_top);
+
+ WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
+
+ new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
+ if (unlikely(!new_mem)) {
+ bch2_trans_unlock(trans);
+
+ new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
+ if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
+ new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
+ new_bytes = BTREE_TRANS_MEM_MAX;
+ kfree(trans->mem);
+ }
+
+ if (!new_mem)
+ return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+
+ trans->mem = new_mem;
+ trans->mem_bytes = new_bytes;
+
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ trans->mem = new_mem;
+ trans->mem_bytes = new_bytes;
+
+ if (old_bytes) {
+ trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
+ }
+
+ p = trans->mem + trans->mem_top;
+ trans->mem_top += size;
+ memset(p, 0, size);
+ return p;
+}
+
+static inline void check_srcu_held_too_long(struct btree_trans *trans)
+{
+ WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
+ "btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
+ (jiffies - trans->srcu_lock_time) / HZ);
+}
+
+void bch2_trans_srcu_unlock(struct btree_trans *trans)
+{
+ if (trans->srcu_held) {
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ if (path->cached && !btree_node_locked(path, 0))
+ path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
+
+ check_srcu_held_too_long(trans);
+ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+ trans->srcu_held = false;
+ }
+}
+
+void bch2_trans_srcu_lock(struct btree_trans *trans)
+{
+ if (!trans->srcu_held) {
+ trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
+ trans->srcu_lock_time = jiffies;
+ trans->srcu_held = true;
+ }
+}
+
+/**
+ * bch2_trans_begin() - reset a transaction after an interrupted attempt
+ * @trans: transaction to reset
+ *
+ * Returns: current restart counter, to be used with trans_was_restarted()
+ *
+ * While iterating over nodes or updating nodes an attempt to lock a btree node
+ * may return BCH_ERR_transaction_restart when the trylock fails. When this
+ * occurs, bch2_trans_begin() should be called and the transaction retried.
+ */
+u32 bch2_trans_begin(struct btree_trans *trans)
+{
+ struct btree_path *path;
+ u64 now;
+
+ bch2_trans_reset_updates(trans);
+
+ trans->restart_count++;
+ trans->mem_top = 0;
+
+ trans_for_each_path(trans, path) {
+ path->should_be_locked = false;
+
+ /*
+ * If the transaction wasn't restarted, we presume we're doing
+ * something new: don't keep iterators except the ones that are in
+ * use - except for the subvolumes btree:
+ */
+ if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
+ path->preserve = false;
+
+ /*
+ * XXX: we probably shouldn't be doing this if the transaction
+ * was restarted, but currently we still overflow transaction
+ * iterators if we do that
+ */
+ if (!path->ref && !path->preserve)
+ __bch2_path_free(trans, path);
+ else
+ path->preserve = false;
+ }
+
+ now = local_clock();
+ if (!trans->restarted &&
+ (need_resched() ||
+ now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
+ drop_locks_do(trans, (cond_resched(), 0));
+ now = local_clock();
+ }
+ trans->last_begin_time = now;
+
+ if (unlikely(trans->srcu_held &&
+ time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
+ bch2_trans_srcu_unlock(trans);
+
+ trans->last_begin_ip = _RET_IP_;
+ if (trans->restarted) {
+ bch2_btree_path_traverse_all(trans);
+ trans->notrace_relock_fail = false;
+ }
+
+ return trans->restart_count;
+}
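+
+/*
+ * Example (illustrative sketch): the retry pattern described above, open
+ * coded. example_work() is a hypothetical transactional operation; the
+ * lockrestart_do() macro in btree_iter.h wraps exactly this loop.
+ */
+int example_work(struct btree_trans *); /* hypothetical */
+
+static int __maybe_unused example_retry(struct btree_trans *trans)
+{
+ int ret;
+
+ do {
+ bch2_trans_begin(trans);
+ ret = example_work(trans); /* hypothetical transactional work */
+ } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
+
+ return ret;
+}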
+
+static struct btree_trans *bch2_trans_alloc(struct bch_fs *c)
+{
+ struct btree_trans *trans;
+
+ if (IS_ENABLED(__KERNEL__)) {
+ trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
+ if (trans)
+ return trans;
+ }
+
+ trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
+ /*
+ * paths need to be zeroed because bch2_check_for_deadlock looks
+ * at paths in other threads
+ */
+ memset(&trans->paths, 0, sizeof(trans->paths));
+ return trans;
+}
+
+const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
+
+unsigned bch2_trans_get_fn_idx(const char *fn)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
+ if (!bch2_btree_transaction_fns[i] ||
+ bch2_btree_transaction_fns[i] == fn) {
+ bch2_btree_transaction_fns[i] = fn;
+ return i;
+ }
+
+ pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
+ return i;
+}
+
+struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
+ __acquires(&c->btree_trans_barrier)
+{
+ struct btree_trans *trans;
+ struct btree_transaction_stats *s;
+
+ trans = bch2_trans_alloc(c);
+
+ memset(trans, 0, sizeof(*trans));
+ trans->c = c;
+ trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
+ ? bch2_btree_transaction_fns[fn_idx] : NULL;
+ trans->last_begin_time = local_clock();
+ trans->fn_idx = fn_idx;
+ trans->locking_wait.task = current;
+ trans->journal_replay_not_finished =
+ unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
+ atomic_inc_not_zero(&c->journal_keys.ref);
+ closure_init_stack(&trans->ref);
+
+ s = btree_trans_stats(trans);
+ if (s && s->max_mem) {
+ unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
+
+ trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
+
+ if (unlikely(!trans->mem)) {
+ trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
+ trans->mem_bytes = BTREE_TRANS_MEM_MAX;
+ } else {
+ trans->mem_bytes = expected_mem_bytes;
+ }
+ }
+
+ if (s) {
+ trans->nr_max_paths = s->nr_max_paths;
+ trans->wb_updates_size = s->wb_updates_size;
+ }
+
+ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ trans->srcu_lock_time = jiffies;
+ trans->srcu_held = true;
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
+ struct btree_trans *pos;
+
+ seqmutex_lock(&c->btree_trans_lock);
+ list_for_each_entry(pos, &c->btree_trans_list, list) {
+ /*
+ * We'd much prefer to be stricter here and completely
+ * disallow multiple btree_trans in the same thread -
+ * but the data move path calls bch2_write when we
+ * already have a btree_trans initialized.
+ */
+ BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid &&
+ bch2_trans_locked(pos));
+
+ if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
+ list_add_tail(&trans->list, &pos->list);
+ goto list_add_done;
+ }
+ }
+ list_add_tail(&trans->list, &c->btree_trans_list);
+list_add_done:
+ seqmutex_unlock(&c->btree_trans_lock);
+ }
+
+ return trans;
+}
+
+static void check_btree_paths_leaked(struct btree_trans *trans)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ if (path->ref)
+ goto leaked;
+ return;
+leaked:
+ bch_err(c, "btree paths leaked from %s!", trans->fn);
+ trans_for_each_path(trans, path)
+ if (path->ref)
+ printk(KERN_ERR " btree %s %pS\n",
+ bch2_btree_id_str(path->btree_id),
+ (void *) path->ip_allocated);
+ /* Be noisy about this: */
+ bch2_fatal_error(c);
+#endif
+}
+
+void bch2_trans_put(struct btree_trans *trans)
+ __releases(&c->btree_trans_barrier)
+{
+ struct btree_insert_entry *i;
+ struct bch_fs *c = trans->c;
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+
+ bch2_trans_unlock(trans);
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
+ seqmutex_lock(&c->btree_trans_lock);
+ list_del(&trans->list);
+ seqmutex_unlock(&c->btree_trans_lock);
+ }
+
+ closure_sync(&trans->ref);
+
+ if (s)
+ s->max_mem = max(s->max_mem, trans->mem_max);
+
+ trans_for_each_update(trans, i)
+ __btree_path_put(i->path, true);
+ trans->nr_updates = 0;
+
+ check_btree_paths_leaked(trans);
+
+ if (trans->srcu_held) {
+ check_srcu_held_too_long(trans);
+ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+ }
+
+ kfree(trans->extra_journal_entries.data);
+
+ if (trans->fs_usage_deltas) {
+ if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
+ REPLICAS_DELTA_LIST_MAX)
+ mempool_free(trans->fs_usage_deltas,
+ &c->replicas_delta_pool);
+ else
+ kfree(trans->fs_usage_deltas);
+ }
+
+ if (unlikely(trans->journal_replay_not_finished))
+ bch2_journal_keys_put(c);
+
+ if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
+ mempool_free(trans->mem, &c->btree_trans_mem_pool);
+ else
+ kfree(trans->mem);
+
+ /* Userspace doesn't have a real percpu implementation: */
+ if (IS_ENABLED(__KERNEL__))
+ trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
+ if (trans)
+ mempool_free(trans, &c->btree_trans_pool);
+}
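+
+/*
+ * Example (illustrative sketch): pairing bch2_trans_get() with
+ * bch2_trans_put(), with the restartable work in between run under
+ * lockrestart_do(). example_work() is the same hypothetical helper as above.
+ */
+static int __maybe_unused example_trans_lifecycle(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ int ret = lockrestart_do(trans, example_work(trans));
+
+ bch2_trans_put(trans);
+ return ret;
+}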
+
+static void __maybe_unused
+bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
+ struct btree_bkey_cached_common *b)
+{
+ struct six_lock_count c = six_lock_counts(&b->lock);
+ struct task_struct *owner;
+ pid_t pid;
+
+ rcu_read_lock();
+ owner = READ_ONCE(b->lock.owner);
+ pid = owner ? owner->pid : 0;
+ rcu_read_unlock();
+
+ prt_tab(out);
+ prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
+ b->level, bch2_btree_id_str(b->btree_id));
+ bch2_bpos_to_text(out, btree_node_pos(b));
+
+ prt_tab(out);
+ prt_printf(out, " locks %u:%u:%u held by pid %u",
+ c.n[0], c.n[1], c.n[2], pid);
+}
+
+void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
+{
+ struct btree_path *path;
+ struct btree_bkey_cached_common *b;
+ static char lock_types[] = { 'r', 'i', 'w' };
+ unsigned l, idx;
+
+ if (!out->nr_tabstops) {
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 32);
+ }
+
+ prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
+
+ trans_for_each_path_safe(trans, path, idx) {
+ if (!path->nodes_locked)
+ continue;
+
+ prt_printf(out, " path %u %c l=%u %s:",
+ path->idx,
+ path->cached ? 'c' : 'b',
+ path->level,
+ bch2_btree_id_str(path->btree_id));
+ bch2_bpos_to_text(out, path->pos);
+ prt_newline(out);
+
+ for (l = 0; l < BTREE_MAX_DEPTH; l++) {
+ if (btree_node_locked(path, l) &&
+ !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
+ prt_printf(out, " %c l=%u ",
+ lock_types[btree_node_locked_type(path, l)], l);
+ bch2_btree_bkey_cached_common_to_text(out, b);
+ prt_newline(out);
+ }
+ }
+ }
+
+ b = READ_ONCE(trans->locking);
+ if (b) {
+ prt_printf(out, " blocked for %lluus on",
+ div_u64(local_clock() - trans->locking_wait.start_time,
+ 1000));
+ prt_newline(out);
+ prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]);
+ bch2_btree_bkey_cached_common_to_text(out, b);
+ prt_newline(out);
+ }
+}
+
+void bch2_fs_btree_iter_exit(struct bch_fs *c)
+{
+ struct btree_transaction_stats *s;
+ struct btree_trans *trans;
+ int cpu;
+
+ trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
+ if (trans)
+ panic("%s leaked btree_trans\n", trans->fn);
+
+ if (c->btree_trans_bufs)
+ for_each_possible_cpu(cpu)
+ kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans);
+ free_percpu(c->btree_trans_bufs);
+
+ for (s = c->btree_transaction_stats;
+ s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
+ s++) {
+ kfree(s->max_paths_text);
+ bch2_time_stats_exit(&s->lock_hold_times);
+ }
+
+ if (c->btree_trans_barrier_initialized)
+ cleanup_srcu_struct(&c->btree_trans_barrier);
+ mempool_exit(&c->btree_trans_mem_pool);
+ mempool_exit(&c->btree_trans_pool);
+}
+
+void bch2_fs_btree_iter_init_early(struct bch_fs *c)
+{
+ struct btree_transaction_stats *s;
+
+ for (s = c->btree_transaction_stats;
+ s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
+ s++) {
+ bch2_time_stats_init(&s->lock_hold_times);
+ mutex_init(&s->lock);
+ }
+
+ INIT_LIST_HEAD(&c->btree_trans_list);
+ seqmutex_init(&c->btree_trans_lock);
+}
+
+int bch2_fs_btree_iter_init(struct bch_fs *c)
+{
+ int ret;
+
+ c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf);
+ if (!c->btree_trans_bufs)
+ return -ENOMEM;
+
+ ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1,
+ sizeof(struct btree_trans)) ?:
+ mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
+ BTREE_TRANS_MEM_MAX) ?:
+ init_srcu_struct(&c->btree_trans_barrier);
+ if (!ret)
+ c->btree_trans_barrier_initialized = true;
+ return ret;
+}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
new file mode 100644
index 000000000000..eaffced4c132
--- /dev/null
+++ b/fs/bcachefs/btree_iter.h
@@ -0,0 +1,944 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_ITER_H
+#define _BCACHEFS_BTREE_ITER_H
+
+#include "bset.h"
+#include "btree_types.h"
+#include "trace.h"
+
+static inline int __bkey_err(const struct bkey *k)
+{
+ return PTR_ERR_OR_ZERO(k);
+}
+
+#define bkey_err(_k) __bkey_err((_k).k)
+
+static inline void __btree_path_get(struct btree_path *path, bool intent)
+{
+ path->ref++;
+ path->intent_ref += intent;
+}
+
+static inline bool __btree_path_put(struct btree_path *path, bool intent)
+{
+ EBUG_ON(!path->ref);
+ EBUG_ON(!path->intent_ref && intent);
+ path->intent_ref -= intent;
+ return --path->ref == 0;
+}
+
+static inline void btree_path_set_dirty(struct btree_path *path,
+ enum btree_path_uptodate u)
+{
+ path->uptodate = max_t(unsigned, path->uptodate, u);
+}
+
+static inline struct btree *btree_path_node(struct btree_path *path,
+ unsigned level)
+{
+ return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL;
+}
+
+static inline bool btree_node_lock_seq_matches(const struct btree_path *path,
+ const struct btree *b, unsigned level)
+{
+ return path->l[level].lock_seq == six_lock_seq(&b->c.lock);
+}
+
+static inline struct btree *btree_node_parent(struct btree_path *path,
+ struct btree *b)
+{
+ return btree_path_node(path, b->c.level + 1);
+}
+
+/* Iterate over paths within a transaction: */
+
+void __bch2_btree_trans_sort_paths(struct btree_trans *);
+
+static inline void btree_trans_sort_paths(struct btree_trans *trans)
+{
+ if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ trans->paths_sorted)
+ return;
+ __bch2_btree_trans_sort_paths(trans);
+}
+
+static inline struct btree_path *
+__trans_next_path(struct btree_trans *trans, unsigned idx)
+{
+ u64 l;
+
+ if (idx == BTREE_ITER_MAX)
+ return NULL;
+
+ l = trans->paths_allocated >> idx;
+ if (!l)
+ return NULL;
+
+ idx += __ffs64(l);
+ EBUG_ON(idx >= BTREE_ITER_MAX);
+ EBUG_ON(trans->paths[idx].idx != idx);
+ return &trans->paths[idx];
+}
+
+#define trans_for_each_path_from(_trans, _path, _start) \
+ for (_path = __trans_next_path((_trans), _start); \
+ (_path); \
+ _path = __trans_next_path((_trans), (_path)->idx + 1))
+
+#define trans_for_each_path(_trans, _path) \
+ trans_for_each_path_from(_trans, _path, 0)
+
+static inline struct btree_path *
+__trans_next_path_safe(struct btree_trans *trans, unsigned *idx)
+{
+ u64 l;
+
+ if (*idx == BTREE_ITER_MAX)
+ return NULL;
+
+ l = trans->paths_allocated >> *idx;
+ if (!l)
+ return NULL;
+
+ *idx += __ffs64(l);
+ EBUG_ON(*idx >= BTREE_ITER_MAX);
+ return &trans->paths[*idx];
+}
+
+/*
+ * This version is intended to be safe for use on a btree_trans that is owned by
+ * another thread, for bch2_btree_trans_to_text().
+ */
+#define trans_for_each_path_safe_from(_trans, _path, _idx, _start) \
+ for (_idx = _start; \
+ (_path = __trans_next_path_safe((_trans), &_idx)); \
+ _idx++)
+
+#define trans_for_each_path_safe(_trans, _path, _idx) \
+ trans_for_each_path_safe_from(_trans, _path, _idx, 0)
+
+static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
+{
+ unsigned idx = path ? path->sorted_idx + 1 : 0;
+
+ EBUG_ON(idx > trans->nr_sorted);
+
+ return idx < trans->nr_sorted
+ ? trans->paths + trans->sorted[idx]
+ : NULL;
+}
+
+static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path)
+{
+ unsigned idx = path ? path->sorted_idx : trans->nr_sorted;
+
+ return idx
+ ? trans->paths + trans->sorted[idx - 1]
+ : NULL;
+}
+
+#define trans_for_each_path_inorder(_trans, _path, _i) \
+ for (_i = 0; \
+ ((_path) = (_trans)->paths + (_trans)->sorted[_i]), (_i) < (_trans)->nr_sorted;\
+ _i++)
+
+#define trans_for_each_path_inorder_reverse(_trans, _path, _i) \
+ for (_i = (_trans)->nr_sorted - 1; \
+ ((_path) = (_trans)->paths + (_trans)->sorted[_i]), (_i) >= 0;\
+ --_i)
+
+static inline bool __path_has_node(const struct btree_path *path,
+ const struct btree *b)
+{
+ return path->l[b->c.level].b == b &&
+ btree_node_lock_seq_matches(path, b, b->c.level);
+}
+
+static inline struct btree_path *
+__trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
+ unsigned idx)
+{
+ struct btree_path *path = __trans_next_path(trans, idx);
+
+ while (path && !__path_has_node(path, b))
+ path = __trans_next_path(trans, path->idx + 1);
+
+ return path;
+}
+
+#define trans_for_each_path_with_node(_trans, _b, _path) \
+ for (_path = __trans_next_path_with_node((_trans), (_b), 0); \
+ (_path); \
+ _path = __trans_next_path_with_node((_trans), (_b), \
+ (_path)->idx + 1))
+
+struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *,
+ bool, unsigned long);
+
+static inline struct btree_path * __must_check
+bch2_btree_path_make_mut(struct btree_trans *trans,
+ struct btree_path *path, bool intent,
+ unsigned long ip)
+{
+ if (path->ref > 1 || path->preserve)
+ path = __bch2_btree_path_make_mut(trans, path, intent, ip);
+ path->should_be_locked = false;
+ return path;
+}
+
+struct btree_path * __must_check
+__bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *,
+ struct bpos, bool, unsigned long, int);
+
+static inline struct btree_path * __must_check
+bch2_btree_path_set_pos(struct btree_trans *trans,
+ struct btree_path *path, struct bpos new_pos,
+ bool intent, unsigned long ip)
+{
+ int cmp = bpos_cmp(new_pos, path->pos);
+
+ return cmp
+ ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip, cmp)
+ : path;
+}
+
+int __must_check bch2_btree_path_traverse_one(struct btree_trans *, struct btree_path *,
+ unsigned, unsigned long);
+
+static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
+ struct btree_path *path, unsigned flags)
+{
+ if (path->uptodate < BTREE_ITER_NEED_RELOCK)
+ return 0;
+
+ return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_);
+}
+
+int __must_check bch2_btree_path_traverse(struct btree_trans *,
+ struct btree_path *, unsigned);
+struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
+ unsigned, unsigned, unsigned, unsigned long);
+struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
+
+/*
+ * bch2_btree_path_peek_slot() for a cached iterator might return a key in a
+ * different snapshot:
+ */
+static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
+{
+ struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
+
+ if (k.k && bpos_eq(path->pos, k.k->p))
+ return k;
+
+ bkey_init(u);
+ u->p = path->pos;
+ return (struct bkey_s_c) { u, NULL };
+}
+
+struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
+ struct btree_iter *, struct bpos);
+
+void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *);
+
+int __bch2_trans_mutex_lock(struct btree_trans *, struct mutex *);
+
+static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock)
+{
+ return mutex_trylock(lock)
+ ? 0
+ : __bch2_trans_mutex_lock(trans, lock);
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_trans_verify_paths(struct btree_trans *);
+void bch2_assert_pos_locked(struct btree_trans *, enum btree_id,
+ struct bpos, bool);
+#else
+static inline void bch2_trans_verify_paths(struct btree_trans *trans) {}
+static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
+ struct bpos pos, bool key_cache) {}
+#endif
+
+void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
+ struct btree *, struct bkey_packed *);
+void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *,
+ struct btree *, struct btree_node_iter *,
+ struct bkey_packed *, unsigned, unsigned);
+
+int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
+
+void bch2_path_put(struct btree_trans *, struct btree_path *, bool);
+
+int bch2_trans_relock(struct btree_trans *);
+int bch2_trans_relock_notrace(struct btree_trans *);
+void bch2_trans_unlock(struct btree_trans *);
+void bch2_trans_unlock_long(struct btree_trans *);
+bool bch2_trans_locked(struct btree_trans *);
+
+static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count)
+{
+ return restart_count != trans->restart_count
+ ? -BCH_ERR_transaction_restart_nested
+ : 0;
+}
+
+void __noreturn bch2_trans_restart_error(struct btree_trans *, u32);
+
+static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
+ u32 restart_count)
+{
+ if (trans_was_restarted(trans, restart_count))
+ bch2_trans_restart_error(trans, restart_count);
+}
+
+void __noreturn bch2_trans_in_restart_error(struct btree_trans *);
+
+static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans)
+{
+ if (trans->restarted)
+ bch2_trans_in_restart_error(trans);
+}
+
+__always_inline
+static int btree_trans_restart_nounlock(struct btree_trans *trans, int err)
+{
+ BUG_ON(err <= 0);
+ BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart));
+
+ trans->restarted = err;
+ trans->last_restarted_ip = _THIS_IP_;
+ return -err;
+}
+
+__always_inline
+static int btree_trans_restart(struct btree_trans *trans, int err)
+{
+ btree_trans_restart_nounlock(trans, err);
+ return -err;
+}
+
+bool bch2_btree_node_upgrade(struct btree_trans *,
+ struct btree_path *, unsigned);
+
+void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned);
+
+static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ unsigned new_locks_want = path->level + !!path->intent_ref;
+
+ if (path->locks_want > new_locks_want)
+ __bch2_btree_path_downgrade(trans, path, new_locks_want);
+}
+
+void bch2_trans_downgrade(struct btree_trans *);
+
+void bch2_trans_node_add(struct btree_trans *trans, struct btree *);
+void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
+
+int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
+int __must_check bch2_btree_iter_traverse(struct btree_iter *);
+
+struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *);
+struct btree *bch2_btree_iter_next_node(struct btree_iter *);
+
+struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos);
+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
+
+struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *);
+
+static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+{
+ return bch2_btree_iter_peek_upto(iter, SPOS_MAX);
+}
+
+struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
+
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *);
+
+bool bch2_btree_iter_advance(struct btree_iter *);
+bool bch2_btree_iter_rewind(struct btree_iter *);
+
+static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+ iter->k.type = KEY_TYPE_deleted;
+ iter->k.p.inode = iter->pos.inode = new_pos.inode;
+ iter->k.p.offset = iter->pos.offset = new_pos.offset;
+ iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot;
+ iter->k.size = 0;
+}
+
+static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+ if (unlikely(iter->update_path))
+ bch2_path_put(iter->trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->update_path = NULL;
+
+ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+ new_pos.snapshot = iter->snapshot;
+
+ __bch2_btree_iter_set_pos(iter, new_pos);
+}
+
+static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
+{
+ BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS));
+ iter->pos = bkey_start_pos(&iter->k);
+}
+
+static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot)
+{
+ struct bpos pos = iter->pos;
+
+ iter->snapshot = snapshot;
+ pos.snapshot = snapshot;
+ bch2_btree_iter_set_pos(iter, pos);
+}
+
+void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
+
+static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
+ unsigned btree_id,
+ unsigned flags)
+{
+ if (flags & BTREE_ITER_ALL_LEVELS)
+ flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS;
+
+ if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
+ btree_id_is_extents(btree_id))
+ flags |= BTREE_ITER_IS_EXTENTS;
+
+ if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
+ !btree_type_has_snapshot_field(btree_id))
+ flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+
+ if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ btree_type_has_snapshots(btree_id))
+ flags |= BTREE_ITER_FILTER_SNAPSHOTS;
+
+ if (trans->journal_replay_not_finished)
+ flags |= BTREE_ITER_WITH_JOURNAL;
+
+ return flags;
+}
+
+static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
+ unsigned btree_id,
+ unsigned flags)
+{
+ if (!btree_id_cached(trans->c, btree_id)) {
+ flags &= ~BTREE_ITER_CACHED;
+ flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+ } else if (!(flags & BTREE_ITER_CACHED))
+ flags |= BTREE_ITER_WITH_KEY_CACHE;
+
+ return __bch2_btree_iter_flags(trans, btree_id, flags);
+}
+
+static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned locks_want,
+ unsigned depth,
+ unsigned flags,
+ unsigned long ip)
+{
+ memset(iter, 0, sizeof(*iter));
+ iter->trans = trans;
+ iter->btree_id = btree_id;
+ iter->flags = flags;
+ iter->snapshot = pos.snapshot;
+ iter->pos = pos;
+ iter->k.p = pos;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ iter->ip_allocated = ip;
+#endif
+ iter->path = bch2_path_get(trans, btree_id, iter->pos,
+ locks_want, depth, flags, ip);
+}
+
+void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *,
+ enum btree_id, struct bpos, unsigned);
+
+static inline void bch2_trans_iter_init(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags)
+{
+ if (__builtin_constant_p(btree_id) &&
+ __builtin_constant_p(flags))
+ bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
+ bch2_btree_iter_flags(trans, btree_id, flags),
+ _THIS_IP_);
+ else
+ bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags);
+}
+
+void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
+ enum btree_id, struct bpos,
+ unsigned, unsigned, unsigned);
+void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
+
+static inline void set_btree_iter_dontneed(struct btree_iter *iter)
+{
+ if (!iter->trans->restarted)
+ iter->path->preserve = false;
+}
+
+void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
+
+static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+{
+ size = roundup(size, 8);
+
+ if (likely(trans->mem_top + size <= trans->mem_bytes)) {
+ void *p = trans->mem + trans->mem_top;
+
+ trans->mem_top += size;
+ memset(p, 0, size);
+ return p;
+ } else {
+ return __bch2_trans_kmalloc(trans, size);
+ }
+}
+
+static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size)
+{
+ size = roundup(size, 8);
+
+ if (likely(trans->mem_top + size <= trans->mem_bytes)) {
+ void *p = trans->mem + trans->mem_top;
+
+ trans->mem_top += size;
+ return p;
+ } else {
+ return __bch2_trans_kmalloc(trans, size);
+ }
+}
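+
+/*
+ * Example (illustrative sketch): allocating a key from the transaction's
+ * bump allocator. The memory is reclaimed when the transaction is reset by
+ * bch2_trans_begin() or freed by bch2_trans_put(), so no explicit free is
+ * needed. example_alloc_key() is a hypothetical name.
+ */
+static inline struct bkey_i *example_alloc_key(struct btree_trans *trans)
+{
+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
+
+ if (!IS_ERR(k))
+ bkey_init(&k->k);
+ return k;
+}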
+
+static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags, unsigned type)
+{
+ struct bkey_s_c k;
+
+ bch2_trans_iter_init(trans, iter, btree_id, pos, flags);
+ k = bch2_btree_iter_peek_slot(iter);
+
+ if (!bkey_err(k) && type && k.k->type != type)
+ k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch);
+ if (unlikely(bkey_err(k)))
+ bch2_trans_iter_exit(trans, iter);
+ return k;
+}
+
+static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags)
+{
+ return __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags, 0);
+}
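+
+/*
+ * Example (illustrative sketch): the intended calling convention - on error
+ * __bch2_bkey_get_iter() has already exited the iterator, so the caller only
+ * exits it on the success path. example_get_key() is a hypothetical name.
+ */
+static inline int example_get_key(struct btree_trans *trans,
+ enum btree_id btree, struct bpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, pos, 0);
+ int ret = bkey_err(k);
+
+ if (ret)
+ return ret;
+
+ /* use k here, then: */
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}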
+
+#define bch2_bkey_get_iter_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\
+ bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \
+ _btree_id, _pos, _flags, KEY_TYPE_##_type))
+
+static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags, unsigned type,
+ unsigned val_size, void *val)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type);
+ ret = bkey_err(k);
+ if (!ret) {
+ unsigned b = min_t(unsigned, bkey_val_bytes(k.k), val_size);
+
+ memcpy(val, k.v, b);
+ if (unlikely(b < sizeof(*val)))
+ memset((void *) val + b, 0, sizeof(*val) - b);
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ return ret;
+}
+
+#define bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, _type, _val)\
+ __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \
+ KEY_TYPE_##_type, sizeof(*_val), _val)
+
+void bch2_trans_srcu_unlock(struct btree_trans *);
+void bch2_trans_srcu_lock(struct btree_trans *);
+
+u32 bch2_trans_begin(struct btree_trans *);
+
+/*
+ * XXX
+ * this does not handle transaction restarts from bch2_btree_iter_next_node()
+ * correctly
+ */
+#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \
+ _locks_want, _depth, _flags, _b, _ret) \
+ for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \
+ _start, _locks_want, _depth, _flags); \
+ (_b) = bch2_btree_iter_peek_node_and_restart(&(_iter)), \
+ !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \
+ (_b) = bch2_btree_iter_next_node(&(_iter)))
+
+#define for_each_btree_node(_trans, _iter, _btree_id, _start, \
+ _flags, _b, _ret) \
+ __for_each_btree_node(_trans, _iter, _btree_id, _start, \
+ 0, 0, _flags, _b, _ret)
+
+static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
+ unsigned flags)
+{
+ BUG_ON(flags & BTREE_ITER_ALL_LEVELS);
+
+ return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
+ bch2_btree_iter_peek_prev(iter);
+}
+
+static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
+ unsigned flags)
+{
+ return flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) :
+ flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
+ bch2_btree_iter_peek(iter);
+}
+
+static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter,
+ struct bpos end,
+ unsigned flags)
+{
+ if (!(flags & BTREE_ITER_SLOTS))
+ return bch2_btree_iter_peek_upto(iter, end);
+
+ if (bkey_gt(iter->pos, end))
+ return bkey_s_c_null;
+
+ return bch2_btree_iter_peek_slot(iter);
+}
+
+static inline int btree_trans_too_many_iters(struct btree_trans *trans)
+{
+ if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) {
+ trace_and_count(trans->c, trans_restart_too_many_iters, trans, _THIS_IP_);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
+ }
+
+ return 0;
+}
+
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
+
+static inline struct bkey_s_c
+__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
+ struct btree_iter *iter, unsigned flags)
+{
+ struct bkey_s_c k;
+
+ while (btree_trans_too_many_iters(trans) ||
+ (k = bch2_btree_iter_peek_type(iter, flags),
+ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
+ bch2_trans_begin(trans);
+
+ return k;
+}
+
+static inline struct bkey_s_c
+__bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos end,
+ unsigned flags)
+{
+ struct bkey_s_c k;
+
+ while (btree_trans_too_many_iters(trans) ||
+ (k = bch2_btree_iter_peek_upto_type(iter, end, flags),
+ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
+ bch2_trans_begin(trans);
+
+ return k;
+}
+
+#define lockrestart_do(_trans, _do) \
+({ \
+ u32 _restart_count; \
+ int _ret2; \
+ \
+ do { \
+ _restart_count = bch2_trans_begin(_trans); \
+ _ret2 = (_do); \
+ } while (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)); \
+ \
+ if (!_ret2) \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ \
+ _ret2; \
+})
+
+/*
+ * nested_lockrestart_do(), nested_commit_do():
+ *
+ * These are like lockrestart_do() and commit_do(), with two differences:
+ *
+ * - We don't call bch2_trans_begin() unless we had a transaction restart
+ * - We return -BCH_ERR_transaction_restart_nested if we succeeded after a
+ * transaction restart
+ */
+#define nested_lockrestart_do(_trans, _do) \
+({ \
+ u32 _restart_count, _orig_restart_count; \
+ int _ret2; \
+ \
+ _restart_count = _orig_restart_count = (_trans)->restart_count; \
+ \
+ while (bch2_err_matches(_ret2 = (_do), BCH_ERR_transaction_restart))\
+ _restart_count = bch2_trans_begin(_trans); \
+ \
+ if (!_ret2) \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ \
+ _ret2 ?: trans_was_restarted(_trans, _restart_count); \
+})
+
+#define for_each_btree_key2(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _do) \
+({ \
+ int _ret3 = 0; \
+ \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ while (1) { \
+ u32 _restart_count = bch2_trans_begin(_trans); \
+ \
+ _ret3 = 0; \
+ (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \
+ if (!(_k).k) \
+ break; \
+ \
+ _ret3 = bkey_err(_k) ?: (_do); \
+ if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
+ continue; \
+ if (_ret3) \
+ break; \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ if (!bch2_btree_iter_advance(&(_iter))) \
+ break; \
+ } \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret3; \
+})
+
+#define for_each_btree_key2_upto(_trans, _iter, _btree_id, \
+ _start, _end, _flags, _k, _do) \
+({ \
+ int _ret3 = 0; \
+ \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ while (1) { \
+ u32 _restart_count = bch2_trans_begin(_trans); \
+ \
+ _ret3 = 0; \
+ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\
+ if (!(_k).k) \
+ break; \
+ \
+ _ret3 = bkey_err(_k) ?: (_do); \
+ if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
+ continue; \
+ if (_ret3) \
+ break; \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ if (!bch2_btree_iter_advance(&(_iter))) \
+ break; \
+ } \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret3; \
+})
+
+#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _do) \
+({ \
+ int _ret3 = 0; \
+ \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ while (1) { \
+ u32 _restart_count = bch2_trans_begin(_trans); \
+ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\
+ if (!(_k).k) { \
+ _ret3 = 0; \
+ break; \
+ } \
+ \
+ _ret3 = bkey_err(_k) ?: (_do); \
+ if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
+ continue; \
+ if (_ret3) \
+ break; \
+ bch2_trans_verify_not_restarted(_trans, _restart_count);\
+ if (!bch2_btree_iter_rewind(&(_iter))) \
+ break; \
+ } \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret3; \
+})
+
+#define for_each_btree_key_commit(_trans, _iter, _btree_id, \
+ _start, _iter_flags, _k, \
+ _disk_res, _journal_seq, _commit_flags,\
+ _do) \
+ for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
+ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_commit_flags)))
+
+#define for_each_btree_key_reverse_commit(_trans, _iter, _btree_id, \
+ _start, _iter_flags, _k, \
+ _disk_res, _journal_seq, _commit_flags,\
+ _do) \
+ for_each_btree_key_reverse(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
+ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_commit_flags)))
+
+#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \
+ _start, _end, _iter_flags, _k, \
+ _disk_res, _journal_seq, _commit_flags,\
+ _do) \
+ for_each_btree_key2_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
+ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_commit_flags)))
+
+#define for_each_btree_key(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
+
+#define for_each_btree_key_upto(_trans, _iter, _btree_id, \
+ _start, _end, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = __bch2_btree_iter_peek_upto_and_restart((_trans), \
+ &(_iter), _end, _flags),\
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
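+
+/*
+ * Example (illustrative sketch): count the keys in [start, end] with
+ * for_each_btree_key_upto(). The macro initializes the iterator and retries
+ * on transaction restart; the caller still exits the iterator.
+ * example_count_keys() is a hypothetical name.
+ */
+static inline int example_count_keys(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bpos start, struct bpos end,
+ u64 *nr)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ *nr = 0;
+ for_each_btree_key_upto(trans, iter, btree, start, end, 0, k, ret)
+ (*nr)++;
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}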
+
+#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
+
+#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \
+ _start, _end, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
+
+#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \
+ for (; \
+ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
+
+#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
+ for (; \
+ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
+
+#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\
+ for (; \
+ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
+
+#define drop_locks_do(_trans, _do) \
+({ \
+ bch2_trans_unlock(_trans); \
+ _do ?: bch2_trans_relock(_trans); \
+})
+
+#define allocate_dropping_locks_errcode(_trans, _do) \
+({ \
+ gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
+ int _ret = _do; \
+ \
+ if (bch2_err_matches(_ret, ENOMEM)) { \
+ _gfp = GFP_KERNEL; \
+ _ret = drop_locks_do(_trans, _do); \
+ } \
+ _ret; \
+})
+
+#define allocate_dropping_locks(_trans, _ret, _do) \
+({ \
+ gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \
+ typeof(_do) _p = _do; \
+ \
+ _ret = 0; \
+ if (unlikely(!_p)) { \
+ _gfp = GFP_KERNEL; \
+ _ret = drop_locks_do(_trans, ((_p = _do), 0)); \
+ } \
+ _p; \
+})
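+
+/*
+ * Example (illustrative sketch): a GFP_NOWAIT allocation that falls back to
+ * dropping btree locks and retrying with GFP_KERNEL, as encoded by
+ * allocate_dropping_locks() above. The expression passed in must use the
+ * macro's _gfp variable. example_alloc_buf() is a hypothetical name.
+ */
+static inline void *example_alloc_buf(struct btree_trans *trans, size_t size,
+ int *ret)
+{
+ return allocate_dropping_locks(trans, *ret, kmalloc(size, _gfp));
+}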
+
+/* new multiple iterator interface: */
+
+void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
+void bch2_btree_path_to_text(struct printbuf *, struct btree_path *);
+void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
+void bch2_dump_trans_updates(struct btree_trans *);
+void bch2_dump_trans_paths_updates(struct btree_trans *);
+
+struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
+void bch2_trans_put(struct btree_trans *);
+
+extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
+unsigned bch2_trans_get_fn_idx(const char *);
+
+#define bch2_trans_get(_c) \
+({ \
+ static unsigned trans_fn_idx; \
+ \
+ if (unlikely(!trans_fn_idx)) \
+ trans_fn_idx = bch2_trans_get_fn_idx(__func__); \
+ __bch2_trans_get(_c, trans_fn_idx); \
+})
+
+void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
+
+void bch2_fs_btree_iter_exit(struct bch_fs *);
+void bch2_fs_btree_iter_init_early(struct bch_fs *);
+int bch2_fs_btree_iter_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_ITER_H */
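For context, a hypothetical user of the interface declared above; the btree id, flags and loop body are placeholders, and transaction restarts are absorbed by the *_and_restart peek helpers the macro expands to.

static int example_walk(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
				POS_MIN, SPOS_MAX, 0, k, ret) {
		/* inspect k */
	}
	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);
	return ret;
}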
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
new file mode 100644
index 000000000000..ec52f50d249d
--- /dev/null
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -0,0 +1,543 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bset.h"
+#include "btree_journal_iter.h"
+#include "journal_io.h"
+
+#include <linux/sort.h>
+
+/*
+ * For managing keys we read from the journal: until journal replay has
+ * finished, normal btree lookups need to be able to find and return keys from
+ * the journal where they overwrite what's in the btree, so we have a special
+ * iterator and operations for the regular btree iter code to use:
+ */
+
+static int __journal_key_cmp(enum btree_id l_btree_id,
+ unsigned l_level,
+ struct bpos l_pos,
+ const struct journal_key *r)
+{
+ return (cmp_int(l_btree_id, r->btree_id) ?:
+ cmp_int(l_level, r->level) ?:
+ bpos_cmp(l_pos, r->k->k.p));
+}
+
+static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
+{
+ return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
+}
+
+static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
+{
+ size_t gap_size = keys->size - keys->nr;
+
+ if (idx >= keys->gap)
+ idx += gap_size;
+ return idx;
+}
+
+static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
+{
+ return keys->d + idx_to_pos(keys, idx);
+}
+
+static size_t __bch2_journal_key_search(struct journal_keys *keys,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
+{
+ size_t l = 0, r = keys->nr, m;
+
+ while (l < r) {
+ m = l + ((r - l) >> 1);
+ if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
+ l = m + 1;
+ else
+ r = m;
+ }
+
+ BUG_ON(l < keys->nr &&
+ __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
+
+ BUG_ON(l &&
+ __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
+
+ return l;
+}
+
+static size_t bch2_journal_key_search(struct journal_keys *keys,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
+{
+ return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
+}
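The journal keys live in a gap buffer: keys->d has keys->size slots holding keys->nr sorted keys, and the unused slots form a single hole starting at logical index keys->gap, so the binary search above runs over logical indices and idx_to_pos() skips the hole. A standalone sketch of that mapping (illustrative, not the kernel helper):

static size_t example_idx_to_pos(size_t idx, size_t gap, size_t nr, size_t size)
{
	size_t gap_size = size - nr;	/* number of unused slots */

	/* e.g. size 8, nr 6, gap 2: logical 0..5 map to slots 0, 1, 4, 5, 6, 7 */
	return idx >= gap ? idx + gap_size : idx;
}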
+
+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos,
+ struct bpos end_pos, size_t *idx)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ unsigned iters = 0;
+ struct journal_key *k;
+
+ BUG_ON(*idx > keys->nr);
+search:
+ if (!*idx)
+ *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
+
+ while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
+ if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
+ return NULL;
+
+ if (__journal_key_cmp(btree_id, level, pos, k) <= 0 &&
+ !k->overwritten)
+ return k->k;
+
+ (*idx)++;
+ iters++;
+ if (iters == 10) {
+ *idx = 0;
+ goto search;
+ }
+ }
+
+ return NULL;
+}
+
+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, struct bpos pos)
+{
+ size_t idx = 0;
+
+ return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
+}
+
+static void journal_iters_fix(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ /* The key we just inserted is immediately before the gap: */
+ size_t gap_end = keys->gap + (keys->size - keys->nr);
+ struct btree_and_journal_iter *iter;
+
+ /*
+ * If an iterator points one after the key we just inserted, decrement
+ * the iterator so it points at the key we just inserted - if the
+ * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
+ * handle that:
+ */
+ list_for_each_entry(iter, &c->journal_iters, journal.list)
+ if (iter->journal.idx == gap_end)
+ iter->journal.idx = keys->gap - 1;
+}
+
+static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_iter *iter;
+ size_t gap_size = keys->size - keys->nr;
+
+ list_for_each_entry(iter, &c->journal_iters, list) {
+ if (iter->idx > old_gap)
+ iter->idx -= gap_size;
+ if (iter->idx >= new_gap)
+ iter->idx += gap_size;
+ }
+}
+
+int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
+ unsigned level, struct bkey_i *k)
+{
+ struct journal_key n = {
+ .btree_id = id,
+ .level = level,
+ .k = k,
+ .allocated = true,
+ /*
+ * Ensure these keys are done last by journal replay, to unblock
+ * journal reclaim:
+ */
+ .journal_seq = U32_MAX,
+ };
+ struct journal_keys *keys = &c->journal_keys;
+ size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
+
+ BUG_ON(test_bit(BCH_FS_RW, &c->flags));
+
+ if (idx < keys->size &&
+ journal_key_cmp(&n, &keys->d[idx]) == 0) {
+ if (keys->d[idx].allocated)
+ kfree(keys->d[idx].k);
+ keys->d[idx] = n;
+ return 0;
+ }
+
+ if (idx > keys->gap)
+ idx -= keys->size - keys->nr;
+
+ if (keys->nr == keys->size) {
+ struct journal_keys new_keys = {
+ .nr = keys->nr,
+ .size = max_t(size_t, keys->size, 8) * 2,
+ };
+
+ new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
+ if (!new_keys.d) {
+ bch_err(c, "%s: error allocating new key array (size %zu)",
+ __func__, new_keys.size);
+ return -BCH_ERR_ENOMEM_journal_key_insert;
+ }
+
+ /* Since @keys was full, there was no gap: */
+ memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
+ kvfree(keys->d);
+ keys->d = new_keys.d;
+ keys->nr = new_keys.nr;
+ keys->size = new_keys.size;
+
+ /* And now the gap is at the end: */
+ keys->gap = keys->nr;
+ }
+
+ journal_iters_move_gap(c, keys->gap, idx);
+
+ move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
+ keys->gap = idx;
+
+ keys->nr++;
+ keys->d[keys->gap++] = n;
+
+ journal_iters_fix(c);
+
+ return 0;
+}
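The insert path relies on move_gap() (defined elsewhere in bcachefs) to bring the hole to the insertion point before the new key is written; a conceptual sketch, assuming move_gap() only memmoves the elements between the old and new gap positions:

/*
 *   before:  [A B C . . D E]   two free slots starting at index 3; insert X at 2
 *   move:    [A B . . C D E]   move_gap() shifts C right, hole now starts at 2
 *   write:   [A B X . C D E]   new key written at keys->gap, which advances by one
 */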
+
+/*
+ * Can only be used from the recovery thread while we're still RO - can't be
+ * used once we've got RW, as journal_keys is at that point used by multiple
+ * threads:
+ */
+int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
+ unsigned level, struct bkey_i *k)
+{
+ struct bkey_i *n;
+ int ret;
+
+ n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
+ if (!n)
+ return -BCH_ERR_ENOMEM_journal_key_insert;
+
+ bkey_copy(n, k);
+ ret = bch2_journal_key_insert_take(c, id, level, n);
+ if (ret)
+ kfree(n);
+ return ret;
+}
+
+int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
+ unsigned level, struct bpos pos)
+{
+ struct bkey_i whiteout;
+
+ bkey_init(&whiteout.k);
+ whiteout.k.p = pos;
+
+ return bch2_journal_key_insert(c, id, level, &whiteout);
+}
+
+void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
+ unsigned level, struct bpos pos)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ size_t idx = bch2_journal_key_search(keys, btree, level, pos);
+
+ if (idx < keys->size &&
+ keys->d[idx].btree_id == btree &&
+ keys->d[idx].level == level &&
+ bpos_eq(keys->d[idx].k->k.p, pos))
+ keys->d[idx].overwritten = true;
+}
+
+static void bch2_journal_iter_advance(struct journal_iter *iter)
+{
+ if (iter->idx < iter->keys->size) {
+ iter->idx++;
+ if (iter->idx == iter->keys->gap)
+ iter->idx += iter->keys->size - iter->keys->nr;
+ }
+}
+
+static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
+{
+ struct journal_key *k = iter->keys->d + iter->idx;
+
+ while (k < iter->keys->d + iter->keys->size &&
+ k->btree_id == iter->btree_id &&
+ k->level == iter->level) {
+ if (!k->overwritten)
+ return bkey_i_to_s_c(k->k);
+
+ bch2_journal_iter_advance(iter);
+ k = iter->keys->d + iter->idx;
+ }
+
+ return bkey_s_c_null;
+}
+
+static void bch2_journal_iter_exit(struct journal_iter *iter)
+{
+ list_del(&iter->list);
+}
+
+static void bch2_journal_iter_init(struct bch_fs *c,
+ struct journal_iter *iter,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
+{
+ iter->btree_id = id;
+ iter->level = level;
+ iter->keys = &c->journal_keys;
+ iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
+}
+
+static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
+{
+ return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
+ iter->b, &iter->unpacked);
+}
+
+static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
+{
+ bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
+}
+
+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
+{
+ if (bpos_eq(iter->pos, SPOS_MAX))
+ iter->at_end = true;
+ else
+ iter->pos = bpos_successor(iter->pos);
+}
+
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
+{
+ struct bkey_s_c btree_k, journal_k, ret;
+again:
+ if (iter->at_end)
+ return bkey_s_c_null;
+
+ while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
+ bpos_lt(btree_k.k->p, iter->pos))
+ bch2_journal_iter_advance_btree(iter);
+
+ while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
+ bpos_lt(journal_k.k->p, iter->pos))
+ bch2_journal_iter_advance(&iter->journal);
+
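+ /* journal keys win ties: at the same position, the journal key overrides the btree key */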
+ ret = journal_k.k &&
+ (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
+ ? journal_k
+ : btree_k;
+
+ if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
+ ret = bkey_s_c_null;
+
+ if (ret.k) {
+ iter->pos = ret.k->p;
+ if (bkey_deleted(ret.k)) {
+ bch2_btree_and_journal_iter_advance(iter);
+ goto again;
+ }
+ } else {
+ iter->pos = SPOS_MAX;
+ iter->at_end = true;
+ }
+
+ return ret;
+}
+
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
+{
+ bch2_journal_iter_exit(&iter->journal);
+}
+
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+ struct bch_fs *c,
+ struct btree *b,
+ struct btree_node_iter node_iter,
+ struct bpos pos)
+{
+ memset(iter, 0, sizeof(*iter));
+
+ iter->b = b;
+ iter->node_iter = node_iter;
+ bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
+ INIT_LIST_HEAD(&iter->journal.list);
+ iter->pos = b->data->min_key;
+ iter->at_end = false;
+}
+
+/*
+ * This version is used by btree_gc before the filesystem has gone RW and
+ * multithreaded, so it uses the journal_iters list:
+ */
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+ struct bch_fs *c,
+ struct btree *b)
+{
+ struct btree_node_iter node_iter;
+
+ bch2_btree_node_iter_init_from_start(&node_iter, b);
+ __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
+ list_add(&iter->journal.list, &c->journal_iters);
+}
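A hedged sketch of how the combined iterator is consumed during recovery, mirroring the btree_gc pattern; the processing body is a placeholder.

static void example_scan_node(struct bch_fs *c, struct btree *b)
{
	struct btree_and_journal_iter iter;
	struct bkey_s_c k;

	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		/* process k: journal keys already override btree keys here */
		bch2_btree_and_journal_iter_advance(&iter);
	}

	bch2_btree_and_journal_iter_exit(&iter);
}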
+
+/* sort and dedup all keys in the journal: */
+
+void bch2_journal_entries_free(struct bch_fs *c)
+{
+ struct journal_replay **i;
+ struct genradix_iter iter;
+
+ genradix_for_each(&c->journal_entries, iter, i)
+ if (*i)
+ kvpfree(*i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&(*i)->j));
+ genradix_free(&c->journal_entries);
+}
+
+/*
+ * When keys compare equal, oldest compares first:
+ */
+static int journal_sort_key_cmp(const void *_l, const void *_r)
+{
+ const struct journal_key *l = _l;
+ const struct journal_key *r = _r;
+
+ return journal_key_cmp(l, r) ?:
+ cmp_int(l->journal_seq, r->journal_seq) ?:
+ cmp_int(l->journal_offset, r->journal_offset);
+}
+
+void bch2_journal_keys_put(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_key *i;
+
+ BUG_ON(atomic_read(&keys->ref) <= 0);
+
+ if (!atomic_dec_and_test(&keys->ref))
+ return;
+
+ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+ keys->gap = keys->nr;
+
+ for (i = keys->d; i < keys->d + keys->nr; i++)
+ if (i->allocated)
+ kfree(i->k);
+
+ kvfree(keys->d);
+ keys->d = NULL;
+ keys->nr = keys->gap = keys->size = 0;
+
+ bch2_journal_entries_free(c);
+}
+
+static void __journal_keys_sort(struct journal_keys *keys)
+{
+ struct journal_key *src, *dst;
+
+ sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);
+
+ src = dst = keys->d;
+ while (src < keys->d + keys->nr) {
+ while (src + 1 < keys->d + keys->nr &&
+ src[0].btree_id == src[1].btree_id &&
+ src[0].level == src[1].level &&
+ bpos_eq(src[0].k->k.p, src[1].k->k.p))
+ src++;
+
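+ /* among duplicates, keep only the last, i.e. newest, version */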
+ *dst++ = *src++;
+ }
+
+ keys->nr = dst - keys->d;
+}
+
+int bch2_journal_keys_sort(struct bch_fs *c)
+{
+ struct genradix_iter iter;
+ struct journal_replay *i, **_i;
+ struct jset_entry *entry;
+ struct bkey_i *k;
+ struct journal_keys *keys = &c->journal_keys;
+ size_t nr_keys = 0, nr_read = 0;
+
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ for_each_jset_key(k, entry, &i->j)
+ nr_keys++;
+ }
+
+ if (!nr_keys)
+ return 0;
+
+ keys->size = roundup_pow_of_two(nr_keys);
+
+ keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
+ if (!keys->d) {
+ bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
+ nr_keys);
+
+ do {
+ keys->size >>= 1;
+ keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
+ } while (!keys->d && keys->size > nr_keys / 8);
+
+ if (!keys->d) {
+ bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
+ keys->size);
+ return -BCH_ERR_ENOMEM_journal_keys_sort;
+ }
+ }
+
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ cond_resched();
+
+ for_each_jset_key(k, entry, &i->j) {
+ if (keys->nr == keys->size) {
+ __journal_keys_sort(keys);
+
+ if (keys->nr > keys->size * 7 / 8) {
+ bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
+ keys->nr, keys->size, nr_read, nr_keys);
+ return -BCH_ERR_ENOMEM_journal_keys_sort;
+ }
+ }
+
+ keys->d[keys->nr++] = (struct journal_key) {
+ .btree_id = entry->btree_id,
+ .level = entry->level,
+ .k = k,
+ .journal_seq = le64_to_cpu(i->j.seq),
+ .journal_offset = k->_data - i->j._data,
+ };
+
+ nr_read++;
+ }
+ }
+
+ __journal_keys_sort(keys);
+ keys->gap = keys->nr;
+
+ bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
+ return 0;
+}
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
new file mode 100644
index 000000000000..8ca4c100b2e3
--- /dev/null
+++ b/fs/bcachefs/btree_journal_iter.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H
+#define _BCACHEFS_BTREE_JOURNAL_ITER_H
+
+struct journal_iter {
+ struct list_head list;
+ enum btree_id btree_id;
+ unsigned level;
+ size_t idx;
+ struct journal_keys *keys;
+};
+
+/*
+ * Iterate over keys in the btree, with keys from the journal overlaid on top:
+ */
+
+struct btree_and_journal_iter {
+ struct btree *b;
+ struct btree_node_iter node_iter;
+ struct bkey unpacked;
+
+ struct journal_iter journal;
+ struct bpos pos;
+ bool at_end;
+};
+
+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos, struct bpos, size_t *);
+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos);
+
+int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
+ unsigned, struct bkey_i *);
+int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
+ unsigned, struct bkey_i *);
+int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos);
+void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos);
+
+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
+
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+ struct bch_fs *, struct btree *,
+ struct btree_node_iter, struct bpos);
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+ struct bch_fs *,
+ struct btree *);
+
+void bch2_journal_keys_put(struct bch_fs *);
+
+static inline void bch2_journal_keys_put_initial(struct bch_fs *c)
+{
+ if (c->journal_keys.initial_ref_held)
+ bch2_journal_keys_put(c);
+ c->journal_keys.initial_ref_held = false;
+}
+
+void bch2_journal_entries_free(struct bch_fs *);
+
+int bch2_journal_keys_sort(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
new file mode 100644
index 000000000000..1b7a5668df7c
--- /dev/null
+++ b/fs/bcachefs/btree_key_cache.c
@@ -0,0 +1,1074 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "btree_key_cache.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "errcode.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "trace.h"
+
+#include <linux/sched/mm.h>
+
+static inline bool btree_uses_pcpu_readers(enum btree_id id)
+{
+ return id == BTREE_ID_subvolumes;
+}
+
+static struct kmem_cache *bch2_key_cache;
+
+static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const struct bkey_cached *ck = obj;
+ const struct bkey_cached_key *key = arg->key;
+
+ return ck->key.btree_id != key->btree_id ||
+ !bpos_eq(ck->key.pos, key->pos);
+}
+
+static const struct rhashtable_params bch2_btree_key_cache_params = {
+ .head_offset = offsetof(struct bkey_cached, hash),
+ .key_offset = offsetof(struct bkey_cached, key),
+ .key_len = sizeof(struct bkey_cached_key),
+ .obj_cmpfn = bch2_btree_key_cache_cmp_fn,
+};
+
+__flatten
+inline struct bkey_cached *
+bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
+{
+ struct bkey_cached_key key = {
+ .btree_id = btree_id,
+ .pos = pos,
+ };
+
+ return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
+ bch2_btree_key_cache_params);
+}
+
+static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
+{
+ if (!six_trylock_intent(&ck->c.lock))
+ return false;
+
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ six_unlock_intent(&ck->c.lock);
+ return false;
+ }
+
+ if (!six_trylock_write(&ck->c.lock)) {
+ six_unlock_intent(&ck->c.lock);
+ return false;
+ }
+
+ return true;
+}
+
+static void bkey_cached_evict(struct btree_key_cache *c,
+ struct bkey_cached *ck)
+{
+ BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
+ bch2_btree_key_cache_params));
+ memset(&ck->key, ~0, sizeof(ck->key));
+
+ atomic_long_dec(&c->nr_keys);
+}
+
+static void bkey_cached_free(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
+{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+
+ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
+
+ ck->btree_trans_barrier_seq =
+ start_poll_synchronize_srcu(&c->btree_trans_barrier);
+
+ if (ck->c.lock.readers) {
+ list_move_tail(&ck->list, &bc->freed_pcpu);
+ bc->nr_freed_pcpu++;
+ } else {
+ list_move_tail(&ck->list, &bc->freed_nonpcpu);
+ bc->nr_freed_nonpcpu++;
+ }
+ atomic_long_inc(&bc->nr_freed);
+
+ kfree(ck->k);
+ ck->k = NULL;
+ ck->u64s = 0;
+
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
+}
+
+#ifdef __KERNEL__
+static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
+{
+ struct bkey_cached *pos;
+
+ bc->nr_freed_nonpcpu++;
+
+ list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
+ if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
+ pos->btree_trans_barrier_seq)) {
+ list_move(&ck->list, &pos->list);
+ return;
+ }
+ }
+
+ list_move(&ck->list, &bc->freed_nonpcpu);
+}
+#endif
+
+static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
+{
+ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
+
+ if (!ck->c.lock.readers) {
+#ifdef __KERNEL__
+ struct btree_key_cache_freelist *f;
+ bool freed = false;
+
+ preempt_disable();
+ f = this_cpu_ptr(bc->pcpu_freed);
+
+ if (f->nr < ARRAY_SIZE(f->objs)) {
+ f->objs[f->nr++] = ck;
+ freed = true;
+ }
+ preempt_enable();
+
+ if (!freed) {
+ mutex_lock(&bc->lock);
+ preempt_disable();
+ f = this_cpu_ptr(bc->pcpu_freed);
+
+ while (f->nr > ARRAY_SIZE(f->objs) / 2) {
+ struct bkey_cached *ck2 = f->objs[--f->nr];
+
+ __bkey_cached_move_to_freelist_ordered(bc, ck2);
+ }
+ preempt_enable();
+
+ __bkey_cached_move_to_freelist_ordered(bc, ck);
+ mutex_unlock(&bc->lock);
+ }
+#else
+ mutex_lock(&bc->lock);
+ list_move_tail(&ck->list, &bc->freed_nonpcpu);
+ bc->nr_freed_nonpcpu++;
+ mutex_unlock(&bc->lock);
+#endif
+ } else {
+ mutex_lock(&bc->lock);
+ list_move_tail(&ck->list, &bc->freed_pcpu);
+ mutex_unlock(&bc->lock);
+ }
+}
+
+static void bkey_cached_free_fast(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
+{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+
+ ck->btree_trans_barrier_seq =
+ start_poll_synchronize_srcu(&c->btree_trans_barrier);
+
+ list_del_init(&ck->list);
+ atomic_long_inc(&bc->nr_freed);
+
+ kfree(ck->k);
+ ck->k = NULL;
+ ck->u64s = 0;
+
+ bkey_cached_move_to_freelist(bc, ck);
+
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
+}
+
+static struct bkey_cached *
+bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
+ bool *was_new)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_key_cache *bc = &c->btree_key_cache;
+ struct bkey_cached *ck = NULL;
+ bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
+ int ret;
+
+ if (!pcpu_readers) {
+#ifdef __KERNEL__
+ struct btree_key_cache_freelist *f;
+
+ preempt_disable();
+ f = this_cpu_ptr(bc->pcpu_freed);
+ if (f->nr)
+ ck = f->objs[--f->nr];
+ preempt_enable();
+
+ if (!ck) {
+ mutex_lock(&bc->lock);
+ preempt_disable();
+ f = this_cpu_ptr(bc->pcpu_freed);
+
+ while (!list_empty(&bc->freed_nonpcpu) &&
+ f->nr < ARRAY_SIZE(f->objs) / 2) {
+ ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
+ list_del_init(&ck->list);
+ bc->nr_freed_nonpcpu--;
+ f->objs[f->nr++] = ck;
+ }
+
+ ck = f->nr ? f->objs[--f->nr] : NULL;
+ preempt_enable();
+ mutex_unlock(&bc->lock);
+ }
+#else
+ mutex_lock(&bc->lock);
+ if (!list_empty(&bc->freed_nonpcpu)) {
+ ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
+ list_del_init(&ck->list);
+ bc->nr_freed_nonpcpu--;
+ }
+ mutex_unlock(&bc->lock);
+#endif
+ } else {
+ mutex_lock(&bc->lock);
+ if (!list_empty(&bc->freed_pcpu)) {
+ ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
+ list_del_init(&ck->list);
+ }
+ mutex_unlock(&bc->lock);
+ }
+
+ if (ck) {
+ ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
+ if (unlikely(ret)) {
+ bkey_cached_move_to_freelist(bc, ck);
+ return ERR_PTR(ret);
+ }
+
+ path->l[0].b = (void *) ck;
+ path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
+ mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
+
+ ret = bch2_btree_node_lock_write(trans, path, &ck->c);
+ if (unlikely(ret)) {
+ btree_node_unlock(trans, path, 0);
+ bkey_cached_move_to_freelist(bc, ck);
+ return ERR_PTR(ret);
+ }
+
+ return ck;
+ }
+
+ ck = allocate_dropping_locks(trans, ret,
+ kmem_cache_zalloc(bch2_key_cache, _gfp));
+ if (ret) {
+ kmem_cache_free(bch2_key_cache, ck);
+ return ERR_PTR(ret);
+ }
+
+ if (!ck)
+ return NULL;
+
+ INIT_LIST_HEAD(&ck->list);
+ bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
+
+ ck->c.cached = true;
+ BUG_ON(!six_trylock_intent(&ck->c.lock));
+ BUG_ON(!six_trylock_write(&ck->c.lock));
+ *was_new = true;
+ return ck;
+}
+
+static struct bkey_cached *
+bkey_cached_reuse(struct btree_key_cache *c)
+{
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct bkey_cached *ck;
+ unsigned i;
+
+ mutex_lock(&c->lock);
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(c->table.tbl, &c->table);
+ for (i = 0; i < tbl->size; i++)
+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+ bkey_cached_lock_for_evict(ck)) {
+ bkey_cached_evict(c, ck);
+ goto out;
+ }
+ }
+ ck = NULL;
+out:
+ rcu_read_unlock();
+ mutex_unlock(&c->lock);
+ return ck;
+}
+
+static struct bkey_cached *
+btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_key_cache *bc = &c->btree_key_cache;
+ struct bkey_cached *ck;
+ bool was_new = false;
+
+ ck = bkey_cached_alloc(trans, path, &was_new);
+ if (IS_ERR(ck))
+ return ck;
+
+ if (unlikely(!ck)) {
+ ck = bkey_cached_reuse(bc);
+ if (unlikely(!ck)) {
+ bch_err(c, "error allocating memory for key cache item, btree %s",
+ bch2_btree_id_str(path->btree_id));
+ return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
+ }
+
+ mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
+ }
+
+ ck->c.level = 0;
+ ck->c.btree_id = path->btree_id;
+ ck->key.btree_id = path->btree_id;
+ ck->key.pos = path->pos;
+ ck->valid = false;
+ ck->flags = 1U << BKEY_CACHED_ACCESSED;
+
+ if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
+ &ck->hash,
+ bch2_btree_key_cache_params))) {
+ /* We raced with another fill: */
+
+ if (likely(was_new)) {
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
+ kfree(ck);
+ } else {
+ bkey_cached_free_fast(bc, ck);
+ }
+
+ mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
+ return NULL;
+ }
+
+ atomic_long_inc(&bc->nr_keys);
+
+ six_unlock_write(&ck->c.lock);
+
+ return ck;
+}
+
+static int btree_key_cache_fill(struct btree_trans *trans,
+ struct btree_path *ck_path,
+ struct bkey_cached *ck)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ unsigned new_u64s = 0;
+ struct bkey_i *new_k = NULL;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos,
+ BTREE_ITER_KEY_CACHE_FILL|
+ BTREE_ITER_CACHED_NOFILL);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!bch2_btree_node_relock(trans, ck_path, 0)) {
+ trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
+ goto err;
+ }
+
+ /*
+ * bch2_varint_decode can read past the end of the buffer by at
+ * most 7 bytes (the extra bytes read are never used):
+ */
+ new_u64s = k.k->u64s + 1;
+
+ /*
+ * Allocate some extra space so that the transaction commit path is less
+ * likely to have to reallocate, since that requires a transaction
+ * restart:
+ */
+ new_u64s = min(256U, (new_u64s * 3) / 2);
+
+ if (new_u64s > ck->u64s) {
+ new_u64s = roundup_pow_of_two(new_u64s);
+ new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
+ if (!new_k) {
+ bch2_trans_unlock(trans);
+
+ new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
+ if (!new_k) {
+ bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch2_btree_id_str(ck->key.btree_id), new_u64s);
+ ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
+ goto err;
+ }
+
+ if (!bch2_btree_node_relock(trans, ck_path, 0)) {
+ kfree(new_k);
+ trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
+ goto err;
+ }
+
+ ret = bch2_trans_relock(trans);
+ if (ret) {
+ kfree(new_k);
+ goto err;
+ }
+ }
+ }
+
+ ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
+ if (ret) {
+ kfree(new_k);
+ goto err;
+ }
+
+ if (new_k) {
+ kfree(ck->k);
+ ck->u64s = new_u64s;
+ ck->k = new_k;
+ }
+
+ bkey_reassemble(ck->k, k);
+ ck->valid = true;
+ bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);
+
+ /* We're not likely to need this iterator again: */
+ set_btree_iter_dontneed(&iter);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static noinline int
+bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_cached *ck;
+ int ret = 0;
+
+ BUG_ON(path->level);
+
+ path->l[1].b = NULL;
+
+ if (bch2_btree_node_relock_notrace(trans, path, 0)) {
+ ck = (void *) path->l[0].b;
+ goto fill;
+ }
+retry:
+ ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
+ if (!ck) {
+ ck = btree_key_cache_create(trans, path);
+ ret = PTR_ERR_OR_ZERO(ck);
+ if (ret)
+ goto err;
+ if (!ck)
+ goto retry;
+
+ mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
+ path->locks_want = 1;
+ } else {
+ enum six_lock_type lock_want = __btree_lock_want(path, 0);
+
+ ret = btree_node_lock(trans, path, (void *) ck, 0,
+ lock_want, _THIS_IP_);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto err;
+
+ BUG_ON(ret);
+
+ if (ck->key.btree_id != path->btree_id ||
+ !bpos_eq(ck->key.pos, path->pos)) {
+ six_unlock_type(&ck->c.lock, lock_want);
+ goto retry;
+ }
+
+ mark_btree_node_locked(trans, path, 0,
+ (enum btree_node_locked_type) lock_want);
+ }
+
+ path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
+ path->l[0].b = (void *) ck;
+fill:
+ path->uptodate = BTREE_ITER_UPTODATE;
+
+ if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
+ /*
+ * Using the underscore version because we haven't set
+ * path->uptodate yet:
+ */
+ if (!path->locks_want &&
+ !__bch2_btree_path_upgrade(trans, path, 1, NULL)) {
+ trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
+ goto err;
+ }
+
+ ret = btree_key_cache_fill(trans, path, ck);
+ if (ret)
+ goto err;
+
+ ret = bch2_btree_path_relock(trans, path, _THIS_IP_);
+ if (ret)
+ goto err;
+
+ path->uptodate = BTREE_ITER_UPTODATE;
+ }
+
+ if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+
+ BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
+ BUG_ON(path->uptodate);
+
+ return ret;
+err:
+ path->uptodate = BTREE_ITER_NEED_TRAVERSE;
+ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ btree_node_unlock(trans, path, 0);
+ path->l[0].b = ERR_PTR(ret);
+ }
+ return ret;
+}
+
+int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_cached *ck;
+ int ret = 0;
+
+ EBUG_ON(path->level);
+
+ path->l[1].b = NULL;
+
+ if (bch2_btree_node_relock_notrace(trans, path, 0)) {
+ ck = (void *) path->l[0].b;
+ goto fill;
+ }
+retry:
+ ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
+ if (!ck) {
+ return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
+ } else {
+ enum six_lock_type lock_want = __btree_lock_want(path, 0);
+
+ ret = btree_node_lock(trans, path, (void *) ck, 0,
+ lock_want, _THIS_IP_);
+ EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));
+
+ if (ret)
+ return ret;
+
+ if (ck->key.btree_id != path->btree_id ||
+ !bpos_eq(ck->key.pos, path->pos)) {
+ six_unlock_type(&ck->c.lock, lock_want);
+ goto retry;
+ }
+
+ mark_btree_node_locked(trans, path, 0,
+ (enum btree_node_locked_type) lock_want);
+ }
+
+ path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
+ path->l[0].b = (void *) ck;
+fill:
+ if (!ck->valid)
+ return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
+
+ if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+
+ path->uptodate = BTREE_ITER_UPTODATE;
+ EBUG_ON(!ck->valid);
+ EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
+
+ return ret;
+}
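For reference, a hypothetical lookup that exercises this path: passing BTREE_ITER_CACHED makes the btree path code call the traverse functions above instead of descending the btree directly (the btree id chosen here is only an example).

static int example_cached_lookup(struct btree_trans *trans, struct bpos pos)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
					       pos, BTREE_ITER_CACHED);
	int ret = bkey_err(k);

	/* use k here when !ret */
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}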
+
+static int btree_key_cache_flush_pos(struct btree_trans *trans,
+ struct bkey_cached_key key,
+ u64 journal_seq,
+ unsigned commit_flags,
+ bool evict)
+{
+ struct bch_fs *c = trans->c;
+ struct journal *j = &c->journal;
+ struct btree_iter c_iter, b_iter;
+ struct bkey_cached *ck = NULL;
+ int ret;
+
+ bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_INTENT|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+
+ ret = bch2_btree_iter_traverse(&c_iter);
+ if (ret)
+ goto out;
+
+ ck = (void *) c_iter.path->l[0].b;
+ if (!ck)
+ goto out;
+
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ if (evict)
+ goto evict;
+ goto out;
+ }
+
+ BUG_ON(!ck->valid);
+
+ if (journal_seq && ck->journal.seq != journal_seq)
+ goto out;
+
+ /*
+ * Since journal reclaim depends on us making progress here, and the
+ * allocator/copygc depend on journal reclaim making progress, we need
+ * to be using alloc reserves:
+ */
+ ret = bch2_btree_iter_traverse(&b_iter) ?:
+ bch2_trans_update(trans, &b_iter, ck->k,
+ BTREE_UPDATE_KEY_CACHE_RECLAIM|
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ BTREE_TRIGGER_NORUN) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL|
+ (ck->journal.seq == journal_last_seq(j)
+ ? BCH_WATERMARK_reclaim
+ : 0)|
+ commit_flags);
+
+ bch2_fs_fatal_err_on(ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+ !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
+ !bch2_journal_error(j), c,
+ "error flushing key cache: %s", bch2_err_str(ret));
+ if (ret)
+ goto out;
+
+ bch2_journal_pin_drop(j, &ck->journal);
+
+ BUG_ON(!btree_node_locked(c_iter.path, 0));
+
+ if (!evict) {
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ atomic_long_dec(&c->btree_key_cache.nr_dirty);
+ }
+ } else {
+ struct btree_path *path2;
+evict:
+ trans_for_each_path(trans, path2)
+ if (path2 != c_iter.path)
+ __bch2_btree_path_unlock(trans, path2);
+
+ bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c);
+
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ atomic_long_dec(&c->btree_key_cache.nr_dirty);
+ }
+
+ mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED);
+ bkey_cached_evict(&c->btree_key_cache, ck);
+ bkey_cached_free_fast(&c->btree_key_cache, ck);
+ }
+out:
+ bch2_trans_iter_exit(trans, &b_iter);
+ bch2_trans_iter_exit(trans, &c_iter);
+ return ret;
+}
+
+int bch2_btree_key_cache_journal_flush(struct journal *j,
+ struct journal_entry_pin *pin, u64 seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bkey_cached *ck =
+ container_of(pin, struct bkey_cached, journal);
+ struct bkey_cached_key key;
+ struct btree_trans *trans = bch2_trans_get(c);
+ int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ int ret = 0;
+
+ btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
+ key = ck->key;
+
+ if (ck->journal.seq != seq ||
+ !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ six_unlock_read(&ck->c.lock);
+ goto unlock;
+ }
+
+ if (ck->seq != seq) {
+ bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
+ bch2_btree_key_cache_journal_flush);
+ six_unlock_read(&ck->c.lock);
+ goto unlock;
+ }
+ six_unlock_read(&ck->c.lock);
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ btree_key_cache_flush_pos(trans, key, seq,
+ BTREE_INSERT_JOURNAL_RECLAIM, false));
+unlock:
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+
+ bch2_trans_put(trans);
+ return ret;
+}
+
+/*
+ * Flush and evict a key from the key cache:
+ */
+int bch2_btree_key_cache_flush(struct btree_trans *trans,
+ enum btree_id id, struct bpos pos)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_cached_key key = { id, pos };
+
+ /* Fastpath - assume it won't be found: */
+ if (!bch2_btree_key_cache_find(c, id, pos))
+ return 0;
+
+ return btree_key_cache_flush_pos(trans, key, 0, 0, true);
+}
+
+bool bch2_btree_insert_key_cached(struct btree_trans *trans,
+ unsigned flags,
+ struct btree_insert_entry *insert_entry)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_cached *ck = (void *) insert_entry->path->l[0].b;
+ struct bkey_i *insert = insert_entry->k;
+ bool kick_reclaim = false;
+
+ BUG_ON(insert->k.u64s > ck->u64s);
+
+ bkey_copy(ck->k, insert);
+ ck->valid = true;
+
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+ set_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ atomic_long_inc(&c->btree_key_cache.nr_dirty);
+
+ if (bch2_nr_btree_keys_need_flush(c))
+ kick_reclaim = true;
+ }
+
+ /*
+ * To minimize lock contention, we only add the journal pin here and
+ * defer pin updates to the flush callback via ->seq. Be careful not to
+ * update ->seq on nojournal commits because we don't want to update the
+ * pin to a seq that doesn't include journal updates on disk. Otherwise
+ * we risk losing the update after a crash.
+ *
+ * The only exception is if the pin is not active in the first place. We
+ * have to add the pin because journal reclaim drives key cache
+ * flushing. The flush callback will not proceed unless ->seq matches
+ * the latest pin, so make sure it starts with a consistent value.
+ */
+ if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) ||
+ !journal_pin_active(&ck->journal)) {
+ ck->seq = trans->journal_res.seq;
+ }
+ bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
+ &ck->journal, bch2_btree_key_cache_journal_flush);
+
+ if (kick_reclaim)
+ journal_reclaim_kick(&c->journal);
+ return true;
+}
+
+void bch2_btree_key_cache_drop(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_cached *ck = (void *) path->l[0].b;
+
+ BUG_ON(!ck->valid);
+
+ /*
+ * We just did an update to the btree, bypassing the key cache: the key
+ * cache key is now stale and must be dropped, even if dirty:
+ */
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ atomic_long_dec(&c->btree_key_cache.nr_dirty);
+ bch2_journal_pin_drop(&c->journal, &ck->journal);
+ }
+
+ ck->valid = false;
+}
+
+static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct bch_fs *c = shrink->private_data;
+ struct btree_key_cache *bc = &c->btree_key_cache;
+ struct bucket_table *tbl;
+ struct bkey_cached *ck, *t;
+ size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
+ unsigned start, flags;
+ int srcu_idx;
+
+ mutex_lock(&bc->lock);
+ srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ flags = memalloc_nofs_save();
+
+ /*
+ * Newest freed entries are at the end of the list - once we hit one
+ * that's too new to be freed, we can bail out:
+ */
+ scanned += bc->nr_freed_nonpcpu;
+
+ list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
+ if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
+ ck->btree_trans_barrier_seq))
+ break;
+
+ list_del(&ck->list);
+ six_lock_exit(&ck->c.lock);
+ kmem_cache_free(bch2_key_cache, ck);
+ atomic_long_dec(&bc->nr_freed);
+ freed++;
+ bc->nr_freed_nonpcpu--;
+ }
+
+ if (scanned >= nr)
+ goto out;
+
+ scanned += bc->nr_freed_pcpu;
+
+ list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
+ if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
+ ck->btree_trans_barrier_seq))
+ break;
+
+ list_del(&ck->list);
+ six_lock_exit(&ck->c.lock);
+ kmem_cache_free(bch2_key_cache, ck);
+ atomic_long_dec(&bc->nr_freed);
+ freed++;
+ bc->nr_freed_pcpu--;
+ }
+
+ if (scanned >= nr)
+ goto out;
+
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+ if (bc->shrink_iter >= tbl->size)
+ bc->shrink_iter = 0;
+ start = bc->shrink_iter;
+
+ do {
+ struct rhash_head *pos, *next;
+
+ pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));
+
+ while (!rht_is_a_nulls(pos)) {
+ next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
+ ck = container_of(pos, struct bkey_cached, hash);
+
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
+ goto next;
+
+ if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+ else if (bkey_cached_lock_for_evict(ck)) {
+ bkey_cached_evict(bc, ck);
+ bkey_cached_free(bc, ck);
+ }
+
+ scanned++;
+ if (scanned >= nr)
+ break;
+next:
+ pos = next;
+ }
+
+ bc->shrink_iter++;
+ if (bc->shrink_iter >= tbl->size)
+ bc->shrink_iter = 0;
+ } while (scanned < nr && bc->shrink_iter != start);
+
+ rcu_read_unlock();
+out:
+ memalloc_nofs_restore(flags);
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+ mutex_unlock(&bc->lock);
+
+ return freed;
+}
+
+static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct bch_fs *c = shrink->private_data;
+ struct btree_key_cache *bc = &c->btree_key_cache;
+ long nr = atomic_long_read(&bc->nr_keys) -
+ atomic_long_read(&bc->nr_dirty);
+
+ return max(0L, nr);
+}
+
+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
+{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+ struct bucket_table *tbl;
+ struct bkey_cached *ck, *n;
+ struct rhash_head *pos;
+ LIST_HEAD(items);
+ unsigned i;
+#ifdef __KERNEL__
+ int cpu;
+#endif
+
+ shrinker_free(bc->shrink);
+
+ mutex_lock(&bc->lock);
+
+ /*
+ * The loop is needed to guard against racing with rehash:
+ */
+ while (atomic_long_read(&bc->nr_keys)) {
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+ if (tbl)
+ for (i = 0; i < tbl->size; i++)
+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+ bkey_cached_evict(bc, ck);
+ list_add(&ck->list, &items);
+ }
+ rcu_read_unlock();
+ }
+
+#ifdef __KERNEL__
+ for_each_possible_cpu(cpu) {
+ struct btree_key_cache_freelist *f =
+ per_cpu_ptr(bc->pcpu_freed, cpu);
+
+ for (i = 0; i < f->nr; i++) {
+ ck = f->objs[i];
+ list_add(&ck->list, &items);
+ }
+ }
+#endif
+
+ BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
+ BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
+
+ list_splice(&bc->freed_pcpu, &items);
+ list_splice(&bc->freed_nonpcpu, &items);
+
+ mutex_unlock(&bc->lock);
+
+ list_for_each_entry_safe(ck, n, &items, list) {
+ cond_resched();
+
+ list_del(&ck->list);
+ kfree(ck->k);
+ six_lock_exit(&ck->c.lock);
+ kmem_cache_free(bch2_key_cache, ck);
+ }
+
+ if (atomic_long_read(&bc->nr_dirty) &&
+ !bch2_journal_error(&c->journal) &&
+ test_bit(BCH_FS_WAS_RW, &c->flags))
+ panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
+ atomic_long_read(&bc->nr_dirty));
+
+ if (atomic_long_read(&bc->nr_keys))
+ panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
+ atomic_long_read(&bc->nr_keys));
+
+ if (bc->table_init_done)
+ rhashtable_destroy(&bc->table);
+
+ free_percpu(bc->pcpu_freed);
+}
+
+void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
+{
+ mutex_init(&c->lock);
+ INIT_LIST_HEAD(&c->freed_pcpu);
+ INIT_LIST_HEAD(&c->freed_nonpcpu);
+}
+
+int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
+{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+ struct shrinker *shrink;
+
+#ifdef __KERNEL__
+ bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
+ if (!bc->pcpu_freed)
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+#endif
+
+ if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+
+ bc->table_init_done = true;
+
+ shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
+ if (!shrink)
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ bc->shrink = shrink;
+ shrink->seeks = 0;
+ shrink->count_objects = bch2_btree_key_cache_count;
+ shrink->scan_objects = bch2_btree_key_cache_scan;
+ shrink->private_data = c;
+ shrinker_register(shrink);
+ return 0;
+}
+
+void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
+{
+ prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed));
+ prt_newline(out);
+ prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys));
+ prt_newline(out);
+ prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty));
+ prt_newline(out);
+}
+
+void bch2_btree_key_cache_exit(void)
+{
+ kmem_cache_destroy(bch2_key_cache);
+}
+
+int __init bch2_btree_key_cache_init(void)
+{
+ bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
+ if (!bch2_key_cache)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
new file mode 100644
index 000000000000..be3acde2caa0
--- /dev/null
+++ b/fs/bcachefs/btree_key_cache.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_KEY_CACHE_H
+#define _BCACHEFS_BTREE_KEY_CACHE_H
+
+static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
+{
+ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
+ size_t max_dirty = 1024 + nr_keys / 2;
+
+ return max_t(ssize_t, 0, nr_dirty - max_dirty);
+}
+
+static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
+{
+ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
+ size_t max_dirty = 4096 + (nr_keys * 3) / 4;
+
+ return nr_dirty > max_dirty;
+}
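A worked example of these thresholds, with illustrative numbers:

/*
 * With nr_keys == 100000:
 *   bch2_nr_btree_keys_need_flush() becomes nonzero once nr_dirty
 *   exceeds 1024 + 100000 / 2       == 51024, and
 *   bch2_btree_key_cache_must_wait() triggers once nr_dirty
 *   exceeds 4096 + (100000 * 3) / 4 == 79096,
 * so background flushing starts well before updates are forced to wait.
 */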
+
+int bch2_btree_key_cache_journal_flush(struct journal *,
+ struct journal_entry_pin *, u64);
+
+struct bkey_cached *
+bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
+
+int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *,
+ unsigned);
+
+bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned,
+ struct btree_insert_entry *);
+int bch2_btree_key_cache_flush(struct btree_trans *,
+ enum btree_id, struct bpos);
+void bch2_btree_key_cache_drop(struct btree_trans *,
+ struct btree_path *);
+
+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *);
+void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *);
+int bch2_fs_btree_key_cache_init(struct btree_key_cache *);
+
+void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *);
+
+void bch2_btree_key_cache_exit(void);
+int __init bch2_btree_key_cache_init(void);
+
+#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */
diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h
new file mode 100644
index 000000000000..290e4e57df5b
--- /dev/null
+++ b/fs/bcachefs/btree_key_cache_types.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
+#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
+
+struct btree_key_cache_freelist {
+ struct bkey_cached *objs[16];
+ unsigned nr;
+};
+
+struct btree_key_cache {
+ struct mutex lock;
+ struct rhashtable table;
+ bool table_init_done;
+
+ struct list_head freed_pcpu;
+ size_t nr_freed_pcpu;
+ struct list_head freed_nonpcpu;
+ size_t nr_freed_nonpcpu;
+
+ struct shrinker *shrink;
+ unsigned shrink_iter;
+ struct btree_key_cache_freelist __percpu *pcpu_freed;
+
+ atomic_long_t nr_freed;
+ atomic_long_t nr_keys;
+ atomic_long_t nr_dirty;
+};
+
+struct bkey_cached_key {
+ u32 btree_id;
+ struct bpos pos;
+} __packed __aligned(4);
+
+#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
new file mode 100644
index 000000000000..3d48834d091f
--- /dev/null
+++ b/fs/bcachefs/btree_locking.c
@@ -0,0 +1,817 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_locking.h"
+#include "btree_types.h"
+
+static struct lock_class_key bch2_btree_node_lock_key;
+
+void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
+ enum six_lock_init_flags flags)
+{
+ __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags);
+ lockdep_set_novalidate_class(&b->lock);
+}
+
+#ifdef CONFIG_LOCKDEP
+void bch2_assert_btree_nodes_not_locked(void)
+{
+#if 0
+ // Re-enable when lock_class_is_held() is merged:
+ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
+#endif
+}
+#endif
+
+/* Btree node locking: */
+
+struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
+ struct btree_path *skip,
+ struct btree_bkey_cached_common *b,
+ unsigned level)
+{
+ struct btree_path *path;
+ struct six_lock_count ret;
+
+ memset(&ret, 0, sizeof(ret));
+
+ if (IS_ERR_OR_NULL(b))
+ return ret;
+
+ trans_for_each_path(trans, path)
+ if (path != skip && &path->l[level].b->c == b) {
+ int t = btree_node_locked_type(path, level);
+
+ if (t != BTREE_NODE_UNLOCKED)
+ ret.n[t]++;
+ }
+
+ return ret;
+}
+
+/* unlock */
+
+void bch2_btree_node_unlock_write(struct btree_trans *trans,
+ struct btree_path *path, struct btree *b)
+{
+ bch2_btree_node_unlock_write_inlined(trans, path, b);
+}
+
+/* lock */
+
+/*
+ * @trans wants to lock @b with type @type
+ */
+struct trans_waiting_for_lock {
+ struct btree_trans *trans;
+ struct btree_bkey_cached_common *node_want;
+ enum six_lock_type lock_want;
+
+ /* for iterating over held locks: */
+ u8 path_idx;
+ u8 level;
+ u64 lock_start_time;
+};
+
+struct lock_graph {
+ struct trans_waiting_for_lock g[8];
+ unsigned nr;
+};
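+
+/*
+ * Deadlock detection walks the waits-for graph depth first, starting from the
+ * transaction that failed to take a lock: each lock_graph entry records one
+ * blocked transaction and the lock it is waiting on, and reaching a
+ * transaction that is already in g[] means a cycle has been found, at which
+ * point break_cycle() picks a victim to restart.
+ */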
+
+static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
+{
+ struct trans_waiting_for_lock *i;
+
+ prt_printf(out, "Found lock cycle (%u entries):", g->nr);
+ prt_newline(out);
+
+ for (i = g->g; i < g->g + g->nr; i++)
+ bch2_btree_trans_to_text(out, i->trans);
+}
+
+static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
+{
+ struct trans_waiting_for_lock *i;
+
+ for (i = g->g; i != g->g + g->nr; i++) {
+ if (i != g->g)
+ prt_str(out, "<- ");
+ prt_printf(out, "%u ", i->trans->locking_wait.task->pid);
+ }
+ prt_newline(out);
+}
+
+static void lock_graph_up(struct lock_graph *g)
+{
+ closure_put(&g->g[--g->nr].trans->ref);
+}
+
+static noinline void lock_graph_pop_all(struct lock_graph *g)
+{
+ while (g->nr)
+ lock_graph_up(g);
+}
+
+static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
+{
+ g->g[g->nr++] = (struct trans_waiting_for_lock) {
+ .trans = trans,
+ .node_want = trans->locking,
+ .lock_want = trans->locking_wait.lock_want,
+ };
+}
+
+static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
+{
+ closure_get(&trans->ref);
+ __lock_graph_down(g, trans);
+}
+
+static bool lock_graph_remove_non_waiters(struct lock_graph *g)
+{
+ struct trans_waiting_for_lock *i;
+
+ for (i = g->g + 1; i < g->g + g->nr; i++)
+ if (i->trans->locking != i->node_want ||
+ i->trans->locking_wait.start_time != i[-1].lock_start_time) {
+ while (g->g + g->nr > i)
+ lock_graph_up(g);
+ return true;
+ }
+
+ return false;
+}
+
+static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
+{
+ if (i == g->g) {
+ trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_);
+ return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
+ } else {
+ i->trans->lock_must_abort = true;
+ wake_up_process(i->trans->locking_wait.task);
+ return 0;
+ }
+}
+
+static int btree_trans_abort_preference(struct btree_trans *trans)
+{
+ if (trans->lock_may_not_fail)
+ return 0;
+ if (trans->locking_wait.lock_want == SIX_LOCK_write)
+ return 1;
+ if (!trans->in_traverse_all)
+ return 2;
+ return 3;
+}
+
+static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
+{
+ struct trans_waiting_for_lock *i, *abort = NULL;
+ unsigned best = 0, pref;
+ int ret;
+
+ if (lock_graph_remove_non_waiters(g))
+ return 0;
+
+ /* Only checking, for debugfs: */
+ if (cycle) {
+ print_cycle(cycle, g);
+ ret = -1;
+ goto out;
+ }
+
+ for (i = g->g; i < g->g + g->nr; i++) {
+ pref = btree_trans_abort_preference(i->trans);
+ if (pref > best) {
+ abort = i;
+ best = pref;
+ }
+ }
+
+ if (unlikely(!best)) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
+
+ for (i = g->g; i < g->g + g->nr; i++) {
+ struct btree_trans *trans = i->trans;
+
+ bch2_btree_trans_to_text(&buf, trans);
+
+ prt_printf(&buf, "backtrace:");
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+ bch2_prt_task_backtrace(&buf, trans->locking_wait.task);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+ }
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ BUG();
+ }
+
+ ret = abort_lock(g, abort);
+out:
+ if (ret)
+ while (g->nr)
+ lock_graph_up(g);
+ return ret;
+}
+
+static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
+ struct printbuf *cycle)
+{
+ struct btree_trans *orig_trans = g->g->trans;
+ struct trans_waiting_for_lock *i;
+
+ for (i = g->g; i < g->g + g->nr; i++)
+ if (i->trans == trans) {
+ closure_put(&trans->ref);
+ return break_cycle(g, cycle);
+ }
+
+ if (g->nr == ARRAY_SIZE(g->g)) {
+ closure_put(&trans->ref);
+
+ if (orig_trans->lock_may_not_fail)
+ return 0;
+
+ while (g->nr)
+ lock_graph_up(g);
+
+ if (cycle)
+ return 0;
+
+ trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
+ return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
+ }
+
+ __lock_graph_down(g, trans);
+ return 0;
+}
+
+static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
+{
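+ /* read = 0, intent = 1, write = 2: only read/read and read/intent do not conflict */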
+ return t1 + t2 > 1;
+}
+
+int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
+{
+ struct lock_graph g;
+ struct trans_waiting_for_lock *top;
+ struct btree_bkey_cached_common *b;
+ struct btree_path *path;
+ unsigned path_idx;
+ int ret;
+
+ if (trans->lock_must_abort) {
+ if (cycle)
+ return -1;
+
+ trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
+ }
+
+ g.nr = 0;
+ lock_graph_down(&g, trans);
+next:
+ if (!g.nr)
+ return 0;
+
+ top = &g.g[g.nr - 1];
+
+ trans_for_each_path_safe_from(top->trans, path, path_idx, top->path_idx) {
+ if (!path->nodes_locked)
+ continue;
+
+ if (path_idx != top->path_idx) {
+ top->path_idx = path_idx;
+ top->level = 0;
+ top->lock_start_time = 0;
+ }
+
+ for (;
+ top->level < BTREE_MAX_DEPTH;
+ top->level++, top->lock_start_time = 0) {
+ int lock_held = btree_node_locked_type(path, top->level);
+
+ if (lock_held == BTREE_NODE_UNLOCKED)
+ continue;
+
+ b = &READ_ONCE(path->l[top->level].b)->c;
+
+ if (IS_ERR_OR_NULL(b)) {
+ /*
+ * If we get here, it means we raced with the
+ * other thread updating its btree_path
+ * structures - which means it can't be blocked
+ * waiting on a lock:
+ */
+ if (!lock_graph_remove_non_waiters(&g)) {
+ /*
+ * If lock_graph_remove_non_waiters()
+ * didn't do anything, it must be
+ * because we're being called by debugfs
+ * checking for lock cycles, which
+ * invokes us on btree_transactions that
+ * aren't actually waiting on anything.
+ * Just bail out:
+ */
+ lock_graph_pop_all(&g);
+ }
+
+ goto next;
+ }
+
+ if (list_empty_careful(&b->lock.wait_list))
+ continue;
+
+ raw_spin_lock(&b->lock.wait_lock);
+ list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
+ BUG_ON(b != trans->locking);
+
+ if (top->lock_start_time &&
+ time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
+ continue;
+
+ top->lock_start_time = trans->locking_wait.start_time;
+
+ /* Don't check for self deadlock: */
+ if (trans == top->trans ||
+ !lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
+ continue;
+
+ closure_get(&trans->ref);
+ raw_spin_unlock(&b->lock.wait_lock);
+
+ ret = lock_graph_descend(&g, trans, cycle);
+ if (ret)
+ return ret;
+ goto next;
+ }
+ raw_spin_unlock(&b->lock.wait_lock);
+ }
+ }
+
+ if (g.nr > 1 && cycle)
+ print_chain(cycle, &g);
+ lock_graph_up(&g);
+ goto next;
+}
+
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
+{
+ struct btree_trans *trans = p;
+
+ return bch2_check_for_deadlock(trans, NULL);
+}
+
+int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path,
+ struct btree_bkey_cached_common *b,
+ bool lock_may_not_fail)
+{
+ int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
+ int ret;
+
+ /*
+ * Must drop our read locks before calling six_lock_write() -
+ * six_unlock() won't do wakeups until the reader count
+ * goes to 0, and it's safe because we have the node intent
+ * locked:
+ */
+ six_lock_readers_add(&b->lock, -readers);
+ ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write,
+ lock_may_not_fail, _RET_IP_);
+ six_lock_readers_add(&b->lock, readers);
+
+ if (ret)
+ mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED);
+
+ return ret;
+}
+
+void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b)
+{
+ struct btree_path *linked;
+ unsigned i;
+ int ret;
+
+ /*
+ * XXX BIG FAT NOTICE
+ *
+ * Drop all read locks before taking a write lock:
+ *
+ * This is a hack, because bch2_btree_node_lock_write_nofail() is a
+ * hack - but by dropping read locks first, this should never fail, and
+ * we only use this in code paths where whatever read locks we've
+ * already taken are no longer needed:
+ */
+
+ trans_for_each_path(trans, linked) {
+ if (!linked->nodes_locked)
+ continue;
+
+ for (i = 0; i < BTREE_MAX_DEPTH; i++)
+ if (btree_node_read_locked(linked, i)) {
+ btree_node_unlock(trans, linked, i);
+ btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK);
+ }
+ }
+
+ ret = __btree_node_lock_write(trans, path, b, true);
+ BUG_ON(ret);
+}
+
+/* relock */
+
+static inline bool btree_path_get_locks(struct btree_trans *trans,
+ struct btree_path *path,
+ bool upgrade,
+ struct get_locks_fail *f)
+{
+ unsigned l = path->level;
+ int fail_idx = -1;
+
+ do {
+ if (!btree_path_node(path, l))
+ break;
+
+ if (!(upgrade
+ ? bch2_btree_node_upgrade(trans, path, l)
+ : bch2_btree_node_relock(trans, path, l))) {
+ fail_idx = l;
+
+ if (f) {
+ f->l = l;
+ f->b = path->l[l].b;
+ }
+ }
+
+ l++;
+ } while (l < path->locks_want);
+
+ /*
+	 * When we fail to get a lock, we have to ensure that any child nodes
+	 * can't be relocked, so bch2_btree_path_traverse() has to walk back up
+	 * to the node that we failed to relock:
+ */
+ if (fail_idx >= 0) {
+ __bch2_btree_path_unlock(trans, path);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+
+ do {
+ path->l[fail_idx].b = upgrade
+ ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
+ : ERR_PTR(-BCH_ERR_no_btree_node_relock);
+ --fail_idx;
+ } while (fail_idx >= 0);
+ }
+
+ if (path->uptodate == BTREE_ITER_NEED_RELOCK)
+ path->uptodate = BTREE_ITER_UPTODATE;
+
+ bch2_trans_verify_locks(trans);
+
+ return path->uptodate < BTREE_ITER_NEED_RELOCK;
+}
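+
+/*
+ * For illustration: with path->level == 0 and locks_want == 3, if relocking
+ * succeeds at levels 0 and 2 but fails at level 1, fail_idx ends up as 1, the
+ * whole path is unlocked, and l[0].b and l[1].b are set to error pointers -
+ * so the next traversal re-descends from level 2 instead of trusting the
+ * now-unverified child pointers below it.
+ */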
+
+bool __bch2_btree_node_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned level,
+ bool trace)
+{
+ struct btree *b = btree_path_node(path, level);
+ int want = __btree_lock_want(path, level);
+
+ if (race_fault())
+ goto fail;
+
+ if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
+ (btree_node_lock_seq_matches(path, b, level) &&
+ btree_node_lock_increment(trans, &b->c, level, want))) {
+ mark_btree_node_locked(trans, path, level, want);
+ return true;
+ }
+fail:
+ if (trace && !trans->notrace_relock_fail)
+ trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level);
+ return false;
+}
+
+/* upgrade */
+
+bool bch2_btree_node_upgrade(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+ struct btree *b = path->l[level].b;
+ struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level);
+
+ if (!is_btree_node(path, level))
+ return false;
+
+ switch (btree_lock_want(path, level)) {
+ case BTREE_NODE_UNLOCKED:
+ BUG_ON(btree_node_locked(path, level));
+ return true;
+ case BTREE_NODE_READ_LOCKED:
+ BUG_ON(btree_node_intent_locked(path, level));
+ return bch2_btree_node_relock(trans, path, level);
+ case BTREE_NODE_INTENT_LOCKED:
+ break;
+ case BTREE_NODE_WRITE_LOCKED:
+ BUG();
+ }
+
+ if (btree_node_intent_locked(path, level))
+ return true;
+
+ if (race_fault())
+ return false;
+
+ if (btree_node_locked(path, level)) {
+ bool ret;
+
+ six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]);
+ ret = six_lock_tryupgrade(&b->c.lock);
+ six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]);
+
+ if (ret)
+ goto success;
+ } else {
+ if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
+ goto success;
+ }
+
+ /*
+ * Do we already have an intent lock via another path? If so, just bump
+ * lock count:
+ */
+ if (btree_node_lock_seq_matches(path, b, level) &&
+ btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) {
+ btree_node_unlock(trans, path, level);
+ goto success;
+ }
+
+ trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level);
+ return false;
+success:
+ mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
+ return true;
+}
+
+/* Btree path locking: */
+
+/*
+ * Only for btree_cache.c - only relocks intent locks
+ */
+int bch2_btree_path_relock_intent(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ unsigned l;
+
+ for (l = path->level;
+ l < path->locks_want && btree_path_node(path, l);
+ l++) {
+ if (!bch2_btree_node_relock(trans, path, l)) {
+ __bch2_btree_path_unlock(trans, path);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
+ }
+ }
+
+ return 0;
+}
+
+__flatten
+bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
+ struct btree_path *path, unsigned long trace_ip)
+{
+ struct get_locks_fail f;
+
+ return btree_path_get_locks(trans, path, false, &f);
+}
+
+int __bch2_btree_path_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned long trace_ip)
+{
+ if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) {
+ trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
+ }
+
+ return 0;
+}
+
+bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want,
+ struct get_locks_fail *f)
+{
+ EBUG_ON(path->locks_want >= new_locks_want);
+
+ path->locks_want = new_locks_want;
+
+ return btree_path_get_locks(trans, path, true, f);
+}
+
+bool __bch2_btree_path_upgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want,
+ struct get_locks_fail *f)
+{
+ struct btree_path *linked;
+
+ if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f))
+ return true;
+
+ /*
+ * XXX: this is ugly - we'd prefer to not be mucking with other
+ * iterators in the btree_trans here.
+ *
+ * On failure to upgrade the iterator, setting iter->locks_want and
+ * calling get_locks() is sufficient to make bch2_btree_path_traverse()
+ * get the locks we want on transaction restart.
+ *
+ * But if this iterator was a clone, on transaction restart what we did
+ * to this iterator isn't going to be preserved.
+ *
+ * Possibly we could add an iterator field for the parent iterator when
+ * an iterator is a copy - for now, we'll just upgrade any other
+ * iterators with the same btree id.
+ *
+ * The code below used to be needed to ensure ancestor nodes get locked
+ * before interior nodes - now that's handled by
+ * bch2_btree_path_traverse_all().
+ */
+ if (!path->cached && !trans->in_traverse_all)
+ trans_for_each_path(trans, linked)
+ if (linked != path &&
+ linked->cached == path->cached &&
+ linked->btree_id == path->btree_id &&
+ linked->locks_want < new_locks_want) {
+ linked->locks_want = new_locks_want;
+ btree_path_get_locks(trans, linked, true, NULL);
+ }
+
+ return false;
+}
+
+void __bch2_btree_path_downgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
+{
+ unsigned l;
+
+ if (trans->restarted)
+ return;
+
+ EBUG_ON(path->locks_want < new_locks_want);
+
+ path->locks_want = new_locks_want;
+
+ while (path->nodes_locked &&
+ (l = btree_path_highest_level_locked(path)) >= path->locks_want) {
+ if (l > path->level) {
+ btree_node_unlock(trans, path, l);
+ } else {
+ if (btree_node_intent_locked(path, l)) {
+ six_lock_downgrade(&path->l[l].b->c.lock);
+ mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED);
+ }
+ break;
+ }
+ }
+
+ bch2_btree_path_verify_locks(path);
+
+ path->downgrade_seq++;
+ trace_path_downgrade(trans, _RET_IP_, path);
+}
+
+/* Btree transaction locking: */
+
+void bch2_trans_downgrade(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ if (trans->restarted)
+ return;
+
+ trans_for_each_path(trans, path)
+ bch2_btree_path_downgrade(trans, path);
+}
+
+int bch2_trans_relock(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ if (unlikely(trans->restarted))
+ return -((int) trans->restarted);
+
+ trans_for_each_path(trans, path)
+ if (path->should_be_locked &&
+ !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
+ trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+ }
+ return 0;
+}
+
+int bch2_trans_relock_notrace(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ if (unlikely(trans->restarted))
+ return -((int) trans->restarted);
+
+ trans_for_each_path(trans, path)
+ if (path->should_be_locked &&
+ !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+ }
+ return 0;
+}
+
+void bch2_trans_unlock_noassert(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ __bch2_btree_path_unlock(trans, path);
+}
+
+void bch2_trans_unlock(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ __bch2_btree_path_unlock(trans, path);
+}
+
+void bch2_trans_unlock_long(struct btree_trans *trans)
+{
+ bch2_trans_unlock(trans);
+ bch2_trans_srcu_unlock(trans);
+}
+
+bool bch2_trans_locked(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ if (path->nodes_locked)
+ return true;
+ return false;
+}
+
+int __bch2_trans_mutex_lock(struct btree_trans *trans,
+ struct mutex *lock)
+{
+ int ret = drop_locks_do(trans, (mutex_lock(lock), 0));
+
+ if (ret)
+ mutex_unlock(lock);
+ return ret;
+}
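+
+/*
+ * Note on the error path above: drop_locks_do() unlocks the transaction, runs
+ * the expression (mutex_lock() here, contributing 0), then relocks.  A nonzero
+ * return therefore means the relock failed with a transaction restart, and
+ * since the caller won't proceed we must not return with the mutex held.
+ */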
+
+/* Debug */
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void bch2_btree_path_verify_locks(struct btree_path *path)
+{
+ unsigned l;
+
+ if (!path->nodes_locked) {
+ BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
+ btree_path_node(path, path->level));
+ return;
+ }
+
+ for (l = 0; l < BTREE_MAX_DEPTH; l++) {
+ int want = btree_lock_want(path, l);
+ int have = btree_node_locked_type(path, l);
+
+ BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);
+
+ BUG_ON(is_btree_node(path, l) &&
+ (want == BTREE_NODE_UNLOCKED ||
+ have != BTREE_NODE_WRITE_LOCKED) &&
+ want != have);
+ }
+}
+
+void bch2_trans_verify_locks(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ bch2_btree_path_verify_locks(path);
+}
+
+#endif
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
new file mode 100644
index 000000000000..11b0a2c8cd69
--- /dev/null
+++ b/fs/bcachefs/btree_locking.h
@@ -0,0 +1,433 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_LOCKING_H
+#define _BCACHEFS_BTREE_LOCKING_H
+
+/*
+ * Only for internal btree use:
+ *
+ * The btree iterator tracks what locks it wants to take, and what locks it
+ * currently has - here we have wrappers for locking/unlocking btree nodes and
+ * updating the iterator state
+ */
+
+#include "btree_iter.h"
+#include "six.h"
+
+void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags);
+
+#ifdef CONFIG_LOCKDEP
+void bch2_assert_btree_nodes_not_locked(void);
+#else
+static inline void bch2_assert_btree_nodes_not_locked(void) {}
+#endif
+
+void bch2_trans_unlock_noassert(struct btree_trans *);
+
+static inline bool is_btree_node(struct btree_path *path, unsigned l)
+{
+ return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b);
+}
+
+static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans)
+{
+ return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats)
+ ? &trans->c->btree_transaction_stats[trans->fn_idx]
+ : NULL;
+}
+
+/* matches six lock types */
+enum btree_node_locked_type {
+ BTREE_NODE_UNLOCKED = -1,
+ BTREE_NODE_READ_LOCKED = SIX_LOCK_read,
+ BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent,
+ BTREE_NODE_WRITE_LOCKED = SIX_LOCK_write,
+};
+
+static inline int btree_node_locked_type(struct btree_path *path,
+ unsigned level)
+{
+ return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3);
+}
+
+static inline bool btree_node_write_locked(struct btree_path *path, unsigned l)
+{
+ return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED;
+}
+
+static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l)
+{
+ return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED;
+}
+
+static inline bool btree_node_read_locked(struct btree_path *path, unsigned l)
+{
+ return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED;
+}
+
+static inline bool btree_node_locked(struct btree_path *path, unsigned level)
+{
+ return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED;
+}
+
+static inline void mark_btree_node_locked_noreset(struct btree_path *path,
+ unsigned level,
+ enum btree_node_locked_type type)
+{
+ /* relying on this to avoid a branch */
+ BUILD_BUG_ON(SIX_LOCK_read != 0);
+ BUILD_BUG_ON(SIX_LOCK_intent != 1);
+
+ path->nodes_locked &= ~(3U << (level << 1));
+ path->nodes_locked |= (type + 1) << (level << 1);
+}
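+
+/*
+ * Worked example of the nodes_locked encoding used above - two bits per
+ * level, holding (lock type + 1): 0 = unlocked, 1 = read, 2 = intent,
+ * 3 = write.  A path with a read lock at level 0 and an intent lock at
+ * level 1 has
+ *
+ *	nodes_locked = ((SIX_LOCK_read + 1) << 0)|((SIX_LOCK_intent + 1) << 2)
+ *		     = 0b1001
+ *
+ * and btree_node_locked_type() recovers the lock type for each level by
+ * shifting and masking.
+ */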
+
+static inline void mark_btree_node_unlocked(struct btree_path *path,
+ unsigned level)
+{
+ EBUG_ON(btree_node_write_locked(path, level));
+ mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
+}
+
+static inline void mark_btree_node_locked(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned level,
+ enum btree_node_locked_type type)
+{
+ mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type);
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ path->l[level].lock_taken_time = local_clock();
+#endif
+}
+
+static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
+{
+ return level < path->locks_want
+ ? SIX_LOCK_intent
+ : SIX_LOCK_read;
+}
+
+static inline enum btree_node_locked_type
+btree_lock_want(struct btree_path *path, int level)
+{
+ if (level < path->level)
+ return BTREE_NODE_UNLOCKED;
+ if (level < path->locks_want)
+ return BTREE_NODE_INTENT_LOCKED;
+ if (level == path->level)
+ return BTREE_NODE_READ_LOCKED;
+ return BTREE_NODE_UNLOCKED;
+}
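+
+/*
+ * For illustration: with path->level == 0 and locks_want == 2,
+ * btree_lock_want() returns INTENT for levels 0 and 1 and UNLOCKED above
+ * that; with locks_want == 0 it returns READ for level 0 (the level the path
+ * iterates at) and UNLOCKED everywhere else.
+ */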
+
+static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+
+ if (s)
+ __bch2_time_stats_update(&s->lock_hold_times,
+ path->l[level].lock_taken_time,
+ local_clock());
+#endif
+}
+
+/* unlock: */
+
+static inline void btree_node_unlock(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+ int lock_type = btree_node_locked_type(path, level);
+
+ EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+ if (lock_type != BTREE_NODE_UNLOCKED) {
+ six_unlock_type(&path->l[level].b->c.lock, lock_type);
+ btree_trans_lock_hold_time_update(trans, path, level);
+ }
+ mark_btree_node_unlocked(path, level);
+}
+
+static inline int btree_path_lowest_level_locked(struct btree_path *path)
+{
+ return __ffs(path->nodes_locked) >> 1;
+}
+
+static inline int btree_path_highest_level_locked(struct btree_path *path)
+{
+ return __fls(path->nodes_locked) >> 1;
+}
+
+static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK);
+
+ while (path->nodes_locked)
+ btree_node_unlock(trans, path, btree_path_lowest_level_locked(path));
+}
+
+/*
+ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
+ * succeed:
+ */
+static inline void
+bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
+ struct btree *b)
+{
+ struct btree_path *linked;
+
+ EBUG_ON(path->l[b->c.level].b != b);
+ EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock));
+ EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write);
+
+ mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
+
+ trans_for_each_path_with_node(trans, b, linked)
+ linked->l[b->c.level].lock_seq++;
+
+ six_unlock_write(&b->c.lock);
+}
+
+void bch2_btree_node_unlock_write(struct btree_trans *,
+ struct btree_path *, struct btree *);
+
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
+
+/* lock: */
+
+static inline int __btree_node_lock_nopath(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ enum six_lock_type type,
+ bool lock_may_not_fail,
+ unsigned long ip)
+{
+ int ret;
+
+ trans->lock_may_not_fail = lock_may_not_fail;
+ trans->lock_must_abort = false;
+ trans->locking = b;
+
+ ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
+ bch2_six_check_for_deadlock, trans, ip);
+ WRITE_ONCE(trans->locking, NULL);
+ WRITE_ONCE(trans->locking_wait.start_time, 0);
+ return ret;
+}
+
+static inline int __must_check
+btree_node_lock_nopath(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ enum six_lock_type type,
+ unsigned long ip)
+{
+ return __btree_node_lock_nopath(trans, b, type, false, ip);
+}
+
+static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ enum six_lock_type type)
+{
+ int ret = __btree_node_lock_nopath(trans, b, type, true, _THIS_IP_);
+
+ BUG_ON(ret);
+}
+
+/*
+ * Lock a btree node if we already have it locked on one of our linked
+ * iterators:
+ */
+static inline bool btree_node_lock_increment(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ unsigned level,
+ enum btree_node_locked_type want)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ if (&path->l[level].b->c == b &&
+ btree_node_locked_type(path, level) >= want) {
+ six_lock_increment(&b->lock, (enum six_lock_type) want);
+ return true;
+ }
+
+ return false;
+}
+
+static inline int btree_node_lock(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b,
+ unsigned level,
+ enum six_lock_type type,
+ unsigned long ip)
+{
+ int ret = 0;
+
+ EBUG_ON(level >= BTREE_MAX_DEPTH);
+ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
+
+ if (likely(six_trylock_type(&b->lock, type)) ||
+ btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
+ !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) {
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ path->l[b->level].lock_taken_time = local_clock();
+#endif
+ }
+
+ return ret;
+}
+
+int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *,
+ struct btree_bkey_cached_common *b, bool);
+
+static inline int __btree_node_lock_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b,
+ bool lock_may_not_fail)
+{
+ EBUG_ON(&path->l[b->level].b->c != b);
+ EBUG_ON(path->l[b->level].lock_seq != six_lock_seq(&b->lock));
+ EBUG_ON(!btree_node_intent_locked(path, b->level));
+
+ /*
+ * six locks are unfair, and read locks block while a thread wants a
+ * write lock: thus, we need to tell the cycle detector we have a write
+ * lock _before_ taking the lock:
+ */
+ mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_WRITE_LOCKED);
+
+ return likely(six_trylock_write(&b->lock))
+ ? 0
+ : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail);
+}
+
+static inline int __must_check
+bch2_btree_node_lock_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b)
+{
+ return __btree_node_lock_write(trans, path, b, false);
+}
+
+void bch2_btree_node_lock_write_nofail(struct btree_trans *,
+ struct btree_path *,
+ struct btree_bkey_cached_common *);
+
+/* relock: */
+
+bool bch2_btree_path_relock_norestart(struct btree_trans *,
+ struct btree_path *, unsigned long);
+int __bch2_btree_path_relock(struct btree_trans *,
+ struct btree_path *, unsigned long);
+
+static inline int bch2_btree_path_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned long trace_ip)
+{
+ return btree_node_locked(path, path->level)
+ ? 0
+ : __bch2_btree_path_relock(trans, path, trace_ip);
+}
+
+bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace);
+
+static inline bool bch2_btree_node_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+ EBUG_ON(btree_node_locked(path, level) &&
+ !btree_node_write_locked(path, level) &&
+ btree_node_locked_type(path, level) != __btree_lock_want(path, level));
+
+ return likely(btree_node_locked(path, level)) ||
+ (!IS_ERR_OR_NULL(path->l[level].b) &&
+ __bch2_btree_node_relock(trans, path, level, true));
+}
+
+static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+ EBUG_ON(btree_node_locked(path, level) &&
+ !btree_node_write_locked(path, level) &&
+ btree_node_locked_type(path, level) != __btree_lock_want(path, level));
+
+ return likely(btree_node_locked(path, level)) ||
+ (!IS_ERR_OR_NULL(path->l[level].b) &&
+ __bch2_btree_node_relock(trans, path, level, false));
+}
+
+/* upgrade */
+
+struct get_locks_fail {
+ unsigned l;
+ struct btree *b;
+};
+
+bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
+ struct btree_path *, unsigned,
+ struct get_locks_fail *);
+
+bool __bch2_btree_path_upgrade(struct btree_trans *,
+ struct btree_path *, unsigned,
+ struct get_locks_fail *);
+
+static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
+{
+ struct get_locks_fail f;
+ unsigned old_locks_want = path->locks_want;
+
+ new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
+
+ if (path->locks_want < new_locks_want
+ ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f)
+ : path->uptodate == BTREE_ITER_UPTODATE)
+ return 0;
+
+ trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
+ old_locks_want, new_locks_want, &f);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
+}
+
+/* misc: */
+
+static inline void btree_path_set_should_be_locked(struct btree_path *path)
+{
+ EBUG_ON(!btree_node_locked(path, path->level));
+ EBUG_ON(path->uptodate);
+
+ path->should_be_locked = true;
+}
+
+static inline void __btree_path_set_level_up(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned l)
+{
+ btree_node_unlock(trans, path, l);
+ path->l[l].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
+}
+
+static inline void btree_path_set_level_up(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ __btree_path_set_level_up(trans, path, path->level++);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+}
+
+/* debug */
+
+struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *,
+ struct btree_path *,
+ struct btree_bkey_cached_common *b,
+ unsigned);
+
+int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_btree_path_verify_locks(struct btree_path *);
+void bch2_trans_verify_locks(struct btree_trans *);
+#else
+static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
+static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
+#endif
+
+#endif /* _BCACHEFS_BTREE_LOCKING_H */
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
new file mode 100644
index 000000000000..12907beda98c
--- /dev/null
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -0,0 +1,1162 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_gc.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_journal_iter.h"
+#include "btree_key_cache.h"
+#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "errcode.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "replicas.h"
+#include "snapshot.h"
+
+#include <linux/prefetch.h>
+
+static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct bch_fs *c = trans->c;
+ struct bkey u;
+ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u);
+
+ if (unlikely(trans->journal_replay_not_finished)) {
+ struct bkey_i *j_k =
+ bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p);
+
+ if (j_k)
+ k = bkey_i_to_s_c(j_k);
+ }
+
+ u = *k.k;
+ u.needs_whiteout = i->old_k.needs_whiteout;
+
+ BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey)));
+ BUG_ON(i->old_v != k.v);
+#endif
+}
+
+static inline struct btree_path_level *insert_l(struct btree_insert_entry *i)
+{
+ return i->path->l + i->level;
+}
+
+static inline bool same_leaf_as_prev(struct btree_trans *trans,
+ struct btree_insert_entry *i)
+{
+ return i != trans->updates &&
+ insert_l(&i[0])->b == insert_l(&i[-1])->b;
+}
+
+static inline bool same_leaf_as_next(struct btree_trans *trans,
+ struct btree_insert_entry *i)
+{
+ return i + 1 < trans->updates + trans->nr_updates &&
+ insert_l(&i[0])->b == insert_l(&i[1])->b;
+}
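+
+/*
+ * trans->updates is kept sorted, so updates that land in the same leaf node
+ * are adjacent; same_leaf_as_prev()/same_leaf_as_next() are what let the
+ * commit path take and release a single write lock per leaf by skipping
+ * entries whose neighbour already hit the same node.
+ */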
+
+inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
+{
+ struct bch_fs *c = trans->c;
+
+ if (unlikely(btree_node_just_written(b)) &&
+ bch2_btree_post_write_cleanup(c, b))
+ bch2_trans_node_reinit_iter(trans, b);
+
+ /*
+ * If the last bset has been written, or if it's gotten too big - start
+ * a new bset to insert into:
+ */
+ if (want_new_bset(c, b))
+ bch2_btree_init_next(trans, b);
+}
+
+static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
+{
+ while (--i >= trans->updates) {
+ if (same_leaf_as_prev(trans, i))
+ continue;
+
+ bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
+ }
+
+ trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
+}
+
+static inline int bch2_trans_lock_write(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i;
+
+ EBUG_ON(trans->write_locked);
+
+ trans_for_each_update(trans, i) {
+ if (same_leaf_as_prev(trans, i))
+ continue;
+
+ if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
+ return trans_lock_write_fail(trans, i);
+
+ if (!i->cached)
+ bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
+ }
+
+ trans->write_locked = true;
+ return 0;
+}
+
+static inline void bch2_trans_unlock_write(struct btree_trans *trans)
+{
+ if (likely(trans->write_locked)) {
+ struct btree_insert_entry *i;
+
+ trans_for_each_update(trans, i)
+ if (!same_leaf_as_prev(trans, i))
+ bch2_btree_node_unlock_write_inlined(trans, i->path,
+ insert_l(i)->b);
+ trans->write_locked = false;
+ }
+}
+
+/* Inserting into a given leaf node (last stage of insert): */
+
+/* Handle overwrites and do insert, for non extents: */
+bool bch2_btree_bset_insert_key(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bkey_i *insert)
+{
+ struct bkey_packed *k;
+ unsigned clobber_u64s = 0, new_u64s = 0;
+
+ EBUG_ON(btree_node_just_written(b));
+ EBUG_ON(bset_written(b, btree_bset_last(b)));
+ EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
+ EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
+ EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
+ EBUG_ON(insert->k.u64s >
+ bch_btree_keys_u64s_remaining(trans->c, b));
+ EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
+
+ k = bch2_btree_node_iter_peek_all(node_iter, b);
+ if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
+ k = NULL;
+
+ /* @k is the key being overwritten/deleted, if any: */
+ EBUG_ON(k && bkey_deleted(k));
+
+ /* Deleting, but not found? nothing to do: */
+ if (bkey_deleted(&insert->k) && !k)
+ return false;
+
+ if (bkey_deleted(&insert->k)) {
+ /* Deleting: */
+ btree_account_key_drop(b, k);
+ k->type = KEY_TYPE_deleted;
+
+ if (k->needs_whiteout)
+ push_whiteout(trans->c, b, insert->k.p);
+ k->needs_whiteout = false;
+
+ if (k >= btree_bset_last(b)->start) {
+ clobber_u64s = k->u64s;
+ bch2_bset_delete(b, k, clobber_u64s);
+ goto fix_iter;
+ } else {
+ bch2_btree_path_fix_key_modified(trans, b, k);
+ }
+
+ return true;
+ }
+
+ if (k) {
+ /* Overwriting: */
+ btree_account_key_drop(b, k);
+ k->type = KEY_TYPE_deleted;
+
+ insert->k.needs_whiteout = k->needs_whiteout;
+ k->needs_whiteout = false;
+
+ if (k >= btree_bset_last(b)->start) {
+ clobber_u64s = k->u64s;
+ goto overwrite;
+ } else {
+ bch2_btree_path_fix_key_modified(trans, b, k);
+ }
+ }
+
+ k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
+overwrite:
+ bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
+ new_u64s = k->u64s;
+fix_iter:
+ if (clobber_u64s != new_u64s)
+ bch2_btree_node_iter_fix(trans, path, b, node_iter, k,
+ clobber_u64s, new_u64s);
+ return true;
+}
+
+static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
+ unsigned i, u64 seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct btree_write *w = container_of(pin, struct btree_write, journal);
+ struct btree *b = container_of(w, struct btree, writes[i]);
+ struct btree_trans *trans = bch2_trans_get(c);
+ unsigned long old, new, v;
+ unsigned idx = w - b->writes;
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+ v = READ_ONCE(b->flags);
+
+ do {
+ old = new = v;
+
+ if (!(old & (1 << BTREE_NODE_dirty)) ||
+ !!(old & (1 << BTREE_NODE_write_idx)) != idx ||
+ w->journal.seq != seq)
+ break;
+
+ new &= ~BTREE_WRITE_TYPE_MASK;
+ new |= BTREE_WRITE_journal_reclaim;
+ new |= 1 << BTREE_NODE_need_write;
+ } while ((v = cmpxchg(&b->flags, old, new)) != old);
+
+ btree_node_write_if_need(c, b, SIX_LOCK_read);
+ six_unlock_read(&b->c.lock);
+
+ bch2_trans_put(trans);
+ return 0;
+}
+
+int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+{
+ return __btree_node_flush(j, pin, 0, seq);
+}
+
+int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+{
+ return __btree_node_flush(j, pin, 1, seq);
+}
+
+inline void bch2_btree_add_journal_pin(struct bch_fs *c,
+ struct btree *b, u64 seq)
+{
+ struct btree_write *w = btree_current_write(b);
+
+ bch2_journal_pin_add(&c->journal, seq, &w->journal,
+ btree_node_write_idx(b) == 0
+ ? bch2_btree_node_flush0
+ : bch2_btree_node_flush1);
+}
+
+/**
+ * bch2_btree_insert_key_leaf() - insert a key into a leaf node
+ * @trans: btree transaction object
+ * @path: path pointing to @insert's pos
+ * @insert: key to insert
+ * @journal_seq: sequence number of journal reservation
+ */
+inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
+ struct btree_path *path,
+ struct bkey_i *insert,
+ u64 journal_seq)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b = path_l(path)->b;
+ struct bset_tree *t = bset_tree_last(b);
+ struct bset *i = bset(b, t);
+ int old_u64s = bset_u64s(t);
+ int old_live_u64s = b->nr.live_u64s;
+ int live_u64s_added, u64s_added;
+
+ if (unlikely(!bch2_btree_bset_insert_key(trans, path, b,
+ &path_l(path)->iter, insert)))
+ return;
+
+ i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq)));
+
+ bch2_btree_add_journal_pin(c, b, journal_seq);
+
+ if (unlikely(!btree_node_dirty(b))) {
+ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+ set_btree_node_dirty_acct(c, b);
+ }
+
+ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
+ u64s_added = (int) bset_u64s(t) - old_u64s;
+
+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
+
+ if (u64s_added > live_u64s_added &&
+ bch2_maybe_compact_whiteouts(c, b))
+ bch2_trans_node_reinit_iter(trans, b);
+}
+
+/* Cached btree updates: */
+
+/* Normal update interface: */
+
+static inline void btree_insert_entry_checks(struct btree_trans *trans,
+ struct btree_insert_entry *i)
+{
+ BUG_ON(!bpos_eq(i->k->k.p, i->path->pos));
+ BUG_ON(i->cached != i->path->cached);
+ BUG_ON(i->level != i->path->level);
+ BUG_ON(i->btree_id != i->path->btree_id);
+ EBUG_ON(!i->level &&
+ btree_type_has_snapshots(i->btree_id) &&
+ !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
+ test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
+ i->k->k.p.snapshot &&
+ bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
+}
+
+static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
+ unsigned flags)
+{
+ return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
+ trans->journal_u64s, flags);
+}
+
+#define JSET_ENTRY_LOG_U64s 4
+
+static noinline void journal_transaction_name(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct journal *j = &c->journal;
+ struct jset_entry *entry =
+ bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_log, 0, 0,
+ JSET_ENTRY_LOG_U64s);
+ struct jset_entry_log *l =
+ container_of(entry, struct jset_entry_log, entry);
+
+ strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
+}
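+
+/*
+ * The log entry above reserves JSET_ENTRY_LOG_U64s (4) u64s, i.e. 32 bytes,
+ * for the transaction's function name; longer names are simply truncated by
+ * strncpy().
+ */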
+
+static inline int btree_key_can_insert(struct btree_trans *trans,
+ struct btree *b, unsigned u64s)
+{
+ struct bch_fs *c = trans->c;
+
+ if (!bch2_btree_node_insert_fits(c, b, u64s))
+ return -BCH_ERR_btree_insert_btree_node_full;
+
+ return 0;
+}
+
+noinline static int
+btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
+ struct btree_path *path, unsigned new_u64s)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i;
+ struct bkey_cached *ck = (void *) path->l[0].b;
+ struct bkey_i *new_k;
+ int ret;
+
+ bch2_trans_unlock_write(trans);
+ bch2_trans_unlock(trans);
+
+ new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
+ if (!new_k) {
+ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch2_btree_id_str(path->btree_id), new_u64s);
+ return -BCH_ERR_ENOMEM_btree_key_cache_insert;
+ }
+
+ ret = bch2_trans_relock(trans) ?:
+ bch2_trans_lock_write(trans);
+ if (unlikely(ret)) {
+ kfree(new_k);
+ return ret;
+ }
+
+ memcpy(new_k, ck->k, ck->u64s * sizeof(u64));
+
+ trans_for_each_update(trans, i)
+ if (i->old_v == &ck->k->v)
+ i->old_v = &new_k->v;
+
+ kfree(ck->k);
+ ck->u64s = new_u64s;
+ ck->k = new_k;
+ return 0;
+}
+
+static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
+ struct btree_path *path, unsigned u64s)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_cached *ck = (void *) path->l[0].b;
+ struct btree_insert_entry *i;
+ unsigned new_u64s;
+ struct bkey_i *new_k;
+
+ EBUG_ON(path->level);
+
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+ bch2_btree_key_cache_must_wait(c) &&
+ !(flags & BTREE_INSERT_JOURNAL_RECLAIM))
+ return -BCH_ERR_btree_insert_need_journal_reclaim;
+
+ /*
+ * bch2_varint_decode can read past the end of the buffer by at most 7
+ * bytes (it won't be used):
+ */
+ u64s += 1;
+
+ if (u64s <= ck->u64s)
+ return 0;
+
+ new_u64s = roundup_pow_of_two(u64s);
+ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
+ if (unlikely(!new_k))
+ return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
+
+ trans_for_each_update(trans, i)
+ if (i->old_v == &ck->k->v)
+ i->old_v = &new_k->v;
+
+ ck->u64s = new_u64s;
+ ck->k = new_k;
+ return 0;
+}
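+
+/*
+ * Sizing example for the reallocation above (illustrative): if the cached key
+ * currently has ck->u64s == 8 and the update needs u64s == 10, the +1 pad for
+ * bch2_varint_decode makes that 11, which rounds up to a 16 u64 (128 byte)
+ * allocation.  Growing in powers of two bounds the number of reallocations as
+ * a key grows.
+ */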
+
+/* Triggers: */
+
+static int run_one_mem_trigger(struct btree_trans *trans,
+ struct btree_insert_entry *i,
+ unsigned flags)
+{
+ struct bkey_s_c old = { &i->old_k, i->old_v };
+ struct bkey_i *new = i->k;
+ const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
+ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
+ int ret;
+
+ verify_update_old_key(trans, i);
+
+ if (unlikely(flags & BTREE_TRIGGER_NORUN))
+ return 0;
+
+ if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)))
+ return 0;
+
+ if (old_ops->atomic_trigger == new_ops->atomic_trigger) {
+ ret = bch2_mark_key(trans, i->btree_id, i->level,
+ old, bkey_i_to_s_c(new),
+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
+ } else {
+ struct bkey _deleted = KEY(0, 0, 0);
+ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
+
+ _deleted.p = i->path->pos;
+
+ ret = bch2_mark_key(trans, i->btree_id, i->level,
+ deleted, bkey_i_to_s_c(new),
+ BTREE_TRIGGER_INSERT|flags) ?:
+ bch2_mark_key(trans, i->btree_id, i->level,
+ old, deleted,
+ BTREE_TRIGGER_OVERWRITE|flags);
+ }
+
+ return ret;
+}
+
+static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
+ bool overwrite)
+{
+ /*
+ * Transactional triggers create new btree_insert_entries, so we can't
+	 * pass them a pointer to a btree_insert_entry - that memory is going
+	 * to move:
+ */
+ struct bkey old_k = i->old_k;
+ struct bkey_s_c old = { &old_k, i->old_v };
+ const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
+ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
+
+ verify_update_old_key(trans, i);
+
+ if ((i->flags & BTREE_TRIGGER_NORUN) ||
+ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
+ return 0;
+
+ if (!i->insert_trigger_run &&
+ !i->overwrite_trigger_run &&
+ old_ops->trans_trigger == new_ops->trans_trigger) {
+ i->overwrite_trigger_run = true;
+ i->insert_trigger_run = true;
+ return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k,
+ BTREE_TRIGGER_INSERT|
+ BTREE_TRIGGER_OVERWRITE|
+ i->flags) ?: 1;
+ } else if (overwrite && !i->overwrite_trigger_run) {
+ i->overwrite_trigger_run = true;
+ return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1;
+ } else if (!overwrite && !i->insert_trigger_run) {
+ i->insert_trigger_run = true;
+ return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1;
+ } else {
+ return 0;
+ }
+}
+
+static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
+ struct btree_insert_entry *btree_id_start)
+{
+ struct btree_insert_entry *i;
+ bool trans_trigger_run;
+ int ret, overwrite;
+
+ for (overwrite = 1; overwrite >= 0; --overwrite) {
+
+ /*
+ * Running triggers will append more updates to the list of updates as
+ * we're walking it:
+ */
+ do {
+ trans_trigger_run = false;
+
+ for (i = btree_id_start;
+ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
+ i++) {
+ if (i->btree_id != btree_id)
+ continue;
+
+ ret = run_one_trans_trigger(trans, i, overwrite);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ trans_trigger_run = true;
+ }
+ } while (trans_trigger_run);
+ }
+
+ return 0;
+}
+
+static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
+ unsigned btree_id = 0;
+ int ret = 0;
+
+ /*
+ * For a given btree, this algorithm runs insert triggers before
+ * overwrite triggers: this is so that when extents are being moved
+ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
+ * they are re-added.
+ */
+ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
+ if (btree_id == BTREE_ID_alloc)
+ continue;
+
+ while (btree_id_start < trans->updates + trans->nr_updates &&
+ btree_id_start->btree_id < btree_id)
+ btree_id_start++;
+
+ ret = run_btree_triggers(trans, btree_id, btree_id_start);
+ if (ret)
+ return ret;
+ }
+
+ trans_for_each_update(trans, i) {
+ if (i->btree_id > BTREE_ID_alloc)
+ break;
+ if (i->btree_id == BTREE_ID_alloc) {
+ ret = run_btree_triggers(trans, BTREE_ID_alloc, i);
+ if (ret)
+ return ret;
+ break;
+ }
+ }
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ trans_for_each_update(trans, i)
+ BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
+ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
+ (!i->insert_trigger_run || !i->overwrite_trigger_run));
+#endif
+ return 0;
+}
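+
+/*
+ * Worked through on an example: a commit that updates an extent and an alloc
+ * key runs the extent's transactional triggers first (which may themselves
+ * queue further alloc updates), and only then the alloc triggers - which is
+ * why BTREE_ID_alloc is skipped in the first pass above and handled last.
+ */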
+
+static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i;
+ int ret = 0;
+
+ trans_for_each_update(trans, i) {
+ /*
+ * XXX: synchronization of cached update triggers with gc
+ * XXX: synchronization of interior node updates with gc
+ */
+ BUG_ON(i->cached || i->level);
+
+ if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) {
+ ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
+ if (ret)
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static inline int
+bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
+ struct btree_insert_entry **stopped_at,
+ unsigned long trace_ip)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i;
+ struct btree_write_buffered_key *wb;
+ struct btree_trans_commit_hook *h;
+ unsigned u64s = 0;
+ int ret;
+
+ if (race_fault()) {
+ trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
+ return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject);
+ }
+
+ /*
+	 * Check that the insert will fit in the leaf node with the write lock
+	 * held; otherwise another thread could write out the node, changing
+	 * the amount of space available:
+ */
+
+ prefetch(&trans->c->journal.flags);
+
+ trans_for_each_update(trans, i) {
+ /* Multiple inserts might go to same leaf: */
+ if (!same_leaf_as_prev(trans, i))
+ u64s = 0;
+
+ u64s += i->k->k.u64s;
+ ret = !i->cached
+ ? btree_key_can_insert(trans, insert_l(i)->b, u64s)
+ : btree_key_can_insert_cached(trans, flags, i->path, u64s);
+ if (ret) {
+ *stopped_at = i;
+ return ret;
+ }
+ }
+
+ if (trans->nr_wb_updates &&
+ trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size)
+ return -BCH_ERR_btree_insert_need_flush_buffer;
+
+ /*
+	 * Don't get a journal reservation until after we know the insert will
+	 * succeed:
+ */
+ if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+ ret = bch2_trans_journal_res_get(trans,
+ (flags & BCH_WATERMARK_MASK)|
+ JOURNAL_RES_GET_NONBLOCK);
+ if (ret)
+ return ret;
+
+ if (unlikely(trans->journal_transaction_names))
+ journal_transaction_name(trans);
+ } else {
+ trans->journal_res.seq = c->journal.replay_journal_seq;
+ }
+
+ /*
+ * Not allowed to fail after we've gotten our journal reservation - we
+ * have to use it:
+ */
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ !(flags & BTREE_INSERT_JOURNAL_REPLAY)) {
+ if (bch2_journal_seq_verify)
+ trans_for_each_update(trans, i)
+ i->k->k.version.lo = trans->journal_res.seq;
+ else if (bch2_inject_invalid_keys)
+ trans_for_each_update(trans, i)
+ i->k->k.version = MAX_VERSION;
+ }
+
+ if (trans->fs_usage_deltas &&
+ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
+ return -BCH_ERR_btree_insert_need_mark_replicas;
+
+ if (trans->nr_wb_updates) {
+ EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY);
+
+ ret = bch2_btree_insert_keys_write_buffer(trans);
+ if (ret)
+ goto revert_fs_usage;
+ }
+
+ h = trans->hooks;
+ while (h) {
+ ret = h->fn(trans, h);
+ if (ret)
+ goto revert_fs_usage;
+ h = h->next;
+ }
+
+ trans_for_each_update(trans, i)
+ if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
+ ret = run_one_mem_trigger(trans, i, i->flags);
+ if (ret)
+ goto fatal_err;
+ }
+
+ if (unlikely(c->gc_pos.phase)) {
+ ret = bch2_trans_commit_run_gc_triggers(trans);
+ if (ret)
+ goto fatal_err;
+ }
+
+ if (unlikely(trans->extra_journal_entries.nr)) {
+ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
+ trans->extra_journal_entries.data,
+ trans->extra_journal_entries.nr);
+
+ trans->journal_res.offset += trans->extra_journal_entries.nr;
+ trans->journal_res.u64s -= trans->extra_journal_entries.nr;
+ }
+
+ if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+ struct journal *j = &c->journal;
+ struct jset_entry *entry;
+
+ trans_for_each_update(trans, i) {
+ if (i->key_cache_already_flushed)
+ continue;
+
+ if (i->flags & BTREE_UPDATE_NOJOURNAL)
+ continue;
+
+ verify_update_old_key(trans, i);
+
+ if (trans->journal_transaction_names) {
+ entry = bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_overwrite,
+ i->btree_id, i->level,
+ i->old_k.u64s);
+ bkey_reassemble((struct bkey_i *) entry->start,
+ (struct bkey_s_c) { &i->old_k, i->old_v });
+ }
+
+ entry = bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_btree_keys,
+ i->btree_id, i->level,
+ i->k->k.u64s);
+ bkey_copy((struct bkey_i *) entry->start, i->k);
+ }
+
+ trans_for_each_wb_update(trans, wb) {
+ entry = bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_btree_keys,
+ wb->btree, 0,
+ wb->k.k.u64s);
+ bkey_copy((struct bkey_i *) entry->start, &wb->k);
+ }
+
+ if (trans->journal_seq)
+ *trans->journal_seq = trans->journal_res.seq;
+ }
+
+ trans_for_each_update(trans, i) {
+ i->k->k.needs_whiteout = false;
+
+ if (!i->cached) {
+ u64 seq = trans->journal_res.seq;
+
+ if (i->flags & BTREE_UPDATE_PREJOURNAL)
+ seq = i->seq;
+
+ bch2_btree_insert_key_leaf(trans, i->path, i->k, seq);
+ } else if (!i->key_cache_already_flushed)
+ bch2_btree_insert_key_cached(trans, flags, i);
+ else {
+ bch2_btree_key_cache_drop(trans, i->path);
+ btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE);
+ }
+ }
+
+ return 0;
+fatal_err:
+ bch2_fatal_error(c);
+revert_fs_usage:
+ if (trans->fs_usage_deltas)
+ bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas);
+ return ret;
+}
+
+static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i;
+ struct btree_write_buffered_key *wb;
+
+ trans_for_each_update(trans, i)
+ bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
+
+ trans_for_each_wb_update(trans, wb)
+ bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p);
+}
+
+static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
+ enum bkey_invalid_flags flags,
+ struct btree_insert_entry *i,
+ struct printbuf *err)
+{
+ struct bch_fs *c = trans->c;
+
+ printbuf_reset(err);
+ prt_printf(err, "invalid bkey on insert from %s -> %ps",
+ trans->fn, (void *) i->ip_allocated);
+ prt_newline(err);
+ printbuf_indent_add(err, 2);
+
+ bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k));
+ prt_newline(err);
+
+ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, flags, err);
+ bch2_print_string_as_lines(KERN_ERR, err->buf);
+
+ bch2_inconsistent_error(c);
+ bch2_dump_trans_updates(trans);
+
+ return -EINVAL;
+}
+
+/*
+ * Get journal reservation, take write locks, and attempt to do btree update(s):
+ */
+static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags,
+ struct btree_insert_entry **stopped_at,
+ unsigned long trace_ip)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i;
+ int ret = 0, u64s_delta = 0;
+
+ trans_for_each_update(trans, i) {
+ if (i->cached)
+ continue;
+
+ u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
+ u64s_delta -= i->old_btree_u64s;
+
+ if (!same_leaf_as_next(trans, i)) {
+ if (u64s_delta <= 0) {
+ ret = bch2_foreground_maybe_merge(trans, i->path,
+ i->level, flags);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ u64s_delta = 0;
+ }
+ }
+
+ ret = bch2_trans_lock_write(trans);
+ if (unlikely(ret))
+ return ret;
+
+ ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip);
+
+ if (!ret && unlikely(trans->journal_replay_not_finished))
+ bch2_drop_overwrites_from_journal(trans);
+
+ bch2_trans_unlock_write(trans);
+
+ if (!ret && trans->journal_pin)
+ bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
+ trans->journal_pin, NULL);
+
+ /*
+ * Drop journal reservation after dropping write locks, since dropping
+ * the journal reservation may kick off a journal write:
+ */
+ bch2_journal_res_put(&c->journal, &trans->journal_res);
+
+ return ret;
+}
+
+static int journal_reclaim_wait_done(struct bch_fs *c)
+{
+ int ret = bch2_journal_error(&c->journal) ?:
+ !bch2_btree_key_cache_must_wait(c);
+
+ if (!ret)
+ journal_reclaim_kick(&c->journal);
+ return ret;
+}
+
+static noinline
+int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
+ struct btree_insert_entry *i,
+ int ret, unsigned long trace_ip)
+{
+ struct bch_fs *c = trans->c;
+
+ switch (ret) {
+ case -BCH_ERR_btree_insert_btree_node_full:
+ ret = bch2_btree_split_leaf(trans, i->path, flags);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path);
+ break;
+ case -BCH_ERR_btree_insert_need_mark_replicas:
+ ret = drop_locks_do(trans,
+ bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas));
+ break;
+ case -BCH_ERR_journal_res_get_blocked:
+ /*
+ * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
+ * flag
+ */
+ if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
+ ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ break;
+ }
+
+ ret = drop_locks_do(trans,
+ bch2_trans_journal_res_get(trans,
+ (flags & BCH_WATERMARK_MASK)|
+ JOURNAL_RES_GET_CHECK));
+ break;
+ case -BCH_ERR_btree_insert_need_journal_reclaim:
+ bch2_trans_unlock(trans);
+
+ trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
+
+ wait_event_freezable(c->journal.reclaim_wait,
+ (ret = journal_reclaim_wait_done(c)));
+ if (ret < 0)
+ break;
+
+ ret = bch2_trans_relock(trans);
+ break;
+ case -BCH_ERR_btree_insert_need_flush_buffer: {
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ ret = 0;
+
+ if (wb->state.nr > wb->size * 3 / 4) {
+ bch2_trans_unlock(trans);
+ mutex_lock(&wb->flush_lock);
+
+ if (wb->state.nr > wb->size * 3 / 4) {
+ bch2_trans_begin(trans);
+ ret = __bch2_btree_write_buffer_flush(trans,
+ flags|BTREE_INSERT_NOCHECK_RW, true);
+ if (!ret) {
+ trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
+ }
+ } else {
+ mutex_unlock(&wb->flush_lock);
+ ret = bch2_trans_relock(trans);
+ }
+ }
+ break;
+ }
+ default:
+ BUG_ON(ret >= 0);
+ break;
+ }
+
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
+
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
+ !(flags & BTREE_INSERT_NOWAIT) &&
+ (flags & BTREE_INSERT_NOFAIL), c,
+ "%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
+
+ return ret;
+}
+
+static noinline int
+bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ int ret;
+
+ if (likely(!(flags & BTREE_INSERT_LAZY_RW)) ||
+ test_bit(BCH_FS_STARTED, &c->flags))
+ return -BCH_ERR_erofs_trans_commit;
+
+ ret = drop_locks_do(trans, bch2_fs_read_write_early(c));
+ if (ret)
+ return ret;
+
+ bch2_write_ref_get(c, BCH_WRITE_REF_trans);
+ return 0;
+}
+
+/*
+ * This is for updates done in the early part of fsck - btree_gc - before we've
+ * gone RW. We only add the new key to the list of keys for journal replay to
+ * do.
+ */
+static noinline int
+do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i;
+ int ret = 0;
+
+ trans_for_each_update(trans, i) {
+ ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i = NULL;
+ struct btree_write_buffered_key *wb;
+ int ret = 0;
+
+ if (!trans->nr_updates &&
+ !trans->nr_wb_updates &&
+ !trans->extra_journal_entries.nr)
+ goto out_reset;
+
+ if (flags & BTREE_INSERT_GC_LOCK_HELD)
+ lockdep_assert_held(&c->gc_lock);
+
+ ret = bch2_trans_commit_run_triggers(trans);
+ if (ret)
+ goto out_reset;
+
+ trans_for_each_update(trans, i) {
+ struct printbuf buf = PRINTBUF;
+ enum bkey_invalid_flags invalid_flags = 0;
+
+ if (!(flags & BTREE_INSERT_JOURNAL_REPLAY))
+ invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
+
+ if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
+ i->bkey_type, invalid_flags, &buf)))
+ ret = bch2_trans_commit_bkey_invalid(trans, invalid_flags, i, &buf);
+ btree_insert_entry_checks(trans, i);
+ printbuf_exit(&buf);
+
+ if (ret)
+ return ret;
+ }
+
+ if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
+ ret = do_bch2_trans_commit_to_journal_replay(trans);
+ goto out_reset;
+ }
+
+ if (!(flags & BTREE_INSERT_NOCHECK_RW) &&
+ unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
+ ret = bch2_trans_commit_get_rw_cold(trans, flags);
+ if (ret)
+ goto out_reset;
+ }
+
+ if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 &&
+ mutex_trylock(&c->btree_write_buffer.flush_lock)) {
+ bch2_trans_begin(trans);
+ bch2_trans_unlock(trans);
+
+ ret = __bch2_btree_write_buffer_flush(trans,
+ flags|BTREE_INSERT_NOCHECK_RW, true);
+ if (!ret) {
+ trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
+ }
+ goto out;
+ }
+
+ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+
+ trans->journal_u64s = trans->extra_journal_entries.nr;
+ trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
+ if (trans->journal_transaction_names)
+ trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
+
+ trans_for_each_update(trans, i) {
+ EBUG_ON(!i->path->should_be_locked);
+
+ ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1);
+ if (unlikely(ret))
+ goto out;
+
+ EBUG_ON(!btree_node_intent_locked(i->path, i->level));
+
+ if (i->key_cache_already_flushed)
+ continue;
+
+ if (i->flags & BTREE_UPDATE_NOJOURNAL)
+ continue;
+
+ /* we're going to journal the key being updated: */
+ trans->journal_u64s += jset_u64s(i->k->k.u64s);
+
+ /* and we're also going to log the overwrite: */
+ if (trans->journal_transaction_names)
+ trans->journal_u64s += jset_u64s(i->old_k.u64s);
+ }
+
+ trans_for_each_wb_update(trans, wb)
+ trans->journal_u64s += jset_u64s(wb->k.k.u64s);
+
+ if (trans->extra_journal_res) {
+ ret = bch2_disk_reservation_add(c, trans->disk_res,
+ trans->extra_journal_res,
+ (flags & BTREE_INSERT_NOFAIL)
+ ? BCH_DISK_RESERVATION_NOFAIL : 0);
+ if (ret)
+ goto err;
+ }
+retry:
+ bch2_trans_verify_not_in_restart(trans);
+ memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+
+ ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_);
+
+ /* make sure we didn't drop or screw up locks: */
+ bch2_trans_verify_locks(trans);
+
+ if (ret)
+ goto err;
+
+ trace_and_count(c, transaction_commit, trans, _RET_IP_);
+out:
+ if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
+ bch2_write_ref_put(c, BCH_WRITE_REF_trans);
+out_reset:
+ if (!ret)
+ bch2_trans_downgrade(trans);
+ bch2_trans_reset_updates(trans);
+
+ return ret;
+err:
+ ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_);
+ if (ret)
+ goto out;
+
+ goto retry;
+}
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
new file mode 100644
index 000000000000..60453ba86c4b
--- /dev/null
+++ b/fs/bcachefs/btree_types.h
@@ -0,0 +1,725 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_TYPES_H
+#define _BCACHEFS_BTREE_TYPES_H
+
+#include <linux/list.h>
+#include <linux/rhashtable.h>
+
+#include "btree_key_cache_types.h"
+#include "buckets_types.h"
+#include "darray.h"
+#include "errcode.h"
+#include "journal_types.h"
+#include "replicas_types.h"
+#include "six.h"
+
+struct open_bucket;
+struct btree_update;
+struct btree_trans;
+
+#define MAX_BSETS 3U
+
+struct btree_nr_keys {
+
+ /*
+ * Amount of live metadata (i.e. size of node after a compaction) in
+ * units of u64s
+ */
+ u16 live_u64s;
+ u16 bset_u64s[MAX_BSETS];
+
+ /* live keys only: */
+ u16 packed_keys;
+ u16 unpacked_keys;
+};
+
+struct bset_tree {
+ /*
+ * We construct a binary tree in an array as if the array
+ * started at 1, so that things line up on the same cachelines
+ * better: see comments in bset.c at cacheline_to_bkey() for
+ * details
+ */
+
+ /* size of the binary tree and prev array */
+ u16 size;
+
+ /* function of size - precalculated for to_inorder() */
+ u16 extra;
+
+ u16 data_offset;
+ u16 aux_data_offset;
+ u16 end_offset;
+};
+
+struct btree_write {
+ struct journal_entry_pin journal;
+};
+
+struct btree_alloc {
+ struct open_buckets ob;
+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX);
+};
+
+struct btree_bkey_cached_common {
+ struct six_lock lock;
+ u8 level;
+ u8 btree_id;
+ bool cached;
+};
+
+struct btree {
+ struct btree_bkey_cached_common c;
+
+ struct rhash_head hash;
+ u64 hash_val;
+
+ unsigned long flags;
+ u16 written;
+ u8 nsets;
+ u8 nr_key_bits;
+ u16 version_ondisk;
+
+ struct bkey_format format;
+
+ struct btree_node *data;
+ void *aux_data;
+
+ /*
+ * Sets of sorted keys - the real btree node - plus a binary search tree
+ *
+ * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
+ * to the memory we have allocated for this btree node. Additionally,
+ * set[0]->data points to the entire btree node as it exists on disk.
+ */
+ struct bset_tree set[MAX_BSETS];
+
+ struct btree_nr_keys nr;
+ u16 sib_u64s[2];
+ u16 whiteout_u64s;
+ u8 byte_order;
+ u8 unpack_fn_len;
+
+ struct btree_write writes[2];
+
+ /* Key/pointer for this btree node */
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+
+ /*
+ * XXX: add a delete sequence number, so when bch2_btree_node_relock()
+ * fails because the lock sequence number has changed - i.e. the
+ * contents were modified - we can still relock the node if it's still
+ * the one we want, without redoing the traversal
+ */
+
+ /*
+ * For asynchronous splits/interior node updates:
+ * When we do a split, we allocate new child nodes and update the parent
+ * node to point to them: we update the parent in memory immediately,
+ * but then we must wait until the children have been written out before
+ * the update to the parent can be written - this is a list of the
+ * btree_updates that are blocking this node from being
+ * written:
+ */
+ struct list_head write_blocked;
+
+ /*
+ * Also for asynchronous splits/interior node updates:
+ * If a btree node isn't reachable yet, we don't want to kick off
+ * another write - because that write also won't yet be reachable and
+ * marking it as completed before it's reachable would be incorrect:
+ */
+ unsigned long will_make_reachable;
+
+ struct open_buckets ob;
+
+ /* lru list */
+ struct list_head list;
+};
+
+struct btree_cache {
+ struct rhashtable table;
+ bool table_init_done;
+ /*
+ * We never free a struct btree, except on shutdown - we just put it on
+ * the btree_cache_freed list and reuse it later. This simplifies the
+ * code, and it doesn't cost us much memory as the memory usage is
+ * dominated by buffers that hold the actual btree node data and those
+ * can be freed - and the number of struct btrees allocated is
+ * effectively bounded.
+ *
+ * btree_cache_freeable effectively is a small cache - we use it because
+ * high order page allocations can be rather expensive, and it's quite
+ * common to delete and allocate btree nodes in quick succession. It
+ * should never grow past ~2-3 nodes in practice.
+ */
+ struct mutex lock;
+ struct list_head live;
+ struct list_head freeable;
+ struct list_head freed_pcpu;
+ struct list_head freed_nonpcpu;
+
+ /* Number of elements in live + freeable lists */
+ unsigned used;
+ unsigned reserve;
+ atomic_t dirty;
+ struct shrinker *shrink;
+
+ /*
+ * If we need to allocate memory for a new btree node and that
+ * allocation fails, we can cannibalize another node in the btree cache
+ * to satisfy the allocation - lock to guarantee only one thread does
+ * this at a time:
+ */
+ struct task_struct *alloc_lock;
+ struct closure_waitlist alloc_wait;
+};
+
+struct btree_node_iter {
+ struct btree_node_iter_set {
+ u16 k, end;
+ } data[MAX_BSETS];
+};
+
+/*
+ * Iterate over all possible positions, synthesizing deleted keys for holes:
+ */
+static const __maybe_unused u16 BTREE_ITER_SLOTS = 1 << 0;
+static const __maybe_unused u16 BTREE_ITER_ALL_LEVELS = 1 << 1;
+/*
+ * Indicates that intent locks should be taken on leaf nodes, because we expect
+ * to be doing updates:
+ */
+static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 2;
+/*
+ * Causes the btree iterator code to prefetch additional btree nodes from disk:
+ */
+static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 3;
+/*
+ * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
+ * @pos or the first key strictly greater than @pos
+ */
+static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 4;
+static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 5;
+static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 6;
+static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7;
+static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 8;
+static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 9;
+static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10;
+static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11;
+static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12;
+static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 13;
+static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 14;
+static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15;
+#define __BTREE_ITER_FLAGS_END 16
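+
+/*
+ * Flags are combined with bitwise or when initializing an iterator, e.g.
+ * (illustrative):
+ *
+ *	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
+ *			     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ */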
+
+enum btree_path_uptodate {
+ BTREE_ITER_UPTODATE = 0,
+ BTREE_ITER_NEED_RELOCK = 1,
+ BTREE_ITER_NEED_TRAVERSE = 2,
+};
+
+#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG)
+#define TRACK_PATH_ALLOCATED
+#endif
+
+struct btree_path {
+ u8 idx;
+ u8 sorted_idx;
+ u8 ref;
+ u8 intent_ref;
+ u32 alloc_seq;
+ u32 downgrade_seq;
+
+ /* btree_iter_copy starts here: */
+ struct bpos pos;
+
+ enum btree_id btree_id:5;
+ bool cached:1;
+ bool preserve:1;
+ enum btree_path_uptodate uptodate:2;
+ /*
+ * When true, failing to relock this path will cause the transaction to
+ * restart:
+ */
+ bool should_be_locked:1;
+ unsigned level:3,
+ locks_want:3;
+ u8 nodes_locked;
+
+ struct btree_path_level {
+ struct btree *b;
+ struct btree_node_iter iter;
+ u32 lock_seq;
+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
+ u64 lock_taken_time;
+#endif
+ } l[BTREE_MAX_DEPTH];
+#ifdef TRACK_PATH_ALLOCATED
+ unsigned long ip_allocated;
+#endif
+};
+
+static inline struct btree_path_level *path_l(struct btree_path *path)
+{
+ return path->l + path->level;
+}
+
+static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
+{
+#ifdef TRACK_PATH_ALLOCATED
+ return path->ip_allocated;
+#else
+ return _THIS_IP_;
+#endif
+}
+
+/*
+ * @pos - iterator's current position
+ *
+ * Note: @level, @locks_want and @nodes_locked (the per-node lock state) live
+ * on struct btree_path, which the iterator points to, not on the iterator
+ * itself.
+ */
+struct btree_iter {
+ struct btree_trans *trans;
+ struct btree_path *path;
+ struct btree_path *update_path;
+ struct btree_path *key_cache_path;
+
+ enum btree_id btree_id:8;
+ unsigned min_depth:3;
+ unsigned advanced:1;
+
+ /* btree_iter_copy starts here: */
+ u16 flags;
+
+ /* When we're filtering by snapshot, the snapshot ID we're looking for: */
+ unsigned snapshot;
+
+ struct bpos pos;
+ /*
+ * Current unpacked key - so that bch2_btree_iter_next()/
+ * bch2_btree_iter_next_slot() can correctly advance pos.
+ */
+ struct bkey k;
+
+ /* BTREE_ITER_WITH_JOURNAL: */
+ size_t journal_idx;
+ struct bpos journal_pos;
+#ifdef TRACK_PATH_ALLOCATED
+ unsigned long ip_allocated;
+#endif
+};
+
+#define BKEY_CACHED_ACCESSED 0
+#define BKEY_CACHED_DIRTY 1
+
+struct bkey_cached {
+ struct btree_bkey_cached_common c;
+
+ unsigned long flags;
+ u16 u64s;
+ bool valid;
+ u32 btree_trans_barrier_seq;
+ struct bkey_cached_key key;
+
+ struct rhash_head hash;
+ struct list_head list;
+
+ struct journal_entry_pin journal;
+ u64 seq;
+
+ struct bkey_i *k;
+};
+
+static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
+{
+ return !b->cached
+ ? container_of(b, struct btree, c)->key.k.p
+ : container_of(b, struct bkey_cached, c)->key.pos;
+}
+
+struct btree_insert_entry {
+ unsigned flags;
+ u8 bkey_type;
+ enum btree_id btree_id:8;
+ u8 level:4;
+ bool cached:1;
+ bool insert_trigger_run:1;
+ bool overwrite_trigger_run:1;
+ bool key_cache_already_flushed:1;
+ /*
+ * @old_k may be a key from the journal; @old_btree_u64s always refers
+ * to the size of the key being overwritten in the btree:
+ */
+ u8 old_btree_u64s;
+ struct bkey_i *k;
+ struct btree_path *path;
+ u64 seq;
+ /* key being overwritten: */
+ struct bkey old_k;
+ const struct bch_val *old_v;
+ unsigned long ip_allocated;
+};
+
+#define BTREE_ITER_MAX 64
+
+struct btree_trans_commit_hook;
+typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
+
+struct btree_trans_commit_hook {
+ btree_trans_commit_hook_fn *fn;
+ struct btree_trans_commit_hook *next;
+};
+
+#define BTREE_TRANS_MEM_MAX (1U << 16)
+
+#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000
+
+struct btree_trans {
+ struct bch_fs *c;
+ const char *fn;
+ struct closure ref;
+ struct list_head list;
+ u64 last_begin_time;
+
+ u8 lock_may_not_fail;
+ u8 lock_must_abort;
+ struct btree_bkey_cached_common *locking;
+ struct six_lock_waiter locking_wait;
+
+ int srcu_idx;
+
+ u8 fn_idx;
+ u8 nr_sorted;
+ u8 nr_updates;
+ u8 nr_wb_updates;
+ u8 wb_updates_size;
+ bool srcu_held:1;
+ bool used_mempool:1;
+ bool in_traverse_all:1;
+ bool paths_sorted:1;
+ bool memory_allocation_failure:1;
+ bool journal_transaction_names:1;
+ bool journal_replay_not_finished:1;
+ bool notrace_relock_fail:1;
+ bool write_locked:1;
+ enum bch_errcode restarted:16;
+ u32 restart_count;
+ unsigned long last_begin_ip;
+ unsigned long last_restarted_ip;
+ unsigned long srcu_lock_time;
+
+ /*
+ * For when bch2_trans_update notices we'll be splitting a compressed
+ * extent:
+ */
+ unsigned extra_journal_res;
+ unsigned nr_max_paths;
+
+ u64 paths_allocated;
+
+ unsigned mem_top;
+ unsigned mem_max;
+ unsigned mem_bytes;
+ void *mem;
+
+ u8 sorted[BTREE_ITER_MAX + 8];
+ struct btree_path paths[BTREE_ITER_MAX];
+ struct btree_insert_entry updates[BTREE_ITER_MAX];
+ struct btree_write_buffered_key *wb_updates;
+
+ /* update path: */
+ struct btree_trans_commit_hook *hooks;
+ darray_u64 extra_journal_entries;
+ struct journal_entry_pin *journal_pin;
+
+ struct journal_res journal_res;
+ u64 *journal_seq;
+ struct disk_reservation *disk_res;
+ unsigned journal_u64s;
+ struct replicas_delta_list *fs_usage_deltas;
+};
+
+#define BCH_BTREE_WRITE_TYPES() \
+ x(initial, 0) \
+ x(init_next_bset, 1) \
+ x(cache_reclaim, 2) \
+ x(journal_reclaim, 3) \
+ x(interior, 4)
+
+enum btree_write_type {
+#define x(t, n) BTREE_WRITE_##t,
+ BCH_BTREE_WRITE_TYPES()
+#undef x
+ BTREE_WRITE_TYPE_NR,
+};
+
+#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1)
+#define BTREE_WRITE_TYPE_BITS ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR))
+
+#define BTREE_FLAGS() \
+ x(read_in_flight) \
+ x(read_error) \
+ x(dirty) \
+ x(need_write) \
+ x(write_blocked) \
+ x(will_make_reachable) \
+ x(noevict) \
+ x(write_idx) \
+ x(accessed) \
+ x(write_in_flight) \
+ x(write_in_flight_inner) \
+ x(just_written) \
+ x(dying) \
+ x(fake) \
+ x(need_rewrite) \
+ x(never_write)
+
+enum btree_flags {
+ /* First bits for btree node write type */
+ BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1,
+#define x(flag) BTREE_NODE_##flag,
+ BTREE_FLAGS()
+#undef x
+};
+
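+/*
+ * Generates btree_node_<flag>(), set_btree_node_<flag>() and
+ * clear_btree_node_<flag>() helpers for each flag above:
+ */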
+#define x(flag) \
+static inline bool btree_node_ ## flag(struct btree *b) \
+{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
+ \
+static inline void set_btree_node_ ## flag(struct btree *b) \
+{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
+ \
+static inline void clear_btree_node_ ## flag(struct btree *b) \
+{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }
+
+BTREE_FLAGS()
+#undef x
+
+static inline struct btree_write *btree_current_write(struct btree *b)
+{
+ return b->writes + btree_node_write_idx(b);
+}
+
+static inline struct btree_write *btree_prev_write(struct btree *b)
+{
+ return b->writes + (btree_node_write_idx(b) ^ 1);
+}
+
+static inline struct bset_tree *bset_tree_last(struct btree *b)
+{
+ EBUG_ON(!b->nsets);
+ return b->set + b->nsets - 1;
+}
+
+static inline void *
+__btree_node_offset_to_ptr(const struct btree *b, u16 offset)
+{
+ return (void *) ((u64 *) b->data + 1 + offset);
+}
+
+static inline u16
+__btree_node_ptr_to_offset(const struct btree *b, const void *p)
+{
+ u16 ret = (u64 *) p - 1 - (u64 *) b->data;
+
+ EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);
+ return ret;
+}
+
+static inline struct bset *bset(const struct btree *b,
+ const struct bset_tree *t)
+{
+ return __btree_node_offset_to_ptr(b, t->data_offset);
+}
+
+static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
+{
+ t->end_offset =
+ __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t)));
+}
+
+static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
+ const struct bset *i)
+{
+ t->data_offset = __btree_node_ptr_to_offset(b, i);
+ set_btree_bset_end(b, t);
+}
+
+static inline struct bset *btree_bset_first(struct btree *b)
+{
+ return bset(b, b->set);
+}
+
+static inline struct bset *btree_bset_last(struct btree *b)
+{
+ return bset(b, bset_tree_last(b));
+}
+
+static inline u16
+__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
+{
+ return __btree_node_ptr_to_offset(b, k);
+}
+
+static inline struct bkey_packed *
+__btree_node_offset_to_key(const struct btree *b, u16 k)
+{
+ return __btree_node_offset_to_ptr(b, k);
+}
+
+static inline unsigned btree_bkey_first_offset(const struct bset_tree *t)
+{
+ return t->data_offset + offsetof(struct bset, _data) / sizeof(u64);
+}
+
+#define btree_bkey_first(_b, _t) \
+({ \
+ EBUG_ON(bset(_b, _t)->start != \
+ __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\
+ \
+ bset(_b, _t)->start; \
+})
+
+#define btree_bkey_last(_b, _t) \
+({ \
+ EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \
+ vstruct_last(bset(_b, _t))); \
+ \
+ __btree_node_offset_to_key(_b, (_t)->end_offset); \
+})
+
+static inline unsigned bset_u64s(struct bset_tree *t)
+{
+ return t->end_offset - t->data_offset -
+ sizeof(struct bset) / sizeof(u64);
+}
+
+static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t)
+{
+ return bset_u64s(t) - b->nr.bset_u64s[t - b->set];
+}
+
+static inline unsigned bset_byte_offset(struct btree *b, void *i)
+{
+ return i - (void *) b->data;
+}
+
+enum btree_node_type {
+ BKEY_TYPE_btree,
+#define x(kwd, val, ...) BKEY_TYPE_##kwd = val + 1,
+ BCH_BTREE_IDS()
+#undef x
+ BKEY_TYPE_NR
+};
+
+/* Type of a key in btree @id at level @level: */
+static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
+{
+ return level ? BKEY_TYPE_btree : (unsigned) id + 1;
+}
+
+/* Type of keys @b contains: */
+static inline enum btree_node_type btree_node_type(struct btree *b)
+{
+ return __btree_node_type(b->c.level, b->c.btree_id);
+}
+
+const char *bch2_btree_node_type_str(enum btree_node_type);
+
+#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
+ (BIT_ULL(BKEY_TYPE_extents)| \
+ BIT_ULL(BKEY_TYPE_alloc)| \
+ BIT_ULL(BKEY_TYPE_inodes)| \
+ BIT_ULL(BKEY_TYPE_stripes)| \
+ BIT_ULL(BKEY_TYPE_reflink)| \
+ BIT_ULL(BKEY_TYPE_btree))
+
+#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \
+ (BIT_ULL(BKEY_TYPE_alloc)| \
+ BIT_ULL(BKEY_TYPE_inodes)| \
+ BIT_ULL(BKEY_TYPE_stripes)| \
+ BIT_ULL(BKEY_TYPE_snapshots))
+
+#define BTREE_NODE_TYPE_HAS_TRIGGERS \
+ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
+ BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
+
+static inline bool btree_node_type_needs_gc(enum btree_node_type type)
+{
+ return BTREE_NODE_TYPE_HAS_TRIGGERS & BIT_ULL(type);
+}
+
+static inline bool btree_node_type_is_extents(enum btree_node_type type)
+{
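+	/* Node types are btree ids shifted by one (BKEY_TYPE_btree is 0), hence the + 1: */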
+ const unsigned mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1))
+ BCH_BTREE_IDS()
+#undef x
+ ;
+
+ return (1U << type) & mask;
+}
+
+static inline bool btree_id_is_extents(enum btree_id btree)
+{
+ return btree_node_type_is_extents(__btree_node_type(0, btree));
+}
+
+static inline bool btree_type_has_snapshots(enum btree_id id)
+{
+ const unsigned mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr)
+ BCH_BTREE_IDS()
+#undef x
+ ;
+
+ return (1U << id) & mask;
+}
+
+static inline bool btree_type_has_snapshot_field(enum btree_id id)
+{
+ const unsigned mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr)
+ BCH_BTREE_IDS()
+#undef x
+ ;
+
+ return (1U << id) & mask;
+}
+
+static inline bool btree_type_has_ptrs(enum btree_id id)
+{
+ const unsigned mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr)
+ BCH_BTREE_IDS()
+#undef x
+ ;
+
+ return (1U << id) & mask;
+}
+
+struct btree_root {
+ struct btree *b;
+
+ /* On disk root - see async splits: */
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+ u8 level;
+ u8 alive;
+ s8 error;
+};
+
+enum btree_gc_coalesce_fail_reason {
+ BTREE_GC_COALESCE_FAIL_RESERVE_GET,
+ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
+ BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
+};
+
+enum btree_node_sibling {
+ btree_prev_sib,
+ btree_next_sib,
+};
+
+#endif /* _BCACHEFS_BTREE_TYPES_H */
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
new file mode 100644
index 000000000000..2fd3c8cc6f51
--- /dev/null
+++ b/fs/bcachefs/btree_update.c
@@ -0,0 +1,950 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "btree_iter.h"
+#include "btree_journal_iter.h"
+#include "btree_locking.h"
+#include "buckets.h"
+#include "debug.h"
+#include "errcode.h"
+#include "error.h"
+#include "extents.h"
+#include "keylist.h"
+#include "snapshot.h"
+#include "trace.h"
+
+static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
+ const struct btree_insert_entry *r)
+{
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ cmp_int(l->cached, r->cached) ?:
+ -cmp_int(l->level, r->level) ?:
+ bpos_cmp(l->k->k.p, r->k->k.p);
+}
+
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
+ struct bkey_i *, enum btree_update_flags,
+ unsigned long ip);
+
+static noinline int extent_front_merge(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bkey_i **insert,
+ enum btree_update_flags flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i *update;
+ int ret;
+
+ update = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ return ret;
+
+ if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert)))
+ return 0;
+
+ ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p) ?:
+ bch2_key_has_snapshot_overwrites(trans, iter->btree_id, (*insert)->k.p);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+
+ ret = bch2_btree_delete_at(trans, iter, flags);
+ if (ret)
+ return ret;
+
+ *insert = update;
+ return 0;
+}
+
+static noinline int extent_back_merge(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *insert,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ int ret;
+
+ ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?:
+ bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+
+ bch2_bkey_merge(c, bkey_i_to_s(insert), k);
+ return 0;
+}
+
+/*
+ * When deleting, check if we need to emit a whiteout (because we're overwriting
+ * something in an ancestor snapshot)
+ */
+static int need_whiteout_for_snapshot(struct btree_trans *trans,
+ enum btree_id btree_id, struct bpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u32 snapshot = pos.snapshot;
+ int ret;
+
+ if (!bch2_snapshot_parent(trans->c, pos.snapshot))
+ return 0;
+
+ pos.snapshot++;
+
+ for_each_btree_key_norestart(trans, iter, btree_id, pos,
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_NOPRESERVE, k, ret) {
+ if (!bkey_eq(k.k->p, pos))
+ break;
+
+ if (bch2_snapshot_is_ancestor(trans->c, snapshot,
+ k.k->p.snapshot)) {
+ ret = !bkey_whiteout(k.k);
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
+ enum btree_id id,
+ struct bpos old_pos,
+ struct bpos new_pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter old_iter, new_iter = { NULL };
+ struct bkey_s_c old_k, new_k;
+ snapshot_id_list s;
+ struct bkey_i *update;
+ int ret = 0;
+
+ if (!bch2_snapshot_has_children(c, old_pos.snapshot))
+ return 0;
+
+ darray_init(&s);
+
+ bch2_trans_iter_init(trans, &old_iter, id, old_pos,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS);
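+
+	/*
+	 * For every descendent snapshot with its own key at @old_pos, emit a
+	 * whiteout at @new_pos in that snapshot, unless a key already exists
+	 * there:
+	 */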
+ while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
+ !(ret = bkey_err(old_k)) &&
+ bkey_eq(old_pos, old_k.k->p)) {
+ struct bpos whiteout_pos =
+			SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
+
+ if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
+ snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
+ continue;
+
+ new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_INTENT);
+ ret = bkey_err(new_k);
+ if (ret)
+ break;
+
+ if (new_k.k->type == KEY_TYPE_deleted) {
+ update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ break;
+
+ bkey_init(&update->k);
+ update->k.p = whiteout_pos;
+ update->k.type = KEY_TYPE_whiteout;
+
+ ret = bch2_trans_update(trans, &new_iter, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ }
+ bch2_trans_iter_exit(trans, &new_iter);
+
+ ret = snapshot_list_add(c, &s, old_k.k->p.snapshot);
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &new_iter);
+ bch2_trans_iter_exit(trans, &old_iter);
+ darray_exit(&s);
+
+ return ret;
+}
+
+int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
+ struct btree_iter *iter,
+ enum btree_update_flags flags,
+ struct bkey_s_c old,
+ struct bkey_s_c new)
+{
+ enum btree_id btree_id = iter->btree_id;
+ struct bkey_i *update;
+ struct bpos new_start = bkey_start_pos(new.k);
+ unsigned front_split = bkey_lt(bkey_start_pos(old.k), new_start);
+ unsigned back_split = bkey_gt(old.k->p, new.k->p);
+ unsigned middle_split = (front_split || back_split) &&
+ old.k->p.snapshot != new.k->p.snapshot;
+ unsigned nr_splits = front_split + back_split + middle_split;
+ int ret = 0, compressed_sectors;
+
+ /*
+ * If we're going to be splitting a compressed extent, note it
+ * so that __bch2_trans_commit() can increase our disk
+ * reservation:
+ */
+ if (nr_splits > 1 &&
+ (compressed_sectors = bch2_bkey_sectors_compressed(old)))
+ trans->extra_journal_res += compressed_sectors * (nr_splits - 1);
+
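+	/* Existing extent starts before the new one - keep the front portion: */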
+ if (front_split) {
+ update = bch2_bkey_make_mut_noupdate(trans, old);
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ return ret;
+
+ bch2_cut_back(new_start, update);
+
+ ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
+ old.k->p, update->k.p) ?:
+ bch2_btree_insert_nonextent(trans, btree_id, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+ if (ret)
+ return ret;
+ }
+
+ /* If we're overwriting in a different snapshot - middle split: */
+ if (middle_split) {
+ update = bch2_bkey_make_mut_noupdate(trans, old);
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ return ret;
+
+ bch2_cut_front(new_start, update);
+ bch2_cut_back(new.k->p, update);
+
+ ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
+ old.k->p, update->k.p) ?:
+ bch2_btree_insert_nonextent(trans, btree_id, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+ if (ret)
+ return ret;
+ }
+
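+	/*
+	 * Delete the existing extent as seen from the new key's snapshot
+	 * (with a whiteout, if needed, so it stays deleted in this snapshot):
+	 */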
+ if (bkey_le(old.k->p, new.k->p)) {
+ update = bch2_trans_kmalloc(trans, sizeof(*update));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ return ret;
+
+ bkey_init(&update->k);
+ update->k.p = old.k->p;
+ update->k.p.snapshot = new.k->p.snapshot;
+
+ if (new.k->p.snapshot != old.k->p.snapshot) {
+ update->k.type = KEY_TYPE_whiteout;
+ } else if (btree_type_has_snapshots(btree_id)) {
+ ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ update->k.type = KEY_TYPE_whiteout;
+ }
+
+ ret = bch2_btree_insert_nonextent(trans, btree_id, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+ if (ret)
+ return ret;
+ }
+
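+	/* Existing extent extends past the end of the new one - keep the back portion: */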
+ if (back_split) {
+ update = bch2_bkey_make_mut_noupdate(trans, old);
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ return ret;
+
+ bch2_cut_front(new.k->p, update);
+
+ ret = bch2_trans_update_by_path(trans, iter->path, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ flags, _RET_IP_);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int bch2_trans_update_extent(struct btree_trans *trans,
+ struct btree_iter *orig_iter,
+ struct bkey_i *insert,
+ enum btree_update_flags flags)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ enum btree_id btree_id = orig_iter->btree_id;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_WITH_UPDATES|
+ BTREE_ITER_NOT_EXTENTS);
+ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
+ if ((ret = bkey_err(k)))
+ goto err;
+ if (!k.k)
+ goto out;
+
+ if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) {
+ if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
+ ret = extent_front_merge(trans, &iter, k, &insert, flags);
+ if (ret)
+ goto err;
+ }
+
+ goto next;
+ }
+
+ while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
+ bool done = bkey_lt(insert->k.p, k.k->p);
+
+ ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
+ if (ret)
+ goto err;
+
+ if (done)
+ goto out;
+next:
+ bch2_btree_iter_advance(&iter);
+ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
+ if ((ret = bkey_err(k)))
+ goto err;
+ if (!k.k)
+ goto out;
+ }
+
+ if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
+ ret = extent_back_merge(trans, &iter, insert, k);
+ if (ret)
+ goto err;
+ }
+out:
+ if (!bkey_deleted(&insert->k))
+ ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+static noinline int flush_new_cached_update(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_insert_entry *i,
+ enum btree_update_flags flags,
+ unsigned long ip)
+{
+ struct btree_path *btree_path;
+ struct bkey k;
+ int ret;
+
+ btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+ BTREE_ITER_INTENT, _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, btree_path, 0);
+ if (ret)
+ goto out;
+
+ /*
+ * The old key in the insert entry might actually refer to an existing
+ * key in the btree that has been deleted from cache and not yet
+ * flushed. Check for this and skip the flush so we don't run triggers
+ * against a stale key.
+ */
+ bch2_btree_path_peek_slot_exact(btree_path, &k);
+ if (!bkey_deleted(&k))
+ goto out;
+
+ i->key_cache_already_flushed = true;
+ i->flags |= BTREE_TRIGGER_NORUN;
+
+ btree_path_set_should_be_locked(btree_path);
+ ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip);
+out:
+ bch2_path_put(trans, btree_path, true);
+ return ret;
+}
+
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
+ struct bkey_i *k, enum btree_update_flags flags,
+ unsigned long ip)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i, n;
+ u64 seq = 0;
+ int cmp;
+
+ EBUG_ON(!path->should_be_locked);
+ EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
+ EBUG_ON(!bpos_eq(k->k.p, path->pos));
+
+ /*
+ * The transaction journal res hasn't been allocated at this point.
+ * That occurs at commit time. Reuse the seq field to pass in the seq
+ * of a prejournaled key.
+ */
+ if (flags & BTREE_UPDATE_PREJOURNAL)
+ seq = trans->journal_res.seq;
+
+ n = (struct btree_insert_entry) {
+ .flags = flags,
+ .bkey_type = __btree_node_type(path->level, path->btree_id),
+ .btree_id = path->btree_id,
+ .level = path->level,
+ .cached = path->cached,
+ .path = path,
+ .k = k,
+ .seq = seq,
+ .ip_allocated = ip,
+ };
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ trans_for_each_update(trans, i)
+ BUG_ON(i != trans->updates &&
+ btree_insert_entry_cmp(i - 1, i) >= 0);
+#endif
+
+ /*
+ * Pending updates are kept sorted: first, find position of new update,
+ * then delete/trim any updates the new update overwrites:
+ */
+ trans_for_each_update(trans, i) {
+ cmp = btree_insert_entry_cmp(&n, i);
+ if (cmp <= 0)
+ break;
+ }
+
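+	/*
+	 * If we already have an update for this key, replace it in place;
+	 * otherwise insert a new entry, keeping the list sorted:
+	 */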
+ if (!cmp && i < trans->updates + trans->nr_updates) {
+ EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
+
+ bch2_path_put(trans, i->path, true);
+ i->flags = n.flags;
+ i->cached = n.cached;
+ i->k = n.k;
+ i->path = n.path;
+ i->seq = n.seq;
+ i->ip_allocated = n.ip_allocated;
+ } else {
+ array_insert_item(trans->updates, trans->nr_updates,
+ i - trans->updates, n);
+
+ i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v;
+ i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
+
+ if (unlikely(trans->journal_replay_not_finished)) {
+ struct bkey_i *j_k =
+ bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
+
+ if (j_k) {
+ i->old_k = j_k->k;
+ i->old_v = &j_k->v;
+ }
+ }
+ }
+
+ __btree_path_get(i->path, true);
+
+ /*
+ * If a key is present in the key cache, it must also exist in the
+ * btree - this is necessary for cache coherency. When iterating over
+ * a btree that's cached in the key cache, the btree iter code checks
+ * the key cache - but the key has to exist in the btree for that to
+ * work:
+ */
+ if (path->cached && bkey_deleted(&i->old_k))
+ return flush_new_cached_update(trans, path, i, flags, ip);
+
+ return 0;
+}
+
+static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct btree_path *path)
+{
+ if (!iter->key_cache_path ||
+ !iter->key_cache_path->should_be_locked ||
+ !bpos_eq(iter->key_cache_path->pos, iter->pos)) {
+ struct bkey_cached *ck;
+ int ret;
+
+ if (!iter->key_cache_path)
+ iter->key_cache_path =
+ bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_CACHED, _THIS_IP_);
+
+ iter->key_cache_path =
+ bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
+ iter->flags & BTREE_ITER_INTENT,
+ _THIS_IP_);
+
+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
+ BTREE_ITER_CACHED);
+ if (unlikely(ret))
+ return ret;
+
+ ck = (void *) iter->key_cache_path->l[0].b;
+
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
+ }
+
+ btree_path_set_should_be_locked(iter->key_cache_path);
+ }
+
+ return 0;
+}
+
+int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_i *k, enum btree_update_flags flags)
+{
+ struct btree_path *path = iter->update_path ?: iter->path;
+ int ret;
+
+ if (iter->flags & BTREE_ITER_IS_EXTENTS)
+ return bch2_trans_update_extent(trans, iter, k, flags);
+
+ if (bkey_deleted(&k->k) &&
+ !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+ (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
+ ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (ret)
+ k->k.type = KEY_TYPE_whiteout;
+ }
+
+ /*
+ * Ensure that updates to cached btrees go to the key cache:
+ */
+ if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+ !path->cached &&
+ !path->level &&
+ btree_id_cached(trans->c, path->btree_id)) {
+ ret = bch2_trans_update_get_key_cache(trans, iter, path);
+ if (ret)
+ return ret;
+
+ path = iter->key_cache_path;
+ }
+
+ return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_);
+}
+
+/*
+ * Add a transaction update for a key that has already been journaled.
+ */
+int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq,
+ struct btree_iter *iter, struct bkey_i *k,
+ enum btree_update_flags flags)
+{
+ trans->journal_res.seq = seq;
+ return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL|
+ BTREE_UPDATE_PREJOURNAL);
+}
+
+static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bkey_i *k)
+{
+ struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k));
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bkey_copy(n, k);
+ return bch2_btree_insert_trans(trans, btree, n, 0);
+}
+
+int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bkey_i *k)
+{
+ struct btree_write_buffered_key *i;
+ int ret;
+
+ EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size);
+ EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+
+ if (unlikely(trans->journal_replay_not_finished))
+ return bch2_btree_insert_clone_trans(trans, btree, k);
+
+ trans_for_each_wb_update(trans, i) {
+ if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) {
+ bkey_copy(&i->k, k);
+ return 0;
+ }
+ }
+
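+	/* Allocate the write buffer update array, doubling its size when full: */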
+ if (!trans->wb_updates ||
+ trans->nr_wb_updates == trans->wb_updates_size) {
+ struct btree_write_buffered_key *u;
+
+ if (trans->nr_wb_updates == trans->wb_updates_size) {
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+
+ BUG_ON(trans->wb_updates_size > U8_MAX / 2);
+ trans->wb_updates_size = max(1, trans->wb_updates_size * 2);
+ if (s)
+ s->wb_updates_size = trans->wb_updates_size;
+ }
+
+ u = bch2_trans_kmalloc_nomemzero(trans,
+ trans->wb_updates_size *
+ sizeof(struct btree_write_buffered_key));
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
+
+ if (trans->nr_wb_updates)
+ memcpy(u, trans->wb_updates, trans->nr_wb_updates *
+ sizeof(struct btree_write_buffered_key));
+ trans->wb_updates = u;
+ }
+
+ trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) {
+ .btree = btree,
+ };
+
+ bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k);
+ trans->nr_wb_updates++;
+
+ return 0;
+}
+
+int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
+ enum btree_id btree, struct bpos end)
+{
+ struct bkey_s_c k;
+ int ret = 0;
+
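+	/*
+	 * Find the last key in @btree, then advance to the empty slot
+	 * immediately after it:
+	 */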
+ bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT);
+ k = bch2_btree_iter_prev(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_advance(iter);
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ BUG_ON(k.k->type != KEY_TYPE_deleted);
+
+ if (bkey_gt(k.k->p, end)) {
+ ret = -BCH_ERR_ENOSPC_btree_slot;
+ goto err;
+ }
+
+ return 0;
+err:
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
+}
+
+void bch2_trans_commit_hook(struct btree_trans *trans,
+ struct btree_trans_commit_hook *h)
+{
+ h->next = trans->hooks;
+ trans->hooks = h;
+}
+
+int bch2_btree_insert_nonextent(struct btree_trans *trans,
+ enum btree_id btree, struct bkey_i *k,
+ enum btree_update_flags flags)
+{
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, btree, k->k.p,
+ BTREE_ITER_CACHED|
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, k, flags);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
+ struct bkey_i *k, enum btree_update_flags flags)
+{
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, k, flags);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/**
+ * bch2_btree_insert - insert a key into the given btree
+ * @c: pointer to struct bch_fs
+ * @id: btree to insert into
+ * @k: key to insert
+ * @disk_res: must be non-NULL whenever inserting or potentially
+ * splitting data extents
+ * @flags: transaction commit flags
+ *
+ * Returns: 0 on success, error code on failure
+ */
+int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
+ struct disk_reservation *disk_res, int flags)
+{
+ return bch2_trans_do(c, disk_res, NULL, flags,
+ bch2_btree_insert_trans(trans, id, k, 0));
+}
+
+int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter,
+ unsigned len, unsigned update_flags)
+{
+ struct bkey_i *k;
+
+ k = bch2_trans_kmalloc(trans, sizeof(*k));
+ if (IS_ERR(k))
+ return PTR_ERR(k);
+
+ bkey_init(&k->k);
+ k->k.p = iter->pos;
+ bch2_key_resize(&k->k, len);
+ return bch2_trans_update(trans, iter, k, update_flags);
+}
+
+int bch2_btree_delete_at(struct btree_trans *trans,
+ struct btree_iter *iter, unsigned update_flags)
+{
+ return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
+}
+
+int bch2_btree_delete_at_buffered(struct btree_trans *trans,
+ enum btree_id btree, struct bpos pos)
+{
+ struct bkey_i *k;
+
+ k = bch2_trans_kmalloc(trans, sizeof(*k));
+ if (IS_ERR(k))
+ return PTR_ERR(k);
+
+ bkey_init(&k->k);
+ k->k.p = pos;
+ return bch2_trans_update_buffered(trans, btree, k);
+}
+
+int bch2_btree_delete(struct btree_trans *trans,
+ enum btree_id btree, struct bpos pos,
+ unsigned update_flags)
+{
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, btree, pos,
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(trans, &iter, update_flags);
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
+ struct bpos start, struct bpos end,
+ unsigned update_flags,
+ u64 *journal_seq)
+{
+ u32 restart_count = trans->restart_count;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
+ while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) {
+ struct disk_reservation disk_res =
+ bch2_disk_reservation_init(trans->c, 0);
+ struct bkey_i delete;
+
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ bkey_init(&delete.k);
+
+ /*
+ * This could probably be more efficient for extents:
+ */
+
+ /*
+ * For extents, iter.pos won't necessarily be the same as
+ * bkey_start_pos(k.k) (for non extents they always will be the
+ * same). It's important that we delete starting from iter.pos
+ * because the range we want to delete could start in the middle
+ * of k.
+ *
+ * (bch2_btree_iter_peek() does guarantee that iter.pos >=
+ * bkey_start_pos(k.k)).
+ */
+ delete.k.p = iter.pos;
+
+ if (iter.flags & BTREE_ITER_IS_EXTENTS)
+ bch2_key_resize(&delete.k,
+ bpos_min(end, k.k->p).offset -
+ iter.pos.offset);
+
+ ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
+ bch2_trans_commit(trans, &disk_res, journal_seq,
+ BTREE_INSERT_NOFAIL);
+ bch2_disk_reservation_put(trans->c, &disk_res);
+err:
+ /*
+ * the bch2_trans_begin() call is in a weird place because we
+ * need to call it after every transaction commit, to avoid path
+ * overflow, but don't want to call it if the delete operation
+ * is a no-op and we have no work to do:
+ */
+ bch2_trans_begin(trans);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret ?: trans_was_restarted(trans, restart_count);
+}
+
+/*
+ * bch2_btree_delete_range - delete everything within a given range
+ *
+ * Range is a half open interval - [start, end)
+ */
+int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
+ struct bpos start, struct bpos end,
+ unsigned update_flags,
+ u64 *journal_seq)
+{
+ int ret = bch2_trans_run(c,
+ bch2_btree_delete_range_trans(trans, id, start, end,
+ update_flags, journal_seq));
+ if (ret == -BCH_ERR_transaction_restart_nested)
+ ret = 0;
+ return ret;
+}
+
+int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
+ struct bpos pos, bool set)
+{
+ struct bkey_i *k;
+ int ret = 0;
+
+ k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
+ ret = PTR_ERR_OR_ZERO(k);
+ if (unlikely(ret))
+ return ret;
+
+ bkey_init(&k->k);
+ k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ k->k.p = pos;
+
+ return bch2_trans_update_buffered(trans, btree, k);
+}
+
+__printf(2, 0)
+static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args)
+{
+ struct printbuf buf = PRINTBUF;
+ struct jset_entry_log *l;
+ unsigned u64s;
+ int ret;
+
+ prt_vprintf(&buf, fmt, args);
+ ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
+ if (ret)
+ goto err;
+
+ u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
+
+ ret = darray_make_room(entries, jset_u64s(u64s));
+ if (ret)
+ goto err;
+
+ l = (void *) &darray_top(*entries);
+ l->entry.u64s = cpu_to_le16(u64s);
+ l->entry.btree_id = 0;
+ l->entry.level = 1;
+ l->entry.type = BCH_JSET_ENTRY_log;
+ l->entry.pad[0] = 0;
+ l->entry.pad[1] = 0;
+ l->entry.pad[2] = 0;
+ memcpy(l->d, buf.buf, buf.pos);
+ while (buf.pos & 7)
+ l->d[buf.pos++] = '\0';
+
+ entries->nr += jset_u64s(u64s);
+err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+__printf(3, 0)
+static int
+__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
+ va_list args)
+{
+ int ret;
+
+ if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) {
+ ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
+ } else {
+ ret = bch2_trans_do(c, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|commit_flags,
+ __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args));
+ }
+
+ return ret;
+}
+
+__printf(2, 3)
+int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
+{
+ va_list args;
+ int ret;
+
+ va_start(args, fmt);
+ ret = __bch2_fs_log_msg(c, 0, fmt, args);
+ va_end(args);
+ return ret;
+}
+
+/*
+ * Use for logging messages during recovery to enable reserved space and avoid
+ * blocking.
+ */
+__printf(2, 3)
+int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
+{
+ va_list args;
+ int ret;
+
+ va_start(args, fmt);
+ ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args);
+ va_end(args);
+ return ret;
+}
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
new file mode 100644
index 000000000000..9816d2286540
--- /dev/null
+++ b/fs/bcachefs/btree_update.h
@@ -0,0 +1,340 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_UPDATE_H
+#define _BCACHEFS_BTREE_UPDATE_H
+
+#include "btree_iter.h"
+#include "journal.h"
+
+struct bch_fs;
+struct btree;
+
+void bch2_btree_node_prep_for_write(struct btree_trans *,
+ struct btree_path *, struct btree *);
+bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
+ struct btree *, struct btree_node_iter *,
+ struct bkey_i *);
+
+int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64);
+int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64);
+void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
+
+void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
+ struct bkey_i *, u64);
+
+enum btree_insert_flags {
+ /* First bits for bch_watermark: */
+ __BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS,
+ __BTREE_INSERT_NOCHECK_RW,
+ __BTREE_INSERT_LAZY_RW,
+ __BTREE_INSERT_JOURNAL_REPLAY,
+ __BTREE_INSERT_JOURNAL_RECLAIM,
+ __BTREE_INSERT_NOWAIT,
+ __BTREE_INSERT_GC_LOCK_HELD,
+ __BCH_HASH_SET_MUST_CREATE,
+ __BCH_HASH_SET_MUST_REPLACE,
+};
+
+/* Don't check for -ENOSPC: */
+#define BTREE_INSERT_NOFAIL BIT(__BTREE_INSERT_NOFAIL)
+
+#define BTREE_INSERT_NOCHECK_RW BIT(__BTREE_INSERT_NOCHECK_RW)
+#define BTREE_INSERT_LAZY_RW BIT(__BTREE_INSERT_LAZY_RW)
+
+/* Insert is for journal replay - don't get journal reservations: */
+#define BTREE_INSERT_JOURNAL_REPLAY BIT(__BTREE_INSERT_JOURNAL_REPLAY)
+
+/* Insert is being called from journal reclaim path: */
+#define BTREE_INSERT_JOURNAL_RECLAIM BIT(__BTREE_INSERT_JOURNAL_RECLAIM)
+
+/* Don't block on allocation failure (for new btree nodes): */
+#define BTREE_INSERT_NOWAIT BIT(__BTREE_INSERT_NOWAIT)
+#define BTREE_INSERT_GC_LOCK_HELD BIT(__BTREE_INSERT_GC_LOCK_HELD)
+
+#define BCH_HASH_SET_MUST_CREATE BIT(__BCH_HASH_SET_MUST_CREATE)
+#define BCH_HASH_SET_MUST_REPLACE BIT(__BCH_HASH_SET_MUST_REPLACE)
+
+int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
+ unsigned, unsigned);
+int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
+int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos);
+int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);
+
+int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
+ struct bkey_i *, enum btree_update_flags);
+
+int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *,
+ enum btree_update_flags);
+int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
+ struct disk_reservation *, int flags);
+
+int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
+ struct bpos, struct bpos, unsigned, u64 *);
+int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
+ struct bpos, struct bpos, unsigned, u64 *);
+
+int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool);
+
+int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
+ struct bpos, struct bpos);
+
+/*
+ * For use when splitting extents in existing snapshots:
+ *
+ * If @old_pos is an interior snapshot node, iterate over descendent snapshot
+ * nodes: for every descendent snapshot in which @old_pos is overwritten and
+ * not visible, emit a whiteout at @new_pos.
+ */
+static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bpos old_pos,
+ struct bpos new_pos)
+{
+ if (!btree_type_has_snapshots(btree) ||
+ bkey_eq(old_pos, new_pos))
+ return 0;
+
+ return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos);
+}
+
+int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *,
+ enum btree_update_flags,
+ struct bkey_s_c, struct bkey_s_c);
+
+int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *,
+ enum btree_id, struct bpos);
+
+int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, enum btree_update_flags);
+int __must_check bch2_trans_update_seq(struct btree_trans *, u64, struct btree_iter *,
+ struct bkey_i *, enum btree_update_flags);
+int __must_check bch2_trans_update_buffered(struct btree_trans *,
+ enum btree_id, struct bkey_i *);
+
+void bch2_trans_commit_hook(struct btree_trans *,
+ struct btree_trans_commit_hook *);
+int __bch2_trans_commit(struct btree_trans *, unsigned);
+
+__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
+__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
+
+/**
+ * bch2_trans_commit - insert keys at given iterator positions
+ *
+ * This is the main entry point for btree updates.
+ *
+ * Return values:
+ * -EROFS: filesystem read only
+ * -EIO: journal or btree node IO error
+ */
+static inline int bch2_trans_commit(struct btree_trans *trans,
+ struct disk_reservation *disk_res,
+ u64 *journal_seq,
+ unsigned flags)
+{
+ trans->disk_res = disk_res;
+ trans->journal_seq = journal_seq;
+
+ return __bch2_trans_commit(trans, flags);
+}
+
+#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \
+ lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_flags)))
+
+#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \
+ nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
+ (_journal_seq), (_flags)))
+
+#define bch2_trans_run(_c, _do) \
+({ \
+ struct btree_trans *trans = bch2_trans_get(_c); \
+ int _ret = (_do); \
+ bch2_trans_put(trans); \
+ _ret; \
+})
+
+#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \
+ bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
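+
+/*
+ * Illustrative usage - this mirrors how bch2_btree_insert() is implemented:
+ *
+ *	bch2_trans_do(c, disk_res, NULL, flags,
+ *		      bch2_btree_insert_trans(trans, id, k, 0));
+ */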
+
+#define trans_for_each_update(_trans, _i) \
+ for ((_i) = (_trans)->updates; \
+ (_i) < (_trans)->updates + (_trans)->nr_updates; \
+ (_i)++)
+
+#define trans_for_each_wb_update(_trans, _i) \
+ for ((_i) = (_trans)->wb_updates; \
+ (_i) < (_trans)->wb_updates + (_trans)->nr_wb_updates; \
+ (_i)++)
+
+static inline void bch2_trans_reset_updates(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i;
+
+ trans_for_each_update(trans, i)
+ bch2_path_put(trans, i->path, true);
+
+ trans->extra_journal_res = 0;
+ trans->nr_updates = 0;
+ trans->nr_wb_updates = 0;
+ trans->wb_updates = NULL;
+ trans->hooks = NULL;
+ trans->extra_journal_entries.nr = 0;
+
+ if (trans->fs_usage_deltas) {
+ trans->fs_usage_deltas->used = 0;
+ memset((void *) trans->fs_usage_deltas +
+ offsetof(struct replicas_delta_list, memset_start), 0,
+ (void *) &trans->fs_usage_deltas->memset_end -
+ (void *) &trans->fs_usage_deltas->memset_start);
+ }
+}
+
+static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k,
+ unsigned type, unsigned min_bytes)
+{
+ unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k));
+ struct bkey_i *mut;
+
+ if (type && k.k->type != type)
+ return ERR_PTR(-ENOENT);
+
+ mut = bch2_trans_kmalloc_nomemzero(trans, bytes);
+ if (!IS_ERR(mut)) {
+ bkey_reassemble(mut, k);
+
+ if (unlikely(bytes > bkey_bytes(k.k))) {
+ memset((void *) mut + bkey_bytes(k.k), 0,
+ bytes - bkey_bytes(k.k));
+ mut->k.u64s = DIV_ROUND_UP(bytes, sizeof(u64));
+ }
+ }
+ return mut;
+}
+
+static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k)
+{
+ return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0);
+}
+
+#define bch2_bkey_make_mut_noupdate_typed(_trans, _k, _type) \
+ bkey_i_to_##_type(__bch2_bkey_make_mut_noupdate(_trans, _k, \
+ KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
+
+static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c *k, unsigned flags,
+ unsigned type, unsigned min_bytes)
+{
+ struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes);
+ int ret;
+
+ if (IS_ERR(mut))
+ return mut;
+
+ ret = bch2_trans_update(trans, iter, mut, flags);
+ if (ret)
+ return ERR_PTR(ret);
+
+ *k = bkey_i_to_s_c(mut);
+ return mut;
+}
+
+static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c *k, unsigned flags)
+{
+ return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0);
+}
+
+#define bch2_bkey_make_mut_typed(_trans, _iter, _k, _flags, _type) \
+ bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _iter, _k, _flags,\
+ KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
+
+static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags, unsigned type, unsigned min_bytes)
+{
+ struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter,
+ btree_id, pos, flags|BTREE_ITER_INTENT, type);
+ struct bkey_i *ret = IS_ERR(k.k)
+ ? ERR_CAST(k.k)
+ : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes);
+ if (IS_ERR(ret))
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
+}
+
+static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags)
+{
+ return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0);
+}
+
+static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags, unsigned type, unsigned min_bytes)
+{
+ struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter,
+ btree_id, pos, flags|BTREE_ITER_INTENT, type, min_bytes);
+ int ret;
+
+ if (IS_ERR(mut))
+ return mut;
+
+ ret = bch2_trans_update(trans, iter, mut, flags);
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
+ return ERR_PTR(ret);
+ }
+
+ return mut;
+}
+
+static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags, unsigned min_bytes)
+{
+ return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes);
+}
+
+static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags)
+{
+ return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0);
+}
+
+#define bch2_bkey_get_mut_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\
+ bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _iter, \
+ _btree_id, _pos, _flags, \
+ KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))
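+
+/*
+ * Illustrative (hypothetical) use of the typed variant - get a mutable copy
+ * of an alloc_v4 key and add it as a transaction update:
+ *
+ *	struct bkey_i_alloc_v4 *a =
+ *		bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_alloc, pos,
+ *					0, alloc_v4);
+ *	ret = PTR_ERR_OR_ZERO(a);
+ */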
+
+static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter,
+ unsigned flags, unsigned type, unsigned val_size)
+{
+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size);
+ int ret;
+
+ if (IS_ERR(k))
+ return k;
+
+ bkey_init(&k->k);
+ k->k.p = iter->pos;
+ k->k.type = type;
+ set_bkey_val_bytes(&k->k, val_size);
+
+ ret = bch2_trans_update(trans, iter, k, flags);
+ if (unlikely(ret))
+ return ERR_PTR(ret);
+ return k;
+}
+
+#define bch2_bkey_alloc(_trans, _iter, _flags, _type) \
+ bkey_i_to_##_type(__bch2_bkey_alloc(_trans, _iter, _flags, \
+ KEY_TYPE_##_type, sizeof(struct bch_##_type)))
+
+#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
new file mode 100644
index 000000000000..239fcc3c7c99
--- /dev/null
+++ b/fs/bcachefs/btree_update_interior.c
@@ -0,0 +1,2476 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_gc.h"
+#include "btree_journal_iter.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "buckets.h"
+#include "clock.h"
+#include "error.h"
+#include "extents.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "keylist.h"
+#include "replicas.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/random.h>
+
+static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
+ struct btree_path *, struct btree *,
+ struct keylist *, unsigned);
+static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
+
+static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans,
+ enum btree_id btree_id,
+ unsigned level,
+ struct bpos pos)
+{
+ struct btree_path *path;
+
+ path = bch2_path_get(trans, btree_id, pos, level + 1, level,
+ BTREE_ITER_NOPRESERVE|
+ BTREE_ITER_INTENT, _RET_IP_);
+ path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_);
+ bch2_btree_path_downgrade(trans, path);
+ __bch2_btree_path_unlock(trans, path);
+ return path;
+}
+
+/* Debug code: */
+
+/*
+ * Verify that child nodes correctly span parent node's range:
+ */
+static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct bpos next_node = b->data->min_key;
+ struct btree_node_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_btree_ptr_v2 bp;
+ struct bkey unpacked;
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+
+ BUG_ON(!b->c.level);
+
+ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+ return;
+
+ bch2_btree_node_iter_init_from_start(&iter, b);
+
+ while (1) {
+ k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked);
+ if (k.k->type != KEY_TYPE_btree_ptr_v2)
+ break;
+ bp = bkey_s_c_to_btree_ptr_v2(k);
+
+ if (!bpos_eq(next_node, bp.v->min_key)) {
+ bch2_dump_btree_node(c, b);
+ bch2_bpos_to_text(&buf1, next_node);
+ bch2_bpos_to_text(&buf2, bp.v->min_key);
+ panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf);
+ }
+
+ bch2_btree_node_iter_advance(&iter, b);
+
+ if (bch2_btree_node_iter_end(&iter)) {
+ if (!bpos_eq(k.k->p, b->key.k.p)) {
+ bch2_dump_btree_node(c, b);
+ bch2_bpos_to_text(&buf1, b->key.k.p);
+ bch2_bpos_to_text(&buf2, k.k->p);
+ panic("expected end %s got %s\n", buf1.buf, buf2.buf);
+ }
+ break;
+ }
+
+ next_node = bpos_successor(k.k->p);
+ }
+#endif
+}
+
+/* Calculate ideal packed bkey format for new btree nodes: */
+
+static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
+{
+ struct bkey_packed *k;
+ struct bset_tree *t;
+ struct bkey uk;
+
+ for_each_bset(b, t)
+ bset_tree_for_each_key(b, t, k)
+ if (!bkey_deleted(k)) {
+ uk = bkey_unpack_key(b, k);
+ bch2_bkey_format_add_key(s, &uk);
+ }
+}
+
+static struct bkey_format bch2_btree_calc_format(struct btree *b)
+{
+ struct bkey_format_state s;
+
+ bch2_bkey_format_init(&s);
+ bch2_bkey_format_add_pos(&s, b->data->min_key);
+ bch2_bkey_format_add_pos(&s, b->data->max_key);
+ __bch2_btree_calc_format(&s, b);
+
+ return bch2_bkey_format_done(&s);
+}
+
+static size_t btree_node_u64s_with_format(struct btree_nr_keys nr,
+ struct bkey_format *old_f,
+ struct bkey_format *new_f)
+{
+ /* stupid integer promotion rules */
+ ssize_t delta =
+ (((int) new_f->key_u64s - old_f->key_u64s) *
+ (int) nr.packed_keys) +
+ (((int) new_f->key_u64s - BKEY_U64s) *
+ (int) nr.unpacked_keys);
+
+ BUG_ON(delta + nr.live_u64s < 0);
+
+ return nr.live_u64s + delta;
+}
+
+/**
+ * bch2_btree_node_format_fits - check if we could rewrite node with a new format
+ *
+ * @c: filesystem handle
+ * @b: btree node to rewrite
+ * @nr: number of keys for new node (i.e. b->nr)
+ * @new_f: bkey format to translate keys to
+ *
+ * Returns: true if all re-packed keys will be able to fit in a new node.
+ *
+ * Assumes all keys will successfully pack with the new format.
+ */
+static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
+ struct btree_nr_keys nr,
+ struct bkey_format *new_f)
+{
+ size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);
+
+ return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
+}
+
+/* Btree node freeing/allocation: */
+
+static void __btree_node_free(struct bch_fs *c, struct btree *b)
+{
+ trace_and_count(c, btree_node_free, c, b);
+
+ BUG_ON(btree_node_write_blocked(b));
+ BUG_ON(btree_node_dirty(b));
+ BUG_ON(btree_node_need_write(b));
+ BUG_ON(b == btree_node_root(c, b));
+ BUG_ON(b->ob.nr);
+ BUG_ON(!list_empty(&b->write_blocked));
+ BUG_ON(b->will_make_reachable);
+
+ clear_btree_node_noevict(b);
+
+ mutex_lock(&c->btree_cache.lock);
+ list_move(&b->list, &c->btree_cache.freeable);
+ mutex_unlock(&c->btree_cache.lock);
+}
+
+static void bch2_btree_node_free_inmem(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
+{
+ struct bch_fs *c = trans->c;
+ unsigned level = b->c.level;
+
+ bch2_btree_node_lock_write_nofail(trans, path, &b->c);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+ __btree_node_free(c, b);
+ six_unlock_write(&b->c.lock);
+ mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
+
+ trans_for_each_path(trans, path)
+ if (path->l[level].b == b) {
+ btree_node_unlock(trans, path, level);
+ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+ }
+}
+
+static void bch2_btree_node_free_never_used(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree *b)
+{
+ struct bch_fs *c = as->c;
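+ /* b->c.lock.readers is only set for interior nodes - pick the matching prealloc list: */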
+ struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
+ struct btree_path *path;
+ unsigned level = b->c.level;
+
+ BUG_ON(!list_empty(&b->write_blocked));
+ BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
+
+ b->will_make_reachable = 0;
+ closure_put(&as->cl);
+
+ clear_btree_node_will_make_reachable(b);
+ clear_btree_node_accessed(b);
+ clear_btree_node_dirty_acct(c, b);
+ clear_btree_node_need_write(b);
+
+ mutex_lock(&c->btree_cache.lock);
+ list_del_init(&b->list);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+ mutex_unlock(&c->btree_cache.lock);
+
+ BUG_ON(p->nr >= ARRAY_SIZE(p->b));
+ p->b[p->nr++] = b;
+
+ six_unlock_intent(&b->c.lock);
+
+ trans_for_each_path(trans, path)
+ if (path->l[level].b == b) {
+ btree_node_unlock(trans, path, level);
+ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+ }
+}
+
+static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
+ struct disk_reservation *res,
+ struct closure *cl,
+ bool interior_node,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct write_point *wp;
+ struct btree *b;
+ BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+ struct open_buckets obs = { .nr = 0 };
+ struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
+ enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
+ unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim
+ ? BTREE_NODE_RESERVE
+ : 0;
+ int ret;
+
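+ /* First, try to reuse an allocation from the btree node reserve cache: */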
+ mutex_lock(&c->btree_reserve_cache_lock);
+ if (c->btree_reserve_cache_nr > nr_reserve) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
+
+ obs = a->ob;
+ bkey_copy(&tmp.k, &a->k);
+ mutex_unlock(&c->btree_reserve_cache_lock);
+ goto mem_alloc;
+ }
+ mutex_unlock(&c->btree_reserve_cache_lock);
+
+retry:
+ ret = bch2_alloc_sectors_start_trans(trans,
+ c->opts.metadata_target ?:
+ c->opts.foreground_target,
+ 0,
+ writepoint_ptr(&c->btree_write_point),
+ &devs_have,
+ res->nr_replicas,
+ c->opts.metadata_replicas_required,
+ watermark, 0, cl, &wp);
+ if (unlikely(ret))
+ return ERR_PTR(ret);
+
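+ /*
+ * The write point doesn't have a full btree node's worth of sectors
+ * left: mark its too-small open buckets as exhausted and retry:
+ */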
+ if (wp->sectors_free < btree_sectors(c)) {
+ struct open_bucket *ob;
+ unsigned i;
+
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ if (ob->sectors_free < btree_sectors(c))
+ ob->sectors_free = 0;
+
+ bch2_alloc_sectors_done(c, wp);
+ goto retry;
+ }
+
+ bkey_btree_ptr_v2_init(&tmp.k);
+ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);
+
+ bch2_open_bucket_get(c, wp, &obs);
+ bch2_alloc_sectors_done(c, wp);
+mem_alloc:
+ b = bch2_btree_node_mem_alloc(trans, interior_node);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+
+ /* we hold cannibalize_lock: */
+ BUG_ON(IS_ERR(b));
+ BUG_ON(b->ob.nr);
+
+ bkey_copy(&b->key, &tmp.k);
+ b->ob = obs;
+
+ return b;
+}
+
+static struct btree *bch2_btree_node_alloc(struct btree_update *as,
+ struct btree_trans *trans,
+ unsigned level)
+{
+ struct bch_fs *c = as->c;
+ struct btree *b;
+ struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
+ int ret;
+
+ BUG_ON(level >= BTREE_MAX_DEPTH);
+ BUG_ON(!p->nr);
+
+ b = p->b[--p->nr];
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+
+ set_btree_node_accessed(b);
+ set_btree_node_dirty_acct(c, b);
+ set_btree_node_need_write(b);
+
+ bch2_bset_init_first(b, &b->data->keys);
+ b->c.level = level;
+ b->c.btree_id = as->btree_id;
+ b->version_ondisk = c->sb.version;
+
+ memset(&b->nr, 0, sizeof(b->nr));
+ b->data->magic = cpu_to_le64(bset_magic(c));
+ memset(&b->data->_ptr, 0, sizeof(b->data->_ptr));
+ b->data->flags = 0;
+ SET_BTREE_NODE_ID(b->data, as->btree_id);
+ SET_BTREE_NODE_LEVEL(b->data, level);
+
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
+
+ bp->v.mem_ptr = 0;
+ bp->v.seq = b->data->keys.seq;
+ bp->v.sectors_written = 0;
+ }
+
+ SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
+
+ bch2_btree_build_aux_trees(b);
+
+ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
+ BUG_ON(ret);
+
+ trace_and_count(c, btree_node_alloc, c, b);
+ bch2_increment_clock(c, btree_sectors(c), WRITE);
+ return b;
+}
+
+static void btree_set_min(struct btree *b, struct bpos pos)
+{
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
+ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
+ b->data->min_key = pos;
+}
+
+static void btree_set_max(struct btree *b, struct bpos pos)
+{
+ b->key.k.p = pos;
+ b->data->max_key = pos;
+}
+
+static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree *b)
+{
+ struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level);
+ struct bkey_format format = bch2_btree_calc_format(b);
+
+ /*
+ * The keys might expand with the new format - if they wouldn't fit in
+ * the btree node anymore, use the old format for now:
+ */
+ if (!bch2_btree_node_format_fits(as->c, b, b->nr, &format))
+ format = b->format;
+
+ SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
+
+ btree_set_min(n, b->data->min_key);
+ btree_set_max(n, b->data->max_key);
+
+ n->data->format = format;
+ btree_node_set_format(n, format);
+
+ bch2_btree_sort_into(as->c, n, b);
+
+ btree_node_reset_sib_u64s(n);
+ return n;
+}
+
+static struct btree *__btree_root_alloc(struct btree_update *as,
+ struct btree_trans *trans, unsigned level)
+{
+ struct btree *b = bch2_btree_node_alloc(as, trans, level);
+
+ btree_set_min(b, POS_MIN);
+ btree_set_max(b, SPOS_MAX);
+ b->data->format = bch2_btree_calc_format(b);
+
+ btree_node_set_format(b, b->data->format);
+ bch2_btree_build_aux_trees(b);
+
+ return b;
+}
+
+static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans)
+{
+ struct bch_fs *c = as->c;
+ struct prealloc_nodes *p;
+
+ for (p = as->prealloc_nodes;
+ p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
+ p++) {
+ while (p->nr) {
+ struct btree *b = p->b[--p->nr];
+
+ mutex_lock(&c->btree_reserve_cache_lock);
+
+ if (c->btree_reserve_cache_nr <
+ ARRAY_SIZE(c->btree_reserve_cache)) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
+
+ a->ob = b->ob;
+ b->ob.nr = 0;
+ bkey_copy(&a->k, &b->key);
+ } else {
+ bch2_open_buckets_put(c, &b->ob);
+ }
+
+ mutex_unlock(&c->btree_reserve_cache_lock);
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+ __btree_node_free(c, b);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ }
+ }
+}
+
+static int bch2_btree_reserve_get(struct btree_trans *trans,
+ struct btree_update *as,
+ unsigned nr_nodes[2],
+ unsigned flags,
+ struct closure *cl)
+{
+ struct bch_fs *c = as->c;
+ struct btree *b;
+ unsigned interior;
+ int ret = 0;
+
+ BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);
+
+ /*
+ * Protects reaping from the btree node cache and using the btree node
+ * open bucket reserve:
+ *
+ * BTREE_INSERT_NOWAIT only applies to btree node allocation, not
+ * blocking on this lock:
+ */
+ ret = bch2_btree_cache_cannibalize_lock(c, cl);
+ if (ret)
+ return ret;
+
+ for (interior = 0; interior < 2; interior++) {
+ struct prealloc_nodes *p = as->prealloc_nodes + interior;
+
+ while (p->nr < nr_nodes[interior]) {
+ b = __bch2_btree_node_alloc(trans, &as->disk_res,
+ flags & BTREE_INSERT_NOWAIT ? NULL : cl,
+ interior, flags);
+ if (IS_ERR(b)) {
+ ret = PTR_ERR(b);
+ goto err;
+ }
+
+ p->b[p->nr++] = b;
+ }
+ }
+err:
+ bch2_btree_cache_cannibalize_unlock(c);
+ return ret;
+}
+
+/* Asynchronous interior node update machinery */
+
+static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans)
+{
+ struct bch_fs *c = as->c;
+
+ if (as->took_gc_lock)
+ up_read(&c->gc_lock);
+ as->took_gc_lock = false;
+
+ bch2_journal_pin_drop(&c->journal, &as->journal);
+ bch2_journal_pin_flush(&c->journal, &as->journal);
+ bch2_disk_reservation_put(c, &as->disk_res);
+ bch2_btree_reserve_put(as, trans);
+
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
+ as->start_time);
+
+ mutex_lock(&c->btree_interior_update_lock);
+ list_del(&as->unwritten_list);
+ list_del(&as->list);
+
+ closure_debug_destroy(&as->cl);
+ mempool_free(as, &c->btree_interior_update_pool);
+
+ /*
+ * Have to do the wakeup with btree_interior_update_lock still held,
+ * since being on btree_interior_update_list is our ref on @c:
+ */
+ closure_wake_up(&c->btree_interior_update_wait);
+
+ mutex_unlock(&c->btree_interior_update_lock);
+}
+
+static void btree_update_add_key(struct btree_update *as,
+ struct keylist *keys, struct btree *b)
+{
+ struct bkey_i *k = &b->key;
+
+ BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s >
+ ARRAY_SIZE(as->_old_keys));
+
+ bkey_copy(keys->top, k);
+ bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1;
+
+ bch2_keylist_push(keys);
+}
+
+/*
+ * The transactional part of an interior btree node update, where we journal the
+ * update we did to the interior node and update alloc info:
+ */
+static int btree_update_nodes_written_trans(struct btree_trans *trans,
+ struct btree_update *as)
+{
+ struct bkey_i *k;
+ int ret;
+
+ ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s);
+ if (ret)
+ return ret;
+
+ memcpy(&darray_top(trans->extra_journal_entries),
+ as->journal_entries,
+ as->journal_u64s * sizeof(u64));
+ trans->extra_journal_entries.nr += as->journal_u64s;
+
+ trans->journal_pin = &as->journal;
+
+ for_each_keylist_key(&as->old_keys, k) {
+ unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
+
+ ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0);
+ if (ret)
+ return ret;
+ }
+
+ for_each_keylist_key(&as->new_keys, k) {
+ unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
+
+ ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static void btree_update_nodes_written(struct btree_update *as)
+{
+ struct bch_fs *c = as->c;
+ struct btree *b;
+ struct btree_trans *trans = bch2_trans_get(c);
+ u64 journal_seq = 0;
+ unsigned i;
+ int ret;
+
+ /*
+ * If we're already in an error state, it might be because a btree node
+ * was never written, and we might be trying to free that same btree
+ * node here, but it won't have been marked as allocated and we'll see
+ * spurious disk usage inconsistencies in the transactional part below
+ * if we don't skip it:
+ */
+ ret = bch2_journal_error(&c->journal);
+ if (ret)
+ goto err;
+
+ /*
+ * Wait for any in flight writes to finish before we free the old nodes
+ * on disk:
+ */
+ for (i = 0; i < as->nr_old_nodes; i++) {
+ __le64 seq;
+
+ b = as->old_nodes[i];
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+ seq = b->data ? b->data->keys.seq : 0;
+ six_unlock_read(&b->c.lock);
+
+ if (seq == as->old_nodes_seq[i])
+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
+ TASK_UNINTERRUPTIBLE);
+ }
+
+ /*
+ * We did an update to a parent node where the pointers we added pointed
+ * to child nodes that weren't written yet: now, the child nodes have
+ * been written so we can write out the update to the interior node.
+ */
+
+ /*
+ * We can't call into journal reclaim here: we'd block on the journal
+ * reclaim lock, but we may need to release the open buckets we have
+ * pinned in order for other btree updates to make forward progress, and
+ * journal reclaim does btree updates when flushing bkey_cached entries,
+ * which may require allocations as well.
+ */
+ ret = commit_do(trans, &as->disk_res, &journal_seq,
+ BCH_WATERMARK_reclaim|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_JOURNAL_RECLAIM,
+ btree_update_nodes_written_trans(trans, as));
+ bch2_trans_unlock(trans);
+
+ bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
+ "%s(): error %s", __func__, bch2_err_str(ret));
+err:
+ if (as->b) {
+ struct btree_path *path;
+
+ b = as->b;
+ path = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p);
+ /*
+ * @b is the node we did the final insert into:
+ *
+ * On failure to get a journal reservation, we still have to
+ * unblock the write and allow most of the write path to happen
+ * so that shutdown works, but the i->journal_seq mechanism
+ * won't work to prevent the btree write from being visible (we
+ * didn't get a journal sequence number) - instead
+ * __bch2_btree_node_write() doesn't do the actual write if
+ * we're in journal error state:
+ */
+
+ /*
+ * Ensure transaction is unlocked before using
+ * btree_node_lock_nopath() (the use of which is always suspect,
+ * we need to work on removing this in the future)
+ *
+ * It should be, but get_unlocked_mut_path() -> bch2_path_get()
+ * calls bch2_path_upgrade(), before we call path_make_mut(), so
+ * we may rarely end up with a locked path besides the one we
+ * have here:
+ */
+ bch2_trans_unlock(trans);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
+ path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
+ path->l[b->c.level].b = b;
+
+ bch2_btree_node_lock_write_nofail(trans, path, &b->c);
+
+ mutex_lock(&c->btree_interior_update_lock);
+
+ list_del(&as->write_blocked_list);
+ if (list_empty(&b->write_blocked))
+ clear_btree_node_write_blocked(b);
+
+ /*
+ * Node might have been freed, recheck under
+ * btree_interior_update_lock:
+ */
+ if (as->b == b) {
+ BUG_ON(!b->c.level);
+ BUG_ON(!btree_node_dirty(b));
+
+ if (!ret) {
+ struct bset *last = btree_bset_last(b);
+
+ last->journal_seq = cpu_to_le64(
+ max(journal_seq,
+ le64_to_cpu(last->journal_seq)));
+
+ bch2_btree_add_journal_pin(c, b, journal_seq);
+ } else {
+ /*
+ * If we didn't get a journal sequence number we
+ * can't write this btree node, because recovery
+ * won't know to ignore this write:
+ */
+ set_btree_node_never_write(b);
+ }
+ }
+
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
+ six_unlock_write(&b->c.lock);
+
+ btree_node_write_if_need(c, b, SIX_LOCK_intent);
+ btree_node_unlock(trans, path, b->c.level);
+ bch2_path_put(trans, path, true);
+ }
+
+ bch2_journal_pin_drop(&c->journal, &as->journal);
+
+ mutex_lock(&c->btree_interior_update_lock);
+ for (i = 0; i < as->nr_new_nodes; i++) {
+ b = as->new_nodes[i];
+
+ BUG_ON(b->will_make_reachable != (unsigned long) as);
+ b->will_make_reachable = 0;
+ clear_btree_node_will_make_reachable(b);
+ }
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ for (i = 0; i < as->nr_new_nodes; i++) {
+ b = as->new_nodes[i];
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+ btree_node_write_if_need(c, b, SIX_LOCK_read);
+ six_unlock_read(&b->c.lock);
+ }
+
+ for (i = 0; i < as->nr_open_buckets; i++)
+ bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
+
+ bch2_btree_update_free(as, trans);
+ bch2_trans_put(trans);
+}
+
+static void btree_interior_update_work(struct work_struct *work)
+{
+ struct bch_fs *c =
+ container_of(work, struct bch_fs, btree_interior_update_work);
+ struct btree_update *as;
+
+ while (1) {
+ mutex_lock(&c->btree_interior_update_lock);
+ as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
+ struct btree_update, unwritten_list);
+ if (as && !as->nodes_written)
+ as = NULL;
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ if (!as)
+ break;
+
+ btree_update_nodes_written(as);
+ }
+}
+
+static CLOSURE_CALLBACK(btree_update_set_nodes_written)
+{
+ closure_type(as, struct btree_update, cl);
+ struct bch_fs *c = as->c;
+
+ mutex_lock(&c->btree_interior_update_lock);
+ as->nodes_written = true;
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
+}
+
+/*
+ * We're updating @b with pointers to nodes that haven't finished writing yet:
+ * block @b from being written until @as completes
+ */
+static void btree_update_updated_node(struct btree_update *as, struct btree *b)
+{
+ struct bch_fs *c = as->c;
+
+ mutex_lock(&c->btree_interior_update_lock);
+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
+
+ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+ BUG_ON(!btree_node_dirty(b));
+ BUG_ON(!b->c.level);
+
+ as->mode = BTREE_INTERIOR_UPDATING_NODE;
+ as->b = b;
+
+ set_btree_node_write_blocked(b);
+ list_add(&as->write_blocked_list, &b->write_blocked);
+
+ mutex_unlock(&c->btree_interior_update_lock);
+}
+
+static void btree_update_reparent(struct btree_update *as,
+ struct btree_update *child)
+{
+ struct bch_fs *c = as->c;
+
+ lockdep_assert_held(&c->btree_interior_update_lock);
+
+ child->b = NULL;
+ child->mode = BTREE_INTERIOR_UPDATING_AS;
+
+ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
+}
+
+static void btree_update_updated_root(struct btree_update *as, struct btree *b)
+{
+ struct bkey_i *insert = &b->key;
+ struct bch_fs *c = as->c;
+
+ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+
+ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
+ ARRAY_SIZE(as->journal_entries));
+
+ as->journal_u64s +=
+ journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
+ BCH_JSET_ENTRY_btree_root,
+ b->c.btree_id, b->c.level,
+ insert, insert->k.u64s);
+
+ mutex_lock(&c->btree_interior_update_lock);
+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
+
+ as->mode = BTREE_INTERIOR_UPDATING_ROOT;
+ mutex_unlock(&c->btree_interior_update_lock);
+}
+
+/*
+ * bch2_btree_update_add_new_node:
+ *
+ * This causes @as to wait on @b to be written, before it gets to
+ * btree_update_nodes_written()
+ *
+ * Additionally, it sets b->will_make_reachable to prevent any additional writes
+ * to @b from happening besides the first until @b is reachable on disk
+ *
+ * And it adds @b to the list of @as's new nodes, so that we can update sector
+ * counts in btree_update_nodes_written():
+ */
+static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
+{
+ struct bch_fs *c = as->c;
+
+ closure_get(&as->cl);
+
+ mutex_lock(&c->btree_interior_update_lock);
+ BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
+ BUG_ON(b->will_make_reachable);
+
+ as->new_nodes[as->nr_new_nodes++] = b;
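+ /* Tagged pointer to @as - the low bit records that we hold a ref on as->cl: */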
+ b->will_make_reachable = 1UL|(unsigned long) as;
+ set_btree_node_will_make_reachable(b);
+
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ btree_update_add_key(as, &as->new_keys, b);
+
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data;
+ unsigned sectors = round_up(bytes, block_bytes(c)) >> 9;
+
+ bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
+ cpu_to_le16(sectors);
+ }
+}
+
+/*
+ * If @b was a new node, in the process of being made reachable by a
+ * btree_update, drop it from that update's list of new nodes:
+ */
+static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
+{
+ struct btree_update *as;
+ unsigned long v;
+ unsigned i;
+
+ mutex_lock(&c->btree_interior_update_lock);
+ /*
+ * When b->will_make_reachable != 0, it owns a ref on as->cl that's
+ * dropped when it gets written by bch2_btree_complete_write - the
+ * xchg() is for synchronization with bch2_btree_complete_write:
+ */
+ v = xchg(&b->will_make_reachable, 0);
+ clear_btree_node_will_make_reachable(b);
+ as = (struct btree_update *) (v & ~1UL);
+
+ if (!as) {
+ mutex_unlock(&c->btree_interior_update_lock);
+ return;
+ }
+
+ for (i = 0; i < as->nr_new_nodes; i++)
+ if (as->new_nodes[i] == b)
+ goto found;
+
+ BUG();
+found:
+ array_remove_item(as->new_nodes, as->nr_new_nodes, i);
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ if (v & 1)
+ closure_put(&as->cl);
+}
+
+static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
+{
+ while (b->ob.nr)
+ as->open_buckets[as->nr_open_buckets++] =
+ b->ob.v[--b->ob.nr];
+}
+
+/*
+ * @b is being split/rewritten: it may have pointers to not-yet-written btree
+ * nodes and thus outstanding btree_updates - redirect @b's
+ * btree_updates to point to this btree_update:
+ */
+static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
+ struct btree *b)
+{
+ struct bch_fs *c = as->c;
+ struct btree_update *p, *n;
+ struct btree_write *w;
+
+ set_btree_node_dying(b);
+
+ if (btree_node_fake(b))
+ return;
+
+ mutex_lock(&c->btree_interior_update_lock);
+
+ /*
+ * Does this node have any btree_update operations preventing
+ * it from being written?
+ *
+ * If so, redirect them to point to this btree_update: we can
+ * write out our new nodes, but we won't make them visible until those
+ * operations complete
+ */
+ list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
+ list_del_init(&p->write_blocked_list);
+ btree_update_reparent(as, p);
+
+ /*
+ * for flush_held_btree_writes() waiting on updates to flush or
+ * nodes to be writeable:
+ */
+ closure_wake_up(&c->btree_interior_update_wait);
+ }
+
+ clear_btree_node_dirty_acct(c, b);
+ clear_btree_node_need_write(b);
+ clear_btree_node_write_blocked(b);
+
+ /*
+ * Does this node have unwritten data that has a pin on the journal?
+ *
+ * If so, transfer that pin to the btree_update operation -
+ * note that if we're freeing multiple nodes, we only need to keep the
+ * oldest pin of any of the nodes we're freeing. We'll release the pin
+ * when the new nodes are persistent and reachable on disk:
+ */
+ w = btree_current_write(b);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
+ bch2_journal_pin_drop(&c->journal, &w->journal);
+
+ w = btree_prev_write(b);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
+ bch2_journal_pin_drop(&c->journal, &w->journal);
+
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ /*
+ * Is this a node that isn't reachable on disk yet?
+ *
+ * Nodes that aren't reachable yet have writes blocked until they're
+ * reachable - now that we've cancelled any pending writes and moved
+ * things waiting on that write to wait on this update, we can drop this
+ * node from the list of nodes that the other update is making
+ * reachable, prior to freeing it:
+ */
+ btree_update_drop_new_node(c, b);
+
+ btree_update_add_key(as, &as->old_keys, b);
+
+ as->old_nodes[as->nr_old_nodes] = b;
+ as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
+ as->nr_old_nodes++;
+}
+
+static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans)
+{
+ struct bch_fs *c = as->c;
+ u64 start_time = as->start_time;
+
+ BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
+
+ if (as->took_gc_lock)
+ up_read(&as->c->gc_lock);
+ as->took_gc_lock = false;
+
+ bch2_btree_reserve_put(as, trans);
+
+ continue_at(&as->cl, btree_update_set_nodes_written,
+ as->c->btree_interior_update_worker);
+
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
+ start_time);
+}
+
+static struct btree_update *
+bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
+ unsigned level, bool split, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_update *as;
+ u64 start_time = local_clock();
+ int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
+ ? BCH_DISK_RESERVATION_NOFAIL : 0;
+ unsigned nr_nodes[2] = { 0, 0 };
+ unsigned update_level = level;
+ enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
+ int ret = 0;
+ u32 restart_count = trans->restart_count;
+
+ BUG_ON(!path->should_be_locked);
+
+ if (watermark == BCH_WATERMARK_copygc)
+ watermark = BCH_WATERMARK_btree_copygc;
+ if (watermark < BCH_WATERMARK_btree)
+ watermark = BCH_WATERMARK_btree;
+
+ flags &= ~BCH_WATERMARK_MASK;
+ flags |= watermark;
+
+ if (!(flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ watermark < c->journal.watermark) {
+ struct journal_res res = { 0 };
+
+ ret = drop_locks_do(trans,
+ bch2_journal_res_get(&c->journal, &res, 1,
+ watermark|JOURNAL_RES_GET_CHECK));
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
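+ /*
+ * Walk up the tree, counting how many new nodes we may need at each
+ * level, until we reach a node the insert will fit in:
+ */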
+ while (1) {
+ nr_nodes[!!update_level] += 1 + split;
+ update_level++;
+
+ ret = bch2_btree_path_upgrade(trans, path, update_level + 1);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (!btree_path_node(path, update_level)) {
+ /* Allocating new root? */
+ nr_nodes[1] += split;
+ update_level = BTREE_MAX_DEPTH;
+ break;
+ }
+
+ /*
+ * Always check for space for two keys, even if we won't have to
+ * split at prior level - it might have been a merge instead:
+ */
+ if (bch2_btree_node_insert_fits(c, path->l[update_level].b,
+ BKEY_BTREE_PTR_U64s_MAX * 2))
+ break;
+
+ split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
+ }
+
+ if (flags & BTREE_INSERT_GC_LOCK_HELD)
+ lockdep_assert_held(&c->gc_lock);
+ else if (!down_read_trylock(&c->gc_lock)) {
+ ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
+ if (ret) {
+ up_read(&c->gc_lock);
+ return ERR_PTR(ret);
+ }
+ }
+
+ as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
+ memset(as, 0, sizeof(*as));
+ closure_init(&as->cl, NULL);
+ as->c = c;
+ as->start_time = start_time;
+ as->mode = BTREE_INTERIOR_NO_UPDATE;
+ as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
+ as->btree_id = path->btree_id;
+ as->update_level = update_level;
+ INIT_LIST_HEAD(&as->list);
+ INIT_LIST_HEAD(&as->unwritten_list);
+ INIT_LIST_HEAD(&as->write_blocked_list);
+ bch2_keylist_init(&as->old_keys, as->_old_keys);
+ bch2_keylist_init(&as->new_keys, as->_new_keys);
+ bch2_keylist_init(&as->parent_keys, as->inline_keys);
+
+ mutex_lock(&c->btree_interior_update_lock);
+ list_add_tail(&as->list, &c->btree_interior_update_list);
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ /*
+ * We don't want to allocate if we're in an error state, that can cause
+ * deadlock on emergency shutdown due to open buckets getting stuck in
+ * the btree_reserve_cache after allocator shutdown has cleared it out.
+ * This check needs to come after adding us to the btree_interior_update
+ * list but before calling bch2_btree_reserve_get, to synchronize with
+ * __bch2_fs_read_only().
+ */
+ ret = bch2_journal_error(&c->journal);
+ if (ret)
+ goto err;
+
+ ret = bch2_disk_reservation_get(c, &as->disk_res,
+ (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
+ c->opts.metadata_replicas,
+ disk_res_flags);
+ if (ret)
+ goto err;
+
+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
+ if (bch2_err_matches(ret, ENOSPC) ||
+ bch2_err_matches(ret, ENOMEM)) {
+ struct closure cl;
+
+ /*
+ * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
+ * flag
+ */
+ if (bch2_err_matches(ret, ENOSPC) &&
+ (flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ watermark != BCH_WATERMARK_reclaim) {
+ ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ goto err;
+ }
+
+ closure_init_stack(&cl);
+
+ do {
+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
+
+ bch2_trans_unlock(trans);
+ closure_sync(&cl);
+ } while (bch2_err_matches(ret, BCH_ERR_operation_blocked));
+ }
+
+ if (ret) {
+ trace_and_count(c, btree_reserve_get_fail, trans->fn,
+ _RET_IP_, nr_nodes[0] + nr_nodes[1], ret);
+ goto err;
+ }
+
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ goto err;
+
+ bch2_trans_verify_not_restarted(trans, restart_count);
+ return as;
+err:
+ bch2_btree_update_free(as, trans);
+ return ERR_PTR(ret);
+}
+
+/* Btree root updates: */
+
+static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
+{
+ /* Root nodes cannot be reaped */
+ mutex_lock(&c->btree_cache.lock);
+ list_del_init(&b->list);
+ mutex_unlock(&c->btree_cache.lock);
+
+ mutex_lock(&c->btree_root_lock);
+ BUG_ON(btree_node_root(c, b) &&
+ (b->c.level < btree_node_root(c, b)->c.level ||
+ !btree_node_dying(btree_node_root(c, b))));
+
+ bch2_btree_id_root(c, b->c.btree_id)->b = b;
+ mutex_unlock(&c->btree_root_lock);
+
+ bch2_recalc_btree_reserve(c);
+}
+
+static void bch2_btree_set_root(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
+{
+ struct bch_fs *c = as->c;
+ struct btree *old;
+
+ trace_and_count(c, btree_node_set_root, c, b);
+
+ old = btree_node_root(c, b);
+
+ /*
+ * Ensure no one is using the old root while we switch to the
+ * new root:
+ */
+ bch2_btree_node_lock_write_nofail(trans, path, &old->c);
+
+ bch2_btree_set_root_inmem(c, b);
+
+ btree_update_updated_root(as, b);
+
+ /*
+ * Unlock old root after new root is visible:
+ *
+ * The new root isn't persistent, but that's ok: we still have
+ * an intent lock on the new root, and any updates that would
+ * depend on the new root would have to update the new root.
+ */
+ bch2_btree_node_unlock_write(trans, path, old);
+}
+
+/* Interior node updates: */
+
+static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bkey_i *insert)
+{
+ struct bch_fs *c = as->c;
+ struct bkey_packed *k;
+ struct printbuf buf = PRINTBUF;
+ unsigned long old, new, v;
+
+ BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
+ !btree_ptr_sectors_written(insert));
+
+ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
+ bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
+
+ if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
+ btree_node_type(b), WRITE, &buf) ?:
+ bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "inserting invalid bkey\n ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+ prt_printf(&buf, "\n ");
+ bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
+ btree_node_type(b), WRITE, &buf);
+ bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf);
+
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+ dump_stack();
+ }
+
+ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
+ ARRAY_SIZE(as->journal_entries));
+
+ as->journal_u64s +=
+ journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
+ BCH_JSET_ENTRY_btree_keys,
+ b->c.btree_id, b->c.level,
+ insert, insert->k.u64s);
+
+ while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
+ bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
+ bch2_btree_node_iter_advance(node_iter, b);
+
+ bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
+ set_btree_node_dirty_acct(c, b);
+
+ v = READ_ONCE(b->flags);
+ do {
+ old = new = v;
+
+ new &= ~BTREE_WRITE_TYPE_MASK;
+ new |= BTREE_WRITE_interior;
+ new |= 1 << BTREE_NODE_need_write;
+ } while ((v = cmpxchg(&b->flags, old, new)) != old);
+
+ printbuf_exit(&buf);
+}
+
+static void
+__bch2_btree_insert_keys_interior(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter node_iter,
+ struct keylist *keys)
+{
+ struct bkey_i *insert = bch2_keylist_front(keys);
+ struct bkey_packed *k;
+
+ BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);
+
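+ /* Rewind the node iterator to before any keys at or after the first insert position: */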
+ while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
+ (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
+ ;
+
+ while (!bch2_keylist_empty(keys)) {
+ insert = bch2_keylist_front(keys);
+
+ if (bpos_gt(insert->k.p, b->key.k.p))
+ break;
+
+ bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
+ bch2_keylist_pop_front(keys);
+ }
+}
+
+/*
+ * Distribute the keys from @b between the two new nodes: n[0] becomes the
+ * lower node and n[1] the higher node, each with its own packed key format
+ */
+static void __btree_split_node(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree *b,
+ struct btree *n[2])
+{
+ struct bkey_packed *k;
+ struct bpos n1_pos = POS_MIN;
+ struct btree_node_iter iter;
+ struct bset *bsets[2];
+ struct bkey_format_state format[2];
+ struct bkey_packed *out[2];
+ struct bkey uk;
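+ /* Aim to put roughly 3/5 of the live u64s in the lower node: */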
+ unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5;
+ struct { unsigned nr_keys, val_u64s; } nr_keys[2];
+ int i;
+
+ memset(&nr_keys, 0, sizeof(nr_keys));
+
+ for (i = 0; i < 2; i++) {
+ BUG_ON(n[i]->nsets != 1);
+
+ bsets[i] = btree_bset_first(n[i]);
+ out[i] = bsets[i]->start;
+
+ SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1);
+ bch2_bkey_format_init(&format[i]);
+ }
+
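+ /* First pass: pick the pivot and build each new node's packed key format: */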
+ u64s = 0;
+ for_each_btree_node_key(b, k, &iter) {
+ if (bkey_deleted(k))
+ continue;
+
+ i = u64s >= n1_u64s;
+ u64s += k->u64s;
+ uk = bkey_unpack_key(b, k);
+ if (!i)
+ n1_pos = uk.p;
+ bch2_bkey_format_add_key(&format[i], &uk);
+
+ nr_keys[i].nr_keys++;
+ nr_keys[i].val_u64s += bkeyp_val_u64s(&b->format, k);
+ }
+
+ btree_set_min(n[0], b->data->min_key);
+ btree_set_max(n[0], n1_pos);
+ btree_set_min(n[1], bpos_successor(n1_pos));
+ btree_set_max(n[1], b->data->max_key);
+
+ for (i = 0; i < 2; i++) {
+ bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key);
+ bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key);
+
+ n[i]->data->format = bch2_bkey_format_done(&format[i]);
+
+ unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s +
+ nr_keys[i].val_u64s;
+ if (__vstruct_bytes(struct btree_node, u64s) > btree_bytes(as->c))
+ n[i]->data->format = b->format;
+
+ btree_node_set_format(n[i], n[i]->data->format);
+ }
+
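+ /* Second pass: repack (or unpack) the keys into the new nodes: */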
+ u64s = 0;
+ for_each_btree_node_key(b, k, &iter) {
+ if (bkey_deleted(k))
+ continue;
+
+ i = u64s >= n1_u64s;
+ u64s += k->u64s;
+
+ if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k)
+ ? &b->format: &bch2_bkey_format_current, k))
+ out[i]->format = KEY_FORMAT_LOCAL_BTREE;
+ else
+ bch2_bkey_unpack(b, (void *) out[i], k);
+
+ out[i]->needs_whiteout = false;
+
+ btree_keys_account_key_add(&n[i]->nr, 0, out[i]);
+ out[i] = bkey_p_next(out[i]);
+ }
+
+ for (i = 0; i < 2; i++) {
+ bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data);
+
+ BUG_ON(!bsets[i]->u64s);
+
+ set_btree_bset_end(n[i], n[i]->set);
+
+ btree_node_reset_sib_u64s(n[i]);
+
+ bch2_verify_btree_nr_keys(n[i]);
+
+ if (b->c.level)
+ btree_node_interior_verify(as->c, n[i]);
+ }
+}
+
+/*
+ * For updates to interior nodes, we've got to do the insert before we split
+ * because the stuff we're inserting has to be inserted atomically. Post split,
+ * the keys might have to go in different nodes and the split would no longer be
+ * atomic.
+ *
+ * Worse, if the insert is from btree node coalescing, if we do the insert after
+ * we do the split (and pick the pivot) - the pivot we pick might be between
+ * nodes that were coalesced, and thus in the middle of a child node post
+ * coalescing:
+ */
+static void btree_split_insert_keys(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct keylist *keys)
+{
+ if (!bch2_keylist_empty(keys) &&
+ bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) {
+ struct btree_node_iter node_iter;
+
+ bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
+
+ __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
+
+ btree_node_interior_verify(as->c, b);
+ }
+}
+
+static int btree_split(struct btree_update *as, struct btree_trans *trans,
+ struct btree_path *path, struct btree *b,
+ struct keylist *keys, unsigned flags)
+{
+ struct bch_fs *c = as->c;
+ struct btree *parent = btree_node_parent(path, b);
+ struct btree *n1, *n2 = NULL, *n3 = NULL;
+ struct btree_path *path1 = NULL, *path2 = NULL;
+ u64 start_time = local_clock();
+ int ret = 0;
+
+ BUG_ON(!parent && (b != btree_node_root(c, b)));
+ BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1));
+
+ bch2_btree_interior_update_will_free_node(as, b);
+
+ if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
+ struct btree *n[2];
+
+ trace_and_count(c, btree_node_split, c, b);
+
+ n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
+ n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
+
+ __btree_split_node(as, trans, b, n);
+
+ if (keys) {
+ btree_split_insert_keys(as, trans, path, n1, keys);
+ btree_split_insert_keys(as, trans, path, n2, keys);
+ BUG_ON(!bch2_keylist_empty(keys));
+ }
+
+ bch2_btree_build_aux_trees(n2);
+ bch2_btree_build_aux_trees(n1);
+
+ bch2_btree_update_add_new_node(as, n1);
+ bch2_btree_update_add_new_node(as, n2);
+ six_unlock_write(&n2->c.lock);
+ six_unlock_write(&n1->c.lock);
+
+ path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
+ six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, path1, n1);
+
+ path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p);
+ six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, path2, n2);
+
+ /*
+ * Note that on recursive splits, parent_keys == keys, so we
+ * can't start adding new keys to parent_keys before emptying it
+ * out (which we did with btree_split_insert_keys() above)
+ */
+ bch2_keylist_add(&as->parent_keys, &n1->key);
+ bch2_keylist_add(&as->parent_keys, &n2->key);
+
+ if (!parent) {
+ /* Depth increases, make a new root */
+ n3 = __btree_root_alloc(as, trans, b->c.level + 1);
+
+ bch2_btree_update_add_new_node(as, n3);
+ six_unlock_write(&n3->c.lock);
+
+ path2->locks_want++;
+ BUG_ON(btree_node_locked(path2, n3->c.level));
+ six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, path2, n3);
+
+ n3->sib_u64s[0] = U16_MAX;
+ n3->sib_u64s[1] = U16_MAX;
+
+ btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
+ }
+ } else {
+ trace_and_count(c, btree_node_compact, c, b);
+
+ n1 = bch2_btree_node_alloc_replacement(as, trans, b);
+
+ if (keys) {
+ btree_split_insert_keys(as, trans, path, n1, keys);
+ BUG_ON(!bch2_keylist_empty(keys));
+ }
+
+ bch2_btree_build_aux_trees(n1);
+ bch2_btree_update_add_new_node(as, n1);
+ six_unlock_write(&n1->c.lock);
+
+ path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
+ six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, path1, n1);
+
+ if (parent)
+ bch2_keylist_add(&as->parent_keys, &n1->key);
+ }
+
+ /* New nodes all written, now make them visible: */
+
+ if (parent) {
+ /* Split a non root node */
+ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+ if (ret)
+ goto err;
+ } else if (n3) {
+ bch2_btree_set_root(as, trans, path, n3);
+ } else {
+ /* Root filled up but didn't need to be split */
+ bch2_btree_set_root(as, trans, path, n1);
+ }
+
+ if (n3) {
+ bch2_btree_update_get_open_buckets(as, n3);
+ bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
+ }
+ if (n2) {
+ bch2_btree_update_get_open_buckets(as, n2);
+ bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
+ }
+ bch2_btree_update_get_open_buckets(as, n1);
+ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
+
+ /*
+ * The old node must be freed (in memory) _before_ unlocking the new
+ * nodes - else another thread could re-acquire a read lock on the old
+ * node after another thread has locked and updated the new node, thus
+ * seeing stale data:
+ */
+ bch2_btree_node_free_inmem(trans, path, b);
+
+ if (n3)
+ bch2_trans_node_add(trans, n3);
+ if (n2)
+ bch2_trans_node_add(trans, n2);
+ bch2_trans_node_add(trans, n1);
+
+ if (n3)
+ six_unlock_intent(&n3->c.lock);
+ if (n2)
+ six_unlock_intent(&n2->c.lock);
+ six_unlock_intent(&n1->c.lock);
+out:
+ if (path2) {
+ __bch2_btree_path_unlock(trans, path2);
+ bch2_path_put(trans, path2, true);
+ }
+ if (path1) {
+ __bch2_btree_path_unlock(trans, path1);
+ bch2_path_put(trans, path1, true);
+ }
+
+ bch2_trans_verify_locks(trans);
+
+ bch2_time_stats_update(&c->times[n2
+ ? BCH_TIME_btree_node_split
+ : BCH_TIME_btree_node_compact],
+ start_time);
+ return ret;
+err:
+ if (n3)
+ bch2_btree_node_free_never_used(as, trans, n3);
+ if (n2)
+ bch2_btree_node_free_never_used(as, trans, n2);
+ bch2_btree_node_free_never_used(as, trans, n1);
+ goto out;
+}
+
+static void
+bch2_btree_insert_keys_interior(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct keylist *keys)
+{
+ struct btree_path *linked;
+
+ __bch2_btree_insert_keys_interior(as, trans, path, b,
+ path->l[b->c.level].iter, keys);
+
+ btree_update_updated_node(as, b);
+
+ trans_for_each_path_with_node(trans, b, linked)
+ bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
+
+ bch2_trans_verify_paths(trans);
+}
+
+/**
+ * bch2_btree_insert_node - insert bkeys into a given btree node
+ *
+ * @as: btree_update object
+ * @trans: btree_trans object
+ * @path: path that points to current node
+ * @b: node to insert keys into
+ * @keys: list of keys to insert
+ * @flags: transaction commit flags
+ *
+ * Returns: 0 on success, typically transaction restart error on failure
+ *
+ * Inserts as many keys as it can into a given btree node, splitting it if full.
+ * If a split occurred, this function will return early. This can only happen
+ * for leaf nodes -- inserts into interior nodes have to be atomic.
+ */
+static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
+ struct btree_path *path, struct btree *b,
+ struct keylist *keys, unsigned flags)
+{
+ struct bch_fs *c = as->c;
+ int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
+ int old_live_u64s = b->nr.live_u64s;
+ int live_u64s_added, u64s_added;
+ int ret;
+
+ lockdep_assert_held(&c->gc_lock);
+ BUG_ON(!btree_node_intent_locked(path, b->c.level));
+ BUG_ON(!b->c.level);
+ BUG_ON(!as || as->b);
+ bch2_verify_keylist_sorted(keys);
+
+ ret = bch2_btree_node_lock_write(trans, path, &b->c);
+ if (ret)
+ return ret;
+
+ bch2_btree_node_prep_for_write(trans, path, b);
+
+ if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
+ bch2_btree_node_unlock_write(trans, path, b);
+ goto split;
+ }
+
+ btree_node_interior_verify(c, b);
+
+ bch2_btree_insert_keys_interior(as, trans, path, b, keys);
+
+ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
+ u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
+
+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
+
+ if (u64s_added > live_u64s_added &&
+ bch2_maybe_compact_whiteouts(c, b))
+ bch2_trans_node_reinit_iter(trans, b);
+
+ bch2_btree_node_unlock_write(trans, path, b);
+
+ btree_node_interior_verify(c, b);
+ return 0;
+split:
+ /*
+ * We could attempt to avoid the transaction restart, by calling
+ * bch2_btree_path_upgrade() and allocating more nodes:
+ */
+ if (b->c.level >= as->update_level) {
+ trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
+ }
+
+ return btree_split(as, trans, path, b, keys, flags);
+}
+
+int bch2_btree_split_leaf(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned flags)
+{
+ struct btree *b = path_l(path)->b;
+ struct btree_update *as;
+ unsigned l;
+ int ret = 0;
+
+ as = bch2_btree_update_start(trans, path, path->level,
+ true, flags);
+ if (IS_ERR(as))
+ return PTR_ERR(as);
+
+ ret = btree_split(as, trans, path, b, NULL, flags);
+ if (ret) {
+ bch2_btree_update_free(as, trans);
+ return ret;
+ }
+
+ bch2_btree_update_done(as, trans);
+
+ for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++)
+ ret = bch2_foreground_maybe_merge(trans, path, l, flags);
+
+ return ret;
+}
+
+int __bch2_foreground_maybe_merge(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned level,
+ unsigned flags,
+ enum btree_node_sibling sib)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path *sib_path = NULL, *new_path = NULL;
+ struct btree_update *as;
+ struct bkey_format_state new_s;
+ struct bkey_format new_f;
+ struct bkey_i delete;
+ struct btree *b, *m, *n, *prev, *next, *parent;
+ struct bpos sib_pos;
+ size_t sib_u64s;
+ u64 start_time = local_clock();
+ int ret = 0;
+
+ BUG_ON(!path->should_be_locked);
+ BUG_ON(!btree_node_locked(path, level));
+
+ b = path->l[level].b;
+
+ if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
+ (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) {
+ b->sib_u64s[sib] = U16_MAX;
+ return 0;
+ }
+
+ sib_pos = sib == btree_prev_sib
+ ? bpos_predecessor(b->data->min_key)
+ : bpos_successor(b->data->max_key);
+
+ sib_path = bch2_path_get(trans, path->btree_id, sib_pos,
+ U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, sib_path, false);
+ if (ret)
+ goto err;
+
+ btree_path_set_should_be_locked(sib_path);
+
+ m = sib_path->l[level].b;
+
+ if (btree_node_parent(path, b) !=
+ btree_node_parent(sib_path, m)) {
+ b->sib_u64s[sib] = U16_MAX;
+ goto out;
+ }
+
+ if (sib == btree_prev_sib) {
+ prev = m;
+ next = b;
+ } else {
+ prev = b;
+ next = m;
+ }
+
+ if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) {
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+
+ bch2_bpos_to_text(&buf1, prev->data->max_key);
+ bch2_bpos_to_text(&buf2, next->data->min_key);
+ bch_err(c,
+ "%s(): btree topology error:\n"
+ " prev ends at %s\n"
+ " next starts at %s",
+ __func__, buf1.buf, buf2.buf);
+ printbuf_exit(&buf1);
+ printbuf_exit(&buf2);
+ bch2_topology_error(c);
+ ret = -EIO;
+ goto err;
+ }
+
+ bch2_bkey_format_init(&new_s);
+ bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
+ __bch2_btree_calc_format(&new_s, prev);
+ __bch2_btree_calc_format(&new_s, next);
+ bch2_bkey_format_add_pos(&new_s, next->data->max_key);
+ new_f = bch2_bkey_format_done(&new_s);
+
+ sib_u64s = btree_node_u64s_with_format(b->nr, &b->format, &new_f) +
+ btree_node_u64s_with_format(m->nr, &m->format, &new_f);
+
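+ /* Apply hysteresis so we don't flip-flop between merging and splitting: */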
+ if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
+ sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
+ sib_u64s /= 2;
+ sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
+ }
+
+ sib_u64s = min(sib_u64s, btree_max_u64s(c));
+ sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
+ b->sib_u64s[sib] = sib_u64s;
+
+ if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
+ goto out;
+
+ parent = btree_node_parent(path, b);
+ as = bch2_btree_update_start(trans, path, level, false,
+ BTREE_INSERT_NOFAIL|flags);
+ ret = PTR_ERR_OR_ZERO(as);
+ if (ret)
+ goto err;
+
+ trace_and_count(c, btree_node_merge, c, b);
+
+ bch2_btree_interior_update_will_free_node(as, b);
+ bch2_btree_interior_update_will_free_node(as, m);
+
+ n = bch2_btree_node_alloc(as, trans, b->c.level);
+
+ SET_BTREE_NODE_SEQ(n->data,
+ max(BTREE_NODE_SEQ(b->data),
+ BTREE_NODE_SEQ(m->data)) + 1);
+
+ btree_set_min(n, prev->data->min_key);
+ btree_set_max(n, next->data->max_key);
+
+ n->data->format = new_f;
+ btree_node_set_format(n, new_f);
+
+ bch2_btree_sort_into(c, n, prev);
+ bch2_btree_sort_into(c, n, next);
+
+ bch2_btree_build_aux_trees(n);
+ bch2_btree_update_add_new_node(as, n);
+ six_unlock_write(&n->c.lock);
+
+ new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p);
+ six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, new_path, n);
+
+ bkey_init(&delete.k);
+ delete.k.p = prev->key.k.p;
+ bch2_keylist_add(&as->parent_keys, &delete);
+ bch2_keylist_add(&as->parent_keys, &n->key);
+
+ bch2_trans_verify_paths(trans);
+
+ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+ if (ret)
+ goto err_free_update;
+
+ bch2_trans_verify_paths(trans);
+
+ bch2_btree_update_get_open_buckets(as, n);
+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+
+ bch2_btree_node_free_inmem(trans, path, b);
+ bch2_btree_node_free_inmem(trans, sib_path, m);
+
+ bch2_trans_node_add(trans, n);
+
+ bch2_trans_verify_paths(trans);
+
+ six_unlock_intent(&n->c.lock);
+
+ bch2_btree_update_done(as, trans);
+
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
+out:
+err:
+ if (new_path)
+ bch2_path_put(trans, new_path, true);
+ bch2_path_put(trans, sib_path, true);
+ bch2_trans_verify_locks(trans);
+ return ret;
+err_free_update:
+ bch2_btree_node_free_never_used(as, trans, n);
+ bch2_btree_update_free(as, trans);
+ goto out;
+}
+
+int bch2_btree_node_rewrite(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct btree *b,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path *new_path = NULL;
+ struct btree *n, *parent;
+ struct btree_update *as;
+ int ret;
+
+ flags |= BTREE_INSERT_NOFAIL;
+
+ parent = btree_node_parent(iter->path, b);
+ as = bch2_btree_update_start(trans, iter->path, b->c.level,
+ false, flags);
+ ret = PTR_ERR_OR_ZERO(as);
+ if (ret)
+ goto out;
+
+ bch2_btree_interior_update_will_free_node(as, b);
+
+ n = bch2_btree_node_alloc_replacement(as, trans, b);
+
+ bch2_btree_build_aux_trees(n);
+ bch2_btree_update_add_new_node(as, n);
+ six_unlock_write(&n->c.lock);
+
+ new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p);
+ six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, new_path, n);
+
+ trace_and_count(c, btree_node_rewrite, c, b);
+
+ if (parent) {
+ bch2_keylist_add(&as->parent_keys, &n->key);
+ ret = bch2_btree_insert_node(as, trans, iter->path, parent,
+ &as->parent_keys, flags);
+ if (ret)
+ goto err;
+ } else {
+ bch2_btree_set_root(as, trans, iter->path, n);
+ }
+
+ bch2_btree_update_get_open_buckets(as, n);
+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+
+ bch2_btree_node_free_inmem(trans, iter->path, b);
+
+ bch2_trans_node_add(trans, n);
+ six_unlock_intent(&n->c.lock);
+
+ bch2_btree_update_done(as, trans);
+out:
+ if (new_path)
+ bch2_path_put(trans, new_path, true);
+ bch2_trans_downgrade(trans);
+ return ret;
+err:
+ bch2_btree_node_free_never_used(as, trans, n);
+ bch2_btree_update_free(as, trans);
+ goto out;
+}
+
+struct async_btree_rewrite {
+ struct bch_fs *c;
+ struct work_struct work;
+ struct list_head list;
+ enum btree_id btree_id;
+ unsigned level;
+ struct bpos pos;
+ __le64 seq;
+};
+
+static int async_btree_node_rewrite_trans(struct btree_trans *trans,
+ struct async_btree_rewrite *a)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct btree *b;
+ int ret;
+
+ bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos,
+ BTREE_MAX_DEPTH, a->level, 0);
+ b = bch2_btree_iter_peek_node(&iter);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto out;
+
+ if (!b || b->data->keys.seq != a->seq) {
+ struct printbuf buf = PRINTBUF;
+
+ if (b)
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ else
+ prt_str(&buf, "(null");
+ bch_info(c, "%s: node to rewrite not found:, searching for seq %llu, got\n%s",
+ __func__, a->seq, buf.buf);
+ printbuf_exit(&buf);
+ goto out;
+ }
+
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+static void async_btree_node_rewrite_work(struct work_struct *work)
+{
+ struct async_btree_rewrite *a =
+ container_of(work, struct async_btree_rewrite, work);
+ struct bch_fs *c = a->c;
+ int ret;
+
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ async_btree_node_rewrite_trans(trans, a));
+ if (ret)
+ bch_err_fn(c, ret);
+ bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
+ kfree(a);
+}
+
+void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
+{
+ struct async_btree_rewrite *a;
+ int ret;
+
+ a = kmalloc(sizeof(*a), GFP_NOFS);
+ if (!a) {
+ bch_err(c, "%s: error allocating memory", __func__);
+ return;
+ }
+
+ a->c = c;
+ a->btree_id = b->c.btree_id;
+ a->level = b->c.level;
+ a->pos = b->key.k.p;
+ a->seq = b->data->keys.seq;
+ INIT_WORK(&a->work, async_btree_node_rewrite_work);
+
+ if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
+ mutex_lock(&c->pending_node_rewrites_lock);
+ list_add(&a->list, &c->pending_node_rewrites);
+ mutex_unlock(&c->pending_node_rewrites_lock);
+ return;
+ }
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
+ if (test_bit(BCH_FS_STARTED, &c->flags)) {
+ bch_err(c, "%s: error getting c->writes ref", __func__);
+ kfree(a);
+ return;
+ }
+
+ ret = bch2_fs_read_write_early(c);
+ if (ret) {
+ bch_err_msg(c, ret, "going read-write");
+ kfree(a);
+ return;
+ }
+
+ bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
+ }
+
+ queue_work(c->btree_interior_update_worker, &a->work);
+}
+
+void bch2_do_pending_node_rewrites(struct bch_fs *c)
+{
+ struct async_btree_rewrite *a, *n;
+
+ mutex_lock(&c->pending_node_rewrites_lock);
+ list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
+ list_del(&a->list);
+
+ bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
+ queue_work(c->btree_interior_update_worker, &a->work);
+ }
+ mutex_unlock(&c->pending_node_rewrites_lock);
+}
+
+void bch2_free_pending_node_rewrites(struct bch_fs *c)
+{
+ struct async_btree_rewrite *a, *n;
+
+ mutex_lock(&c->pending_node_rewrites_lock);
+ list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
+ list_del(&a->list);
+
+ kfree(a);
+ }
+ mutex_unlock(&c->pending_node_rewrites_lock);
+}
+
+static int __bch2_btree_node_update_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct btree *b, struct btree *new_hash,
+ struct bkey_i *new_key,
+ unsigned commit_flags,
+ bool skip_triggers)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter2 = { NULL };
+ struct btree *parent;
+ int ret;
+
+ if (!skip_triggers) {
+ ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1,
+ bkey_i_to_s_c(&b->key), 0);
+ if (ret)
+ return ret;
+
+ ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1,
+ new_key, 0);
+ if (ret)
+ return ret;
+ }
+
+ if (new_hash) {
+ bkey_copy(&new_hash->key, new_key);
+ ret = bch2_btree_node_hash_insert(&c->btree_cache,
+ new_hash, b->c.level, b->c.btree_id);
+ BUG_ON(ret);
+ }
+
+ parent = btree_node_parent(iter->path, b);
+ if (parent) {
+ bch2_trans_copy_iter(&iter2, iter);
+
+ iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
+ iter2.flags & BTREE_ITER_INTENT,
+ _THIS_IP_);
+
+ BUG_ON(iter2.path->level != b->c.level);
+ BUG_ON(!bpos_eq(iter2.path->pos, new_key->k.p));
+
+ btree_path_set_level_up(trans, iter2.path);
+
+ trans->paths_sorted = false;
+
+ ret = bch2_btree_iter_traverse(&iter2) ?:
+ bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
+ if (ret)
+ goto err;
+ } else {
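+ /*
+ * No parent - we're updating the root: emit a btree_root journal
+ * entry instead of a btree update
+ */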
+ BUG_ON(btree_node_root(c, b) != b);
+
+ ret = darray_make_room(&trans->extra_journal_entries,
+ jset_u64s(new_key->k.u64s));
+ if (ret)
+ return ret;
+
+ journal_entry_set((void *) &darray_top(trans->extra_journal_entries),
+ BCH_JSET_ENTRY_btree_root,
+ b->c.btree_id, b->c.level,
+ new_key, new_key->k.u64s);
+ trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s);
+ }
+
+ ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
+ if (ret)
+ goto err;
+
+ bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c);
+
+ if (new_hash) {
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ bkey_copy(&b->key, new_key);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+ BUG_ON(ret);
+ mutex_unlock(&c->btree_cache.lock);
+ } else {
+ bkey_copy(&b->key, new_key);
+ }
+
+ bch2_btree_node_unlock_write(trans, iter->path, b);
+out:
+ bch2_trans_iter_exit(trans, &iter2);
+ return ret;
+err:
+ if (new_hash) {
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+ mutex_unlock(&c->btree_cache.lock);
+ }
+ goto out;
+}
+
+int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
+ struct btree *b, struct bkey_i *new_key,
+ unsigned commit_flags, bool skip_triggers)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *new_hash = NULL;
+ struct btree_path *path = iter->path;
+ struct closure cl;
+ int ret = 0;
+
+ ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
+ if (ret)
+ return ret;
+
+ closure_init_stack(&cl);
+
+ /*
+ * check btree_ptr_hash_val() after @b is locked by
+ * btree_iter_traverse():
+ */
+ if (btree_ptr_hash_val(new_key) != b->hash_val) {
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ if (ret) {
+ ret = drop_locks_do(trans, (closure_sync(&cl), 0));
+ if (ret)
+ return ret;
+ }
+
+ new_hash = bch2_btree_node_mem_alloc(trans, false);
+ }
+
+ path->intent_ref++;
+ ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key,
+ commit_flags, skip_triggers);
+ --path->intent_ref;
+
+ if (new_hash) {
+ mutex_lock(&c->btree_cache.lock);
+ list_move(&new_hash->list, &c->btree_cache.freeable);
+ mutex_unlock(&c->btree_cache.lock);
+
+ six_unlock_write(&new_hash->c.lock);
+ six_unlock_intent(&new_hash->c.lock);
+ }
+ closure_sync(&cl);
+ bch2_btree_cache_cannibalize_unlock(c);
+ return ret;
+}
+
+int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
+ struct btree *b, struct bkey_i *new_key,
+ unsigned commit_flags, bool skip_triggers)
+{
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
+ BTREE_MAX_DEPTH, b->c.level,
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&iter);
+ if (ret)
+ goto out;
+
+ /* has node been freed? */
+ if (iter.path->l[b->c.level].b != b) {
+ /* node has been freed: */
+ BUG_ON(!btree_node_dying(b));
+ goto out;
+ }
+
+ BUG_ON(!btree_node_hashed(b));
+
+ struct bch_extent_ptr *ptr;
+ bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
+ !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
+
+ ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
+ commit_flags, skip_triggers);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/* Init code: */
+
+/*
+ * Only for filesystem bringup, when first reading the btree roots or allocating
+ * btree roots when initializing a new filesystem:
+ */
+void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
+{
+ BUG_ON(btree_node_root(c, b));
+
+ bch2_btree_set_root_inmem(c, b);
+}
+
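+/*
+ * Allocate an empty root node for btree @id when initializing a new
+ * filesystem: the node is marked fake (its key doesn't point at anything on
+ * disk yet) and flagged need_rewrite so it gets written out properly later.
+ */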
+static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
+{
+ struct bch_fs *c = trans->c;
+ struct closure cl;
+ struct btree *b;
+ int ret;
+
+ closure_init_stack(&cl);
+
+ do {
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ closure_sync(&cl);
+ } while (ret);
+
+ b = bch2_btree_node_mem_alloc(trans, false);
+ bch2_btree_cache_cannibalize_unlock(c);
+
+ set_btree_node_fake(b);
+ set_btree_node_need_rewrite(b);
+ b->c.level = 0;
+ b->c.btree_id = id;
+
+ bkey_btree_ptr_init(&b->key);
+ b->key.k.p = SPOS_MAX;
+ *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;
+
+ bch2_bset_init_first(b, &b->data->keys);
+ bch2_btree_build_aux_trees(b);
+
+ b->data->flags = 0;
+ btree_set_min(b, POS_MIN);
+ btree_set_max(b, SPOS_MAX);
+ b->data->format = bch2_btree_calc_format(b);
+ btree_node_set_format(b, b->data->format);
+
+ ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
+ b->c.level, b->c.btree_id);
+ BUG_ON(ret);
+
+ bch2_btree_set_root_inmem(c, b);
+
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ return 0;
+}
+
+void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
+{
+ bch2_trans_run(c, __bch2_btree_root_alloc(trans, id));
+}
+
+void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct btree_update *as;
+
+ mutex_lock(&c->btree_interior_update_lock);
+ list_for_each_entry(as, &c->btree_interior_update_list, list)
+ prt_printf(out, "%p m %u w %u r %u j %llu\n",
+ as,
+ as->mode,
+ as->nodes_written,
+ closure_nr_remaining(&as->cl),
+ as->journal.seq);
+ mutex_unlock(&c->btree_interior_update_lock);
+}
+
+static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
+{
+ bool ret;
+
+ mutex_lock(&c->btree_interior_update_lock);
+ ret = !list_empty(&c->btree_interior_update_list);
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ return ret;
+}
+
+bool bch2_btree_interior_updates_flush(struct bch_fs *c)
+{
+ bool ret = bch2_btree_interior_updates_pending(c);
+
+ if (ret)
+ closure_wait_event(&c->btree_interior_update_wait,
+ !bch2_btree_interior_updates_pending(c));
+ return ret;
+}
+
+void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry)
+{
+ struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);
+
+ mutex_lock(&c->btree_root_lock);
+
+ r->level = entry->level;
+ r->alive = true;
+ bkey_copy(&r->key, (struct bkey_i *) entry->start);
+
+ mutex_unlock(&c->btree_root_lock);
+}
+
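+/*
+ * Emit a BCH_JSET_ENTRY_btree_root entry at @end for every live btree root,
+ * skipping the btree IDs set in the @skip bitmask; returns the new end of the
+ * entry buffer.
+ */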
+struct jset_entry *
+bch2_btree_roots_to_journal_entries(struct bch_fs *c,
+ struct jset_entry *end,
+ unsigned long skip)
+{
+ unsigned i;
+
+ mutex_lock(&c->btree_root_lock);
+
+ for (i = 0; i < btree_id_nr_alive(c); i++) {
+ struct btree_root *r = bch2_btree_id_root(c, i);
+
+ if (r->alive && !test_bit(i, &skip)) {
+ journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
+ i, r->level, &r->key, r->key.k.u64s);
+ end = vstruct_next(end);
+ }
+ }
+
+ mutex_unlock(&c->btree_root_lock);
+
+ return end;
+}
+
+void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
+{
+ if (c->btree_interior_update_worker)
+ destroy_workqueue(c->btree_interior_update_worker);
+ mempool_exit(&c->btree_interior_update_pool);
+}
+
+void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
+{
+ mutex_init(&c->btree_reserve_cache_lock);
+ INIT_LIST_HEAD(&c->btree_interior_update_list);
+ INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
+ mutex_init(&c->btree_interior_update_lock);
+ INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);
+
+ INIT_LIST_HEAD(&c->pending_node_rewrites);
+ mutex_init(&c->pending_node_rewrites_lock);
+}
+
+int bch2_fs_btree_interior_update_init(struct bch_fs *c)
+{
+ c->btree_interior_update_worker =
+ alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
+ if (!c->btree_interior_update_worker)
+ return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+
+ if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
+ sizeof(struct btree_update)))
+ return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;
+
+ return 0;
+}
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
new file mode 100644
index 000000000000..a6668992a272
--- /dev/null
+++ b/fs/bcachefs/btree_update_interior.h
@@ -0,0 +1,332 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
+#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H
+
+#include "btree_cache.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+
+#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES)
+
+#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
+
+/*
+ * Tracks an in progress split/rewrite of a btree node and the update to the
+ * parent node:
+ *
+ * When we split/rewrite a node, we do all the updates in memory without
+ * waiting for any writes to complete - we allocate the new node(s) and update
+ * the parent node, possibly recursively up to the root.
+ *
+ * The end result is that we have one or more new nodes being written -
+ * possibly several, if there were multiple splits - and then a write (updating
+ * an interior node) which will make all these new nodes visible.
+ *
+ * Additionally, as we split/rewrite nodes we free the old nodes - but the old
+ * nodes can't be freed (their space on disk can't be reclaimed) until the
+ * update to the interior node that makes the new node visible completes -
+ * until then, the old nodes are still reachable on disk.
+ */
+struct btree_update {
+ struct closure cl;
+ struct bch_fs *c;
+ u64 start_time;
+
+ struct list_head list;
+ struct list_head unwritten_list;
+
+ /* What kind of update are we doing? */
+ enum {
+ BTREE_INTERIOR_NO_UPDATE,
+ BTREE_INTERIOR_UPDATING_NODE,
+ BTREE_INTERIOR_UPDATING_ROOT,
+ BTREE_INTERIOR_UPDATING_AS,
+ } mode;
+
+ unsigned nodes_written:1;
+ unsigned took_gc_lock:1;
+
+ enum btree_id btree_id;
+ unsigned update_level;
+
+ struct disk_reservation disk_res;
+
+ /*
+ * BTREE_INTERIOR_UPDATING_NODE:
+ * The update that made the new nodes visible was a regular update to an
+ * existing interior node - @b. We can't write out the update to @b
+ * until the new nodes we created are finished writing, so we block @b
+ * from writing by putting this btree_interior update on the
+ * @b->write_blocked list with @write_blocked_list:
+ */
+ struct btree *b;
+ struct list_head write_blocked_list;
+
+ /*
+ * We may be freeing nodes that were dirty, and thus had journal entries
+ * pinned: we need to transfer the oldest of those pins to the
+ * btree_update operation, and release it when the new node(s)
+ * are all persistent and reachable:
+ */
+ struct journal_entry_pin journal;
+
+ /* Preallocated nodes we reserve when we start the update: */
+ struct prealloc_nodes {
+ struct btree *b[BTREE_UPDATE_NODES_MAX];
+ unsigned nr;
+ } prealloc_nodes[2];
+
+ /* Nodes being freed: */
+ struct keylist old_keys;
+ u64 _old_keys[BTREE_UPDATE_NODES_MAX *
+ BKEY_BTREE_PTR_U64s_MAX];
+
+ /* Nodes being added: */
+ struct keylist new_keys;
+ u64 _new_keys[BTREE_UPDATE_NODES_MAX *
+ BKEY_BTREE_PTR_U64s_MAX];
+
+ /* New nodes, that will be made reachable by this update: */
+ struct btree *new_nodes[BTREE_UPDATE_NODES_MAX];
+ unsigned nr_new_nodes;
+
+ struct btree *old_nodes[BTREE_UPDATE_NODES_MAX];
+ __le64 old_nodes_seq[BTREE_UPDATE_NODES_MAX];
+ unsigned nr_old_nodes;
+
+ open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX *
+ BCH_REPLICAS_MAX];
+ open_bucket_idx_t nr_open_buckets;
+
+ unsigned journal_u64s;
+ u64 journal_entries[BTREE_UPDATE_JOURNAL_RES];
+
+ /* Only here to reduce stack usage on recursive splits: */
+ struct keylist parent_keys;
+ /*
+ * Enough room for btree_split's keys without realloc - btree node
+ * pointers never have crc/compression info, so we only need to account
+ * for the pointers for three keys
+ */
+ u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
+};
+
+struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
+ struct btree_trans *,
+ struct btree *,
+ struct bkey_format);
+
+int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned);
+
+int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *,
+ unsigned, unsigned, enum btree_node_sibling);
+
+static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned level, unsigned flags,
+ enum btree_node_sibling sib)
+{
+ struct btree *b;
+
+ EBUG_ON(!btree_node_locked(path, level));
+
+ b = path->l[level].b;
+ if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
+ return 0;
+
+ return __bch2_foreground_maybe_merge(trans, path, level, flags, sib);
+}
+
+static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned level,
+ unsigned flags)
+{
+ return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
+ btree_prev_sib) ?:
+ bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
+ btree_next_sib);
+}
+
+int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
+ struct btree *, unsigned);
+void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
+int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
+ struct btree *, struct bkey_i *,
+ unsigned, bool);
+int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
+ struct bkey_i *, unsigned, bool);
+
+void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
+void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
+
+static inline unsigned btree_update_reserve_required(struct bch_fs *c,
+ struct btree *b)
+{
+ unsigned depth = btree_node_root(c, b)->c.level + 1;
+
+ /*
+ * Number of nodes we might have to allocate in a worst case btree
+ * split operation - we split all the way up to the root, then allocate
+ * a new root, unless we're already at max depth:
+ */
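+ /*
+ * e.g. splitting a leaf (level 0) under a level 2 root gives depth 3, so
+ * we reserve (3 - 0) * 2 + 1 = 7 nodes - up to two new nodes per level
+ * for the splits, plus one for the new root.
+ */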
+ if (depth < BTREE_MAX_DEPTH)
+ return (depth - b->c.level) * 2 + 1;
+ else
+ return (depth - b->c.level) * 2 - 1;
+}
+
+static inline void btree_node_reset_sib_u64s(struct btree *b)
+{
+ b->sib_u64s[0] = b->nr.live_u64s;
+ b->sib_u64s[1] = b->nr.live_u64s;
+}
+
+static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
+{
+ return (void *) b->data + btree_bytes(c);
+}
+
+static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
+ struct btree *b)
+{
+ return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
+}
+
+static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
+ struct btree *b)
+{
+ return btree_data_end(c, b);
+}
+
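+/*
+ * b->written counts how many 512 byte sectors of the node have already been
+ * written out; anything below write_block() is on disk and can no longer be
+ * modified in place.
+ */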
+static inline void *write_block(struct btree *b)
+{
+ return (void *) b->data + (b->written << 9);
+}
+
+static inline bool __btree_addr_written(struct btree *b, void *p)
+{
+ return p < write_block(b);
+}
+
+static inline bool bset_written(struct btree *b, struct bset *i)
+{
+ return __btree_addr_written(b, i);
+}
+
+static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
+{
+ return __btree_addr_written(b, k);
+}
+
+static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
+ struct btree *b,
+ void *end)
+{
+ ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
+ b->whiteout_u64s;
+ ssize_t total = c->opts.btree_node_size >> 3;
+
+ /* Always leave one extra u64 for bch2_varint_decode: */
+ used++;
+
+ return total - used;
+}
+
+static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
+ struct btree *b)
+{
+ ssize_t remaining = __bch_btree_u64s_remaining(c, b,
+ btree_bkey_last(b, bset_tree_last(b)));
+
+ BUG_ON(remaining < 0);
+
+ if (bset_written(b, btree_bset_last(b)))
+ return 0;
+
+ return remaining;
+}
+
+#define BTREE_WRITE_SET_U64s_BITS 9
+
+static inline unsigned btree_write_set_buffer(struct btree *b)
+{
+ /*
+ * Could buffer up larger amounts of keys for btrees with larger keys,
+ * pending benchmarking:
+ */
+ return 8 << BTREE_WRITE_SET_U64s_BITS;
+}
+
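+/*
+ * Decide whether to start a new bset in @b: returns where the new
+ * btree_node_entry would go, or NULL. If the last bset has already been
+ * written we need at least a block's worth of space free; if it's still open
+ * we only start a new one once it has grown past btree_write_set_buffer()
+ * and there's still room for more.
+ */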
+static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
+ struct btree *b)
+{
+ struct bset_tree *t = bset_tree_last(b);
+ struct btree_node_entry *bne = max(write_block(b),
+ (void *) btree_bkey_last(b, bset_tree_last(b)));
+ ssize_t remaining_space =
+ __bch_btree_u64s_remaining(c, b, bne->keys.start);
+
+ if (unlikely(bset_written(b, bset(b, t)))) {
+ if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
+ return bne;
+ } else {
+ if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
+ remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
+ return bne;
+ }
+
+ return NULL;
+}
+
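+/*
+ * Buffer a whiteout for @pos in @b's unwritten whiteout area, which
+ * accumulates at the end of the node's buffer and grows downward from
+ * btree_data_end(); the position is stored packed if possible, otherwise as
+ * an unpacked bkey with just the position set.
+ */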
+static inline void push_whiteout(struct bch_fs *c, struct btree *b,
+ struct bpos pos)
+{
+ struct bkey_packed k;
+
+ BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
+ EBUG_ON(btree_node_just_written(b));
+
+ if (!bkey_pack_pos(&k, pos, b)) {
+ struct bkey *u = (void *) &k;
+
+ bkey_init(u);
+ u->p = pos;
+ }
+
+ k.needs_whiteout = true;
+
+ b->whiteout_u64s += k.u64s;
+ bkey_p_copy(unwritten_whiteouts_start(c, b), &k);
+}
+
+/*
+ * write lock must be held on @b (else the dirty bset that we were going to
+ * insert into could be written out from under us)
+ */
+static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
+ struct btree *b, unsigned u64s)
+{
+ if (unlikely(btree_node_need_rewrite(b)))
+ return false;
+
+ return u64s <= bch_btree_keys_u64s_remaining(c, b);
+}
+
+void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
+
+bool bch2_btree_interior_updates_flush(struct bch_fs *);
+
+void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
+struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
+ struct jset_entry *, unsigned long);
+
+void bch2_do_pending_node_rewrites(struct bch_fs *);
+void bch2_free_pending_node_rewrites(struct bch_fs *);
+
+void bch2_fs_btree_interior_update_exit(struct bch_fs *);
+void bch2_fs_btree_interior_update_init_early(struct bch_fs *);
+int bch2_fs_btree_interior_update_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
new file mode 100644
index 000000000000..4e6241db518b
--- /dev/null
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -0,0 +1,375 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+
+#include <linux/sort.h>
+
+static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
+{
+ const struct btree_write_buffered_key *l = _l;
+ const struct btree_write_buffered_key *r = _r;
+
+ return cmp_int(l->btree, r->btree) ?:
+ bpos_cmp(l->k.k.p, r->k.k.p) ?:
+ cmp_int(l->journal_seq, r->journal_seq) ?:
+ cmp_int(l->journal_offset, r->journal_offset);
+}
+
+static int btree_write_buffered_journal_cmp(const void *_l, const void *_r)
+{
+ const struct btree_write_buffered_key *l = _l;
+ const struct btree_write_buffered_key *r = _r;
+
+ return cmp_int(l->journal_seq, r->journal_seq);
+}
+
+static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct btree_write_buffered_key *wb,
+ unsigned commit_flags,
+ bool *write_locked,
+ size_t *fast)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
+ int ret;
+
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ return ret;
+
+ path = iter->path;
+
+ if (!*write_locked) {
+ ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c);
+ if (ret)
+ return ret;
+
+ bch2_btree_node_prep_for_write(trans, path, path->l[0].b);
+ *write_locked = true;
+ }
+
+ if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) {
+ bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+ *write_locked = false;
+ goto trans_commit;
+ }
+
+ bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
+ (*fast)++;
+
+ if (path->ref > 1) {
+ /*
+ * We can't clone a path that has write locks: if the path is
+ * shared, unlock before set_pos(), traverse():
+ */
+ bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+ *write_locked = false;
+ }
+ return 0;
+trans_commit:
+ return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ commit_flags|
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_JOURNAL_RECLAIM);
+}
+
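+/*
+ * Atomically reset the key count and flip the active buffer index, then wait
+ * for writers still holding a reference on the old buffer to drain before the
+ * caller starts flushing it.
+ */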
+static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
+{
+ union btree_write_buffer_state old, new;
+ u64 v = READ_ONCE(wb->state.v);
+
+ do {
+ old.v = new.v = v;
+
+ new.nr = 0;
+ new.idx++;
+ } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
+
+ while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1)
+ cpu_relax();
+
+ smp_mb();
+
+ return old;
+}
+
+/*
+ * Update a btree with a write buffered key using the journal seq of the
+ * original write buffer insert.
+ *
+ * It is not safe to rejournal the key once it has been inserted into the write
+ * buffer because that may break recovery ordering. For example, the key may
+ * have already been modified in the active write buffer in a seq that comes
+ * before the current transaction. If we were to journal this key again and
+ * crash, recovery would process updates in the wrong order.
+ */
+static int
+btree_write_buffered_insert(struct btree_trans *trans,
+ struct btree_write_buffered_key *wb)
+{
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
+ BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags,
+ bool locked)
+{
+ struct bch_fs *c = trans->c;
+ struct journal *j = &c->journal;
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ struct journal_entry_pin pin;
+ struct btree_write_buffered_key *i, *keys;
+ struct btree_iter iter = { NULL };
+ size_t nr = 0, skipped = 0, fast = 0, slowpath = 0;
+ bool write_locked = false;
+ union btree_write_buffer_state s;
+ int ret = 0;
+
+ memset(&pin, 0, sizeof(pin));
+
+ if (!locked && !mutex_trylock(&wb->flush_lock))
+ return 0;
+
+ bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL);
+ bch2_journal_pin_drop(j, &wb->journal_pin);
+
+ s = btree_write_buffer_switch(wb);
+ keys = wb->keys[s.idx];
+ nr = s.nr;
+
+ if (race_fault())
+ goto slowpath;
+
+ /*
+ * We first sort so that we can detect and skip redundant updates, and
+ * then we attempt to flush in sorted btree order, as this is most
+ * efficient.
+ *
+ * However, since we're not flushing in the order they appear in the
+ * journal we won't be able to drop our journal pin until everything is
+ * flushed - which means this could deadlock the journal if we weren't
+ * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail
+ * if it would block taking a journal reservation.
+ *
+ * If that happens, simply skip the key so we can optimistically insert
+ * as many keys as possible in the fast path.
+ */
+ sort(keys, nr, sizeof(keys[0]),
+ btree_write_buffered_key_cmp, NULL);
+
+ for (i = keys; i < keys + nr; i++) {
+ if (i + 1 < keys + nr &&
+ i[0].btree == i[1].btree &&
+ bpos_eq(i[0].k.k.p, i[1].k.k.p)) {
+ skipped++;
+ i->journal_seq = 0;
+ continue;
+ }
+
+ if (write_locked &&
+ (iter.path->btree_id != i->btree ||
+ bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) {
+ bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
+ write_locked = false;
+ }
+
+ if (!iter.path || iter.path->btree_id != i->btree) {
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p,
+ BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
+ }
+
+ bch2_btree_iter_set_pos(&iter, i->k.k.p);
+ iter.path->preserve = false;
+
+ do {
+ ret = bch2_btree_write_buffer_flush_one(trans, &iter, i,
+ commit_flags, &write_locked, &fast);
+ if (!write_locked)
+ bch2_trans_begin(trans);
+ } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
+
+ if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
+ slowpath++;
+ continue;
+ }
+ if (ret)
+ break;
+
+ i->journal_seq = 0;
+ }
+
+ if (write_locked)
+ bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
+ bch2_trans_iter_exit(trans, &iter);
+
+ trace_write_buffer_flush(trans, nr, skipped, fast, wb->size);
+
+ if (slowpath)
+ goto slowpath;
+
+ bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
+out:
+ bch2_journal_pin_drop(j, &pin);
+ mutex_unlock(&wb->flush_lock);
+ return ret;
+slowpath:
+ trace_write_buffer_flush_slowpath(trans, i - keys, nr);
+
+ /*
+ * Now sort the rest by journal seq and bump the journal pin as we go.
+ * The fast path zapped the seq of keys that were successfully flushed so
+ * we can skip those here.
+ */
+ sort(keys, nr, sizeof(keys[0]),
+ btree_write_buffered_journal_cmp,
+ NULL);
+
+ commit_flags &= ~BCH_WATERMARK_MASK;
+ commit_flags |= BCH_WATERMARK_reclaim;
+
+ for (i = keys; i < keys + nr; i++) {
+ if (!i->journal_seq)
+ continue;
+
+ if (i->journal_seq > pin.seq) {
+ struct journal_entry_pin pin2;
+
+ memset(&pin2, 0, sizeof(pin2));
+
+ bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL);
+ bch2_journal_pin_drop(j, &pin);
+ bch2_journal_pin_copy(j, &pin, &pin2, NULL);
+ bch2_journal_pin_drop(j, &pin2);
+ }
+
+ ret = commit_do(trans, NULL, NULL,
+ commit_flags|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_JOURNAL_RECLAIM,
+ btree_write_buffered_insert(trans, i));
+ if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
+ break;
+ }
+
+ goto out;
+}
+
+int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
+{
+ bch2_trans_unlock(trans);
+ mutex_lock(&trans->c->btree_write_buffer.flush_lock);
+ return __bch2_btree_write_buffer_flush(trans, 0, true);
+}
+
+int bch2_btree_write_buffer_flush(struct btree_trans *trans)
+{
+ return __bch2_btree_write_buffer_flush(trans, 0, false);
+}
+
+static int bch2_btree_write_buffer_journal_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ mutex_lock(&wb->flush_lock);
+
+ return bch2_trans_run(c,
+ __bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true));
+}
+
+static inline u64 btree_write_buffer_ref(int idx)
+{
+ return ((union btree_write_buffer_state) {
+ .ref0 = idx == 0,
+ .ref1 = idx == 1,
+ }).v;
+}
+
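+/*
+ * Copy the transaction's write buffered updates into the currently active
+ * buffer: take a reference on that buffer via the packed state word (so a
+ * concurrent flush can't reuse it underneath us), reserve space, copy the
+ * keys, pin the journal sequence they were journalled at, then drop the
+ * reference. Fails with -BCH_ERR_btree_insert_need_flush_buffer if the
+ * buffer is full.
+ */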
+int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ struct btree_write_buffered_key *i;
+ union btree_write_buffer_state old, new;
+ int ret = 0;
+ u64 v;
+
+ trans_for_each_wb_update(trans, i) {
+ EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+
+ i->journal_seq = trans->journal_res.seq;
+ i->journal_offset = trans->journal_res.offset;
+ }
+
+ preempt_disable();
+ v = READ_ONCE(wb->state.v);
+ do {
+ old.v = new.v = v;
+
+ new.v += btree_write_buffer_ref(new.idx);
+ new.nr += trans->nr_wb_updates;
+ if (new.nr > wb->size) {
+ ret = -BCH_ERR_btree_insert_need_flush_buffer;
+ goto out;
+ }
+ } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
+
+ memcpy(wb->keys[new.idx] + old.nr,
+ trans->wb_updates,
+ sizeof(trans->wb_updates[0]) * trans->nr_wb_updates);
+
+ bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin,
+ bch2_btree_write_buffer_journal_flush);
+
+ atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter);
+out:
+ preempt_enable();
+ return ret;
+}
+
+void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal));
+
+ kvfree(wb->keys[1]);
+ kvfree(wb->keys[0]);
+}
+
+int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ mutex_init(&wb->flush_lock);
+ wb->size = c->opts.btree_write_buffer_size;
+
+ wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL);
+ wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL);
+ if (!wb->keys[0] || !wb->keys[1])
+ return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init;
+
+ return 0;
+}
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
new file mode 100644
index 000000000000..322df1c8304e
--- /dev/null
+++ b/fs/bcachefs/btree_write_buffer.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
+#define _BCACHEFS_BTREE_WRITE_BUFFER_H
+
+int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool);
+int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
+int bch2_btree_write_buffer_flush(struct btree_trans *);
+
+int bch2_btree_insert_keys_write_buffer(struct btree_trans *);
+
+void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
+int bch2_fs_btree_write_buffer_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */
diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h
new file mode 100644
index 000000000000..99993ba77aea
--- /dev/null
+++ b/fs/bcachefs/btree_write_buffer_types.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
+#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
+
+#include "journal_types.h"
+
+#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4
+#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
+
+struct btree_write_buffered_key {
+ u64 journal_seq;
+ unsigned journal_offset;
+ enum btree_id btree;
+ __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
+};
+
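+/*
+ * All of the write buffer state is packed into a single 64 bit word so it can
+ * be updated atomically: nr is the number of keys in the active buffer, idx
+ * selects which of the two key arrays is active, and ref0/ref1 count writers
+ * currently copying keys into buffer 0/1 - the flush path waits for the old
+ * buffer's count to drop to zero after switching buffers.
+ */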
+union btree_write_buffer_state {
+ struct {
+ atomic64_t counter;
+ };
+
+ struct {
+ u64 v;
+ };
+
+ struct {
+ u64 nr:23;
+ u64 idx:1;
+ u64 ref0:20;
+ u64 ref1:20;
+ };
+};
+
+struct btree_write_buffer {
+ struct mutex flush_lock;
+ struct journal_entry_pin journal_pin;
+
+ union btree_write_buffer_state state;
+ size_t size;
+
+ struct btree_write_buffered_key *keys[2];
+};
+
+#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
new file mode 100644
index 000000000000..5a91d3189fcf
--- /dev/null
+++ b/fs/bcachefs/buckets.c
@@ -0,0 +1,2170 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Code for manipulating bucket marks for garbage collection.
+ *
+ * Copyright 2014 Datera, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "backpointers.h"
+#include "bset.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
+#include "ec.h"
+#include "error.h"
+#include "inode.h"
+#include "movinggc.h"
+#include "recovery.h"
+#include "reflink.h"
+#include "replicas.h"
+#include "subvolume.h"
+#include "trace.h"
+
+#include <linux/preempt.h>
+
+static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
+ enum bch_data_type data_type,
+ s64 sectors)
+{
+ switch (data_type) {
+ case BCH_DATA_btree:
+ fs_usage->btree += sectors;
+ break;
+ case BCH_DATA_user:
+ case BCH_DATA_parity:
+ fs_usage->data += sectors;
+ break;
+ case BCH_DATA_cached:
+ fs_usage->cached += sectors;
+ break;
+ default:
+ break;
+ }
+}
+
+void bch2_fs_usage_initialize(struct bch_fs *c)
+{
+ struct bch_fs_usage *usage;
+ struct bch_dev *ca;
+ unsigned i;
+
+ percpu_down_write(&c->mark_lock);
+ usage = c->usage_base;
+
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ bch2_fs_usage_acc_to_base(c, i);
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++)
+ usage->reserved += usage->persistent_reserved[i];
+
+ for (i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry *e =
+ cpu_replicas_entry(&c->replicas, i);
+
+ fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
+ }
+
+ for_each_member_device(ca, c, i) {
+ struct bch_dev_usage dev = bch2_dev_usage_read(ca);
+
+ usage->hidden += (dev.d[BCH_DATA_sb].buckets +
+ dev.d[BCH_DATA_journal].buckets) *
+ ca->mi.bucket_size;
+ }
+
+ percpu_up_write(&c->mark_lock);
+}
+
+static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
+ unsigned journal_seq,
+ bool gc)
+{
+ BUG_ON(!gc && !journal_seq);
+
+ return this_cpu_ptr(gc
+ ? ca->usage_gc
+ : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
+}
+
+void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
+{
+ struct bch_fs *c = ca->fs;
+ unsigned seq, i, u64s = dev_usage_u64s();
+
+ do {
+ seq = read_seqcount_begin(&c->usage_lock);
+ memcpy(usage, ca->usage_base, u64s * sizeof(u64));
+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+ acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s);
+ } while (read_seqcount_retry(&c->usage_lock, seq));
+}
+
+u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
+{
+ ssize_t offset = v - (u64 *) c->usage_base;
+ unsigned i, seq;
+ u64 ret;
+
+ BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
+ percpu_rwsem_assert_held(&c->mark_lock);
+
+ do {
+ seq = read_seqcount_begin(&c->usage_lock);
+ ret = *v;
+
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset);
+ } while (read_seqcount_retry(&c->usage_lock, seq));
+
+ return ret;
+}
+
+struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
+{
+ struct bch_fs_usage_online *ret;
+ unsigned nr_replicas = READ_ONCE(c->replicas.nr);
+ unsigned seq, i;
+retry:
+ ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_KERNEL);
+ if (unlikely(!ret))
+ return NULL;
+
+ percpu_down_read(&c->mark_lock);
+
+ if (nr_replicas != c->replicas.nr) {
+ nr_replicas = c->replicas.nr;
+ percpu_up_read(&c->mark_lock);
+ kfree(ret);
+ goto retry;
+ }
+
+ ret->online_reserved = percpu_u64_get(c->online_reserved);
+
+ do {
+ seq = read_seqcount_begin(&c->usage_lock);
+ unsafe_memcpy(&ret->u, c->usage_base,
+ __fs_usage_u64s(nr_replicas) * sizeof(u64),
+ "embedded variable length struct");
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i],
+ __fs_usage_u64s(nr_replicas));
+ } while (read_seqcount_retry(&c->usage_lock, seq));
+
+ return ret;
+}
+
+void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
+{
+ struct bch_dev *ca;
+ unsigned i, u64s = fs_usage_u64s(c);
+
+ BUG_ON(idx >= ARRAY_SIZE(c->usage));
+
+ preempt_disable();
+ write_seqcount_begin(&c->usage_lock);
+
+ acc_u64s_percpu((u64 *) c->usage_base,
+ (u64 __percpu *) c->usage[idx], u64s);
+ percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL) {
+ u64s = dev_usage_u64s();
+
+ acc_u64s_percpu((u64 *) ca->usage_base,
+ (u64 __percpu *) ca->usage[idx], u64s);
+ percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
+ }
+ rcu_read_unlock();
+
+ write_seqcount_end(&c->usage_lock);
+ preempt_enable();
+}
+
+void bch2_fs_usage_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_fs_usage_online *fs_usage)
+{
+ unsigned i;
+
+ prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity);
+
+ prt_printf(out, "hidden:\t\t\t\t%llu\n",
+ fs_usage->u.hidden);
+ prt_printf(out, "data:\t\t\t\t%llu\n",
+ fs_usage->u.data);
+ prt_printf(out, "cached:\t\t\t\t%llu\n",
+ fs_usage->u.cached);
+ prt_printf(out, "reserved:\t\t\t%llu\n",
+ fs_usage->u.reserved);
+ prt_printf(out, "nr_inodes:\t\t\t%llu\n",
+ fs_usage->u.nr_inodes);
+ prt_printf(out, "online reserved:\t\t%llu\n",
+ fs_usage->online_reserved);
+
+ for (i = 0;
+ i < ARRAY_SIZE(fs_usage->u.persistent_reserved);
+ i++) {
+ prt_printf(out, "%u replicas:\n", i + 1);
+ prt_printf(out, "\treserved:\t\t%llu\n",
+ fs_usage->u.persistent_reserved[i]);
+ }
+
+ for (i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry *e =
+ cpu_replicas_entry(&c->replicas, i);
+
+ prt_printf(out, "\t");
+ bch2_replicas_entry_to_text(out, e);
+ prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
+ }
+}
+
+static u64 reserve_factor(u64 r)
+{
+ return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
+}
+
+u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
+{
+ return min(fs_usage->u.hidden +
+ fs_usage->u.btree +
+ fs_usage->u.data +
+ reserve_factor(fs_usage->u.reserved +
+ fs_usage->online_reserved),
+ c->capacity);
+}
+
+static struct bch_fs_usage_short
+__bch2_fs_usage_read_short(struct bch_fs *c)
+{
+ struct bch_fs_usage_short ret;
+ u64 data, reserved;
+
+ ret.capacity = c->capacity -
+ bch2_fs_usage_read_one(c, &c->usage_base->hidden);
+
+ data = bch2_fs_usage_read_one(c, &c->usage_base->data) +
+ bch2_fs_usage_read_one(c, &c->usage_base->btree);
+ reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
+ percpu_u64_get(c->online_reserved);
+
+ ret.used = min(ret.capacity, data + reserve_factor(reserved));
+ ret.free = ret.capacity - ret.used;
+
+ ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes);
+
+ return ret;
+}
+
+struct bch_fs_usage_short
+bch2_fs_usage_read_short(struct bch_fs *c)
+{
+ struct bch_fs_usage_short ret;
+
+ percpu_down_read(&c->mark_lock);
+ ret = __bch2_fs_usage_read_short(c);
+ percpu_up_read(&c->mark_lock);
+
+ return ret;
+}
+
+void bch2_dev_usage_init(struct bch_dev *ca)
+{
+ ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket;
+}
+
+static inline int bucket_sectors_fragmented(struct bch_dev *ca,
+ struct bch_alloc_v4 a)
+{
+ return a.dirty_sectors
+ ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors)
+ : 0;
+}
+
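+/*
+ * Apply the delta between an old and new alloc key to the per-device usage
+ * counters - bucket and sector counts per data type, erasure coded buckets,
+ * fragmented sectors - and to the filesystem-wide hidden counter for
+ * superblock/journal buckets.
+ */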
+static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
+ struct bch_alloc_v4 old,
+ struct bch_alloc_v4 new,
+ u64 journal_seq, bool gc)
+{
+ struct bch_fs_usage *fs_usage;
+ struct bch_dev_usage *u;
+
+ preempt_disable();
+ fs_usage = fs_usage_ptr(c, journal_seq, gc);
+
+ if (data_type_is_hidden(old.data_type))
+ fs_usage->hidden -= ca->mi.bucket_size;
+ if (data_type_is_hidden(new.data_type))
+ fs_usage->hidden += ca->mi.bucket_size;
+
+ u = dev_usage_ptr(ca, journal_seq, gc);
+
+ u->d[old.data_type].buckets--;
+ u->d[new.data_type].buckets++;
+
+ u->buckets_ec -= (int) !!old.stripe;
+ u->buckets_ec += (int) !!new.stripe;
+
+ u->d[old.data_type].sectors -= old.dirty_sectors;
+ u->d[new.data_type].sectors += new.dirty_sectors;
+
+ u->d[BCH_DATA_cached].sectors += new.cached_sectors;
+ u->d[BCH_DATA_cached].sectors -= old.cached_sectors;
+
+ u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old);
+ u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
+
+ preempt_enable();
+}
+
+static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
+ struct bucket old, struct bucket new,
+ u64 journal_seq, bool gc)
+{
+ struct bch_alloc_v4 old_a = {
+ .gen = old.gen,
+ .data_type = old.data_type,
+ .dirty_sectors = old.dirty_sectors,
+ .cached_sectors = old.cached_sectors,
+ .stripe = old.stripe,
+ };
+ struct bch_alloc_v4 new_a = {
+ .gen = new.gen,
+ .data_type = new.data_type,
+ .dirty_sectors = new.dirty_sectors,
+ .cached_sectors = new.cached_sectors,
+ .stripe = new.stripe,
+ };
+
+ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc);
+}
+
+static inline int __update_replicas(struct bch_fs *c,
+ struct bch_fs_usage *fs_usage,
+ struct bch_replicas_entry *r,
+ s64 sectors)
+{
+ int idx = bch2_replicas_entry_idx(c, r);
+
+ if (idx < 0)
+ return -1;
+
+ fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
+ fs_usage->replicas[idx] += sectors;
+ return 0;
+}
+
+static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_replicas_entry *r, s64 sectors,
+ unsigned journal_seq, bool gc)
+{
+ struct bch_fs_usage *fs_usage;
+ int idx, ret = 0;
+ struct printbuf buf = PRINTBUF;
+
+ percpu_down_read(&c->mark_lock);
+
+ idx = bch2_replicas_entry_idx(c, r);
+ if (idx < 0 &&
+ fsck_err(c, ptr_to_missing_replicas_entry,
+ "no replicas entry\n while marking %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ percpu_up_read(&c->mark_lock);
+ ret = bch2_mark_replicas(c, r);
+ percpu_down_read(&c->mark_lock);
+
+ if (ret)
+ goto err;
+ idx = bch2_replicas_entry_idx(c, r);
+ }
+ if (idx < 0) {
+ ret = -1;
+ goto err;
+ }
+
+ preempt_disable();
+ fs_usage = fs_usage_ptr(c, journal_seq, gc);
+ fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
+ fs_usage->replicas[idx] += sectors;
+ preempt_enable();
+err:
+fsck_err:
+ percpu_up_read(&c->mark_lock);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static inline int update_cached_sectors(struct bch_fs *c,
+ struct bkey_s_c k,
+ unsigned dev, s64 sectors,
+ unsigned journal_seq, bool gc)
+{
+ struct bch_replicas_padded r;
+
+ bch2_replicas_entry_cached(&r.e, dev);
+
+ return update_replicas(c, k, &r.e, sectors, journal_seq, gc);
+}
+
+static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more,
+ gfp_t gfp)
+{
+ struct replicas_delta_list *d = trans->fs_usage_deltas;
+ unsigned new_size = d ? (d->size + more) * 2 : 128;
+ unsigned alloc_size = sizeof(*d) + new_size;
+
+ WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX);
+
+ if (!d || d->used + more > d->size) {
+ d = krealloc(d, alloc_size, gfp|__GFP_ZERO);
+
+ if (unlikely(!d)) {
+ if (alloc_size > REPLICAS_DELTA_LIST_MAX)
+ return -ENOMEM;
+
+ d = mempool_alloc(&trans->c->replicas_delta_pool, gfp);
+ if (!d)
+ return -ENOMEM;
+
+ memset(d, 0, REPLICAS_DELTA_LIST_MAX);
+
+ if (trans->fs_usage_deltas)
+ memcpy(d, trans->fs_usage_deltas,
+ trans->fs_usage_deltas->size + sizeof(*d));
+
+ new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d);
+ kfree(trans->fs_usage_deltas);
+ }
+
+ d->size = new_size;
+ trans->fs_usage_deltas = d;
+ }
+
+ return 0;
+}
+
+int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
+{
+ return allocate_dropping_locks_errcode(trans,
+ __replicas_deltas_realloc(trans, more, _gfp));
+}
+
+static inline int update_replicas_list(struct btree_trans *trans,
+ struct bch_replicas_entry *r,
+ s64 sectors)
+{
+ struct replicas_delta_list *d;
+ struct replicas_delta *n;
+ unsigned b;
+ int ret;
+
+ if (!sectors)
+ return 0;
+
+ b = replicas_entry_bytes(r) + 8;
+ ret = bch2_replicas_deltas_realloc(trans, b);
+ if (ret)
+ return ret;
+
+ d = trans->fs_usage_deltas;
+ n = (void *) d->d + d->used;
+ n->delta = sectors;
+ unsafe_memcpy((void *) n + offsetof(struct replicas_delta, r),
+ r, replicas_entry_bytes(r),
+ "flexible array member embedded in struct with padding");
+ bch2_replicas_entry_sort(&n->r);
+ d->used += b;
+ return 0;
+}
+
+static inline int update_cached_sectors_list(struct btree_trans *trans,
+ unsigned dev, s64 sectors)
+{
+ struct bch_replicas_padded r;
+
+ bch2_replicas_entry_cached(&r.e, dev);
+
+ return update_replicas_list(trans, &r.e, sectors);
+}
+
+int bch2_mark_alloc(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ bool gc = flags & BTREE_TRIGGER_GC;
+ u64 journal_seq = trans->journal_res.seq;
+ u64 bucket_journal_seq;
+ struct bch_fs *c = trans->c;
+ struct bch_alloc_v4 old_a_convert, new_a_convert;
+ const struct bch_alloc_v4 *old_a, *new_a;
+ struct bch_dev *ca;
+ int ret = 0;
+
+ /*
+ * alloc btree is read in by bch2_alloc_read, not gc:
+ */
+ if ((flags & BTREE_TRIGGER_GC) &&
+ !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
+ return 0;
+
+ if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans,
+ "alloc key for invalid device or bucket"))
+ return -EIO;
+
+ ca = bch_dev_bkey_exists(c, new.k->p.inode);
+
+ old_a = bch2_alloc_to_v4(old, &old_a_convert);
+ new_a = bch2_alloc_to_v4(new, &new_a_convert);
+
+ bucket_journal_seq = new_a->journal_seq;
+
+ if ((flags & BTREE_TRIGGER_INSERT) &&
+ data_type_is_empty(old_a->data_type) !=
+ data_type_is_empty(new_a->data_type) &&
+ new.k->type == KEY_TYPE_alloc_v4) {
+ struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v;
+
+ EBUG_ON(!journal_seq);
+
+ /*
+ * If the btree updates referring to a bucket weren't flushed
+ * before the bucket became empty again, then we don't have
+ * to wait on a journal flush before we can reuse the bucket:
+ */
+ v->journal_seq = bucket_journal_seq =
+ data_type_is_empty(new_a->data_type) &&
+ (journal_seq == v->journal_seq ||
+ bch2_journal_noflush_seq(&c->journal, v->journal_seq))
+ ? 0 : journal_seq;
+ }
+
+ if (!data_type_is_empty(old_a->data_type) &&
+ data_type_is_empty(new_a->data_type) &&
+ bucket_journal_seq) {
+ ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk,
+ new.k->p.inode, new.k->p.offset,
+ bucket_journal_seq);
+ if (ret) {
+ bch2_fs_fatal_error(c,
+ "error setting bucket_needs_journal_commit: %i", ret);
+ return ret;
+ }
+ }
+
+ percpu_down_read(&c->mark_lock);
+ if (!gc && new_a->gen != old_a->gen)
+ *bucket_gen(ca, new.k->p.offset) = new_a->gen;
+
+ bch2_dev_usage_update(c, ca, *old_a, *new_a, journal_seq, gc);
+
+ if (gc) {
+ struct bucket *g = gc_bucket(ca, new.k->p.offset);
+
+ bucket_lock(g);
+
+ g->gen_valid = 1;
+ g->gen = new_a->gen;
+ g->data_type = new_a->data_type;
+ g->stripe = new_a->stripe;
+ g->stripe_redundancy = new_a->stripe_redundancy;
+ g->dirty_sectors = new_a->dirty_sectors;
+ g->cached_sectors = new_a->cached_sectors;
+
+ bucket_unlock(g);
+ }
+ percpu_up_read(&c->mark_lock);
+
+ /*
+ * need to know if we're getting called from the invalidate path or
+ * not:
+ */
+
+ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
+ old_a->cached_sectors) {
+ ret = update_cached_sectors(c, new, ca->dev_idx,
+ -((s64) old_a->cached_sectors),
+ journal_seq, gc);
+ if (ret) {
+ bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
+ __func__);
+ return ret;
+ }
+ }
+
+ if (new_a->data_type == BCH_DATA_free &&
+ (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
+ closure_wake_up(&c->freelist_wait);
+
+ if (new_a->data_type == BCH_DATA_need_discard &&
+ (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk))
+ bch2_do_discards(c);
+
+ if (old_a->data_type != BCH_DATA_cached &&
+ new_a->data_type == BCH_DATA_cached &&
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
+ bch2_do_invalidates(c);
+
+ if (new_a->data_type == BCH_DATA_need_gc_gens)
+ bch2_do_gc_gens(c);
+
+ return 0;
+}
+
+int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, enum bch_data_type data_type,
+ unsigned sectors, struct gc_pos pos,
+ unsigned flags)
+{
+ struct bucket old, new, *g;
+ int ret = 0;
+
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+ BUG_ON(data_type != BCH_DATA_sb &&
+ data_type != BCH_DATA_journal);
+
+ /*
+ * Backup superblock might be past the end of our normal usable space:
+ */
+ if (b >= ca->mi.nbuckets)
+ return 0;
+
+ percpu_down_read(&c->mark_lock);
+ g = gc_bucket(ca, b);
+
+ bucket_lock(g);
+ old = *g;
+
+ if (bch2_fs_inconsistent_on(g->data_type &&
+ g->data_type != data_type, c,
+ "different types of data in same bucket: %s, %s",
+ bch2_data_types[g->data_type],
+ bch2_data_types[data_type])) {
+ ret = -EIO;
+ goto err;
+ }
+
+ if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
+ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size",
+ ca->dev_idx, b, g->gen,
+ bch2_data_types[g->data_type ?: data_type],
+ g->dirty_sectors, sectors)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ g->data_type = data_type;
+ g->dirty_sectors += sectors;
+ new = *g;
+err:
+ bucket_unlock(g);
+ if (!ret)
+ bch2_dev_usage_update_m(c, ca, old, new, 0, true);
+ percpu_up_read(&c->mark_lock);
+ return ret;
+}
+
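+/*
+ * Sanity check a pointer against the bucket it points into before adjusting
+ * sector counts: catches pointer generations newer than the bucket gen, stale
+ * dirty pointers, mismatched data types within a bucket, and sector count
+ * overflow. Returns 1 (not an error) for stale cached pointers, which the
+ * callers simply skip.
+ */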
+static int check_bucket_ref(struct btree_trans *trans,
+ struct bkey_s_c k,
+ const struct bch_extent_ptr *ptr,
+ s64 sectors, enum bch_data_type ptr_data_type,
+ u8 b_gen, u8 bucket_data_type,
+ u32 dirty_sectors, u32 cached_sectors)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
+ u32 bucket_sectors = !ptr->cached
+ ? dirty_sectors
+ : cached_sectors;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (bucket_data_type == BCH_DATA_cached)
+ bucket_data_type = BCH_DATA_user;
+
+ if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) ||
+ (bucket_data_type == BCH_DATA_user && ptr_data_type == BCH_DATA_stripe))
+ bucket_data_type = ptr_data_type = BCH_DATA_stripe;
+
+ if (gen_after(ptr->gen, b_gen)) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
+ "while marking %s",
+ ptr->dev, bucket_nr, b_gen,
+ bch2_data_types[bucket_data_type ?: ptr_data_type],
+ ptr->gen,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
+ }
+
+ if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_ptr_too_stale,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ "while marking %s",
+ ptr->dev, bucket_nr, b_gen,
+ bch2_data_types[bucket_data_type ?: ptr_data_type],
+ ptr->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
+ }
+
+ if (b_gen != ptr->gen && !ptr->cached) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_stale_dirty_ptr,
+ "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
+ "while marking %s",
+ ptr->dev, bucket_nr, b_gen,
+ *bucket_gen(ca, bucket_nr),
+ bch2_data_types[bucket_data_type ?: ptr_data_type],
+ ptr->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
+ }
+
+ if (b_gen != ptr->gen) {
+ ret = 1;
+ goto out;
+ }
+
+ if (!data_type_is_empty(bucket_data_type) &&
+ ptr_data_type &&
+ bucket_data_type != ptr_data_type) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_ptr_bucket_data_type_mismatch,
+ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ ptr->dev, bucket_nr, b_gen,
+ bch2_data_types[bucket_data_type],
+ bch2_data_types[ptr_data_type],
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
+ }
+
+ if ((u64) bucket_sectors + sectors > U32_MAX) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_bucket_sector_count_overflow,
+ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
+ "while marking %s",
+ ptr->dev, bucket_nr, b_gen,
+ bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bucket_sectors, sectors,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EIO;
+ goto err;
+ }
+out:
+ printbuf_exit(&buf);
+ return ret;
+err:
+ bch2_dump_trans_updates(trans);
+ goto out;
+}
+
+static int mark_stripe_bucket(struct btree_trans *trans,
+ struct bkey_s_c k,
+ unsigned ptr_idx,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ u64 journal_seq = trans->journal_res.seq;
+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+ unsigned nr_data = s->nr_blocks - s->nr_redundant;
+ bool parity = ptr_idx >= nr_data;
+ enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
+ s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
+ const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bucket old, new, *g;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+ /* XXX doesn't handle deletion */
+
+ percpu_down_read(&c->mark_lock);
+ g = PTR_GC_BUCKET(ca, ptr);
+
+ if (g->dirty_sectors ||
+ (g->stripe && g->stripe != k.k->p.offset)) {
+ bch2_fs_inconsistent(c,
+ "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
+ ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EINVAL;
+ goto err;
+ }
+
+ bucket_lock(g);
+ old = *g;
+
+ ret = check_bucket_ref(trans, k, ptr, sectors, data_type,
+ g->gen, g->data_type,
+ g->dirty_sectors, g->cached_sectors);
+ if (ret)
+ goto err;
+
+ g->data_type = data_type;
+ g->dirty_sectors += sectors;
+
+ g->stripe = k.k->p.offset;
+ g->stripe_redundancy = s->nr_redundant;
+ new = *g;
+err:
+ bucket_unlock(g);
+ if (!ret)
+ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
+ percpu_up_read(&c->mark_lock);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int __mark_pointer(struct btree_trans *trans,
+ struct bkey_s_c k,
+ const struct bch_extent_ptr *ptr,
+ s64 sectors, enum bch_data_type ptr_data_type,
+ u8 bucket_gen, u8 *bucket_data_type,
+ u32 *dirty_sectors, u32 *cached_sectors)
+{
+ u32 *dst_sectors = !ptr->cached
+ ? dirty_sectors
+ : cached_sectors;
+ int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type,
+ bucket_gen, *bucket_data_type,
+ *dirty_sectors, *cached_sectors);
+
+ if (ret)
+ return ret;
+
+ *dst_sectors += sectors;
+
+ if (!*dirty_sectors && !*cached_sectors)
+ *bucket_data_type = 0;
+ else if (*bucket_data_type != BCH_DATA_stripe)
+ *bucket_data_type = ptr_data_type;
+
+ return 0;
+}
+
+static int bch2_mark_pointer(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k,
+ struct extent_ptr_decoded p,
+ s64 sectors,
+ unsigned flags)
+{
+ u64 journal_seq = trans->journal_res.seq;
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ struct bucket old, new, *g;
+ enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
+ u8 bucket_data_type;
+ int ret = 0;
+
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+ percpu_down_read(&c->mark_lock);
+ g = PTR_GC_BUCKET(ca, &p.ptr);
+ bucket_lock(g);
+ old = *g;
+
+ bucket_data_type = g->data_type;
+ ret = __mark_pointer(trans, k, &p.ptr, sectors,
+ data_type, g->gen,
+ &bucket_data_type,
+ &g->dirty_sectors,
+ &g->cached_sectors);
+ if (!ret)
+ g->data_type = bucket_data_type;
+
+ new = *g;
+ bucket_unlock(g);
+ if (!ret)
+ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
+ percpu_up_read(&c->mark_lock);
+
+ return ret;
+}
+
+static int bch2_mark_stripe_ptr(struct btree_trans *trans,
+ struct bkey_s_c k,
+ struct bch_extent_stripe_ptr p,
+ enum bch_data_type data_type,
+ s64 sectors,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_replicas_padded r;
+ struct gc_stripe *m;
+
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+ m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL);
+ if (!m) {
+ bch_err(c, "error allocating memory for gc_stripes, idx %llu",
+ (u64) p.idx);
+ return -BCH_ERR_ENOMEM_mark_stripe_ptr;
+ }
+
+ mutex_lock(&c->ec_stripes_heap_lock);
+
+ if (!m || !m->alive) {
+ mutex_unlock(&c->ec_stripes_heap_lock);
+ bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
+ (u64) p.idx);
+ bch2_inconsistent_error(c);
+ return -EIO;
+ }
+
+ m->block_sectors[p.block] += sectors;
+
+ r = m->r;
+ mutex_unlock(&c->ec_stripes_heap_lock);
+
+ r.e.data_type = data_type;
+ update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true);
+
+ return 0;
+}
+
+static int __mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
+{
+ u64 journal_seq = trans->journal_res.seq;
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct bch_replicas_padded r;
+ enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
+ ? BCH_DATA_btree
+ : BCH_DATA_user;
+ s64 sectors = bkey_is_btree_ptr(k.k)
+ ? btree_sectors(c)
+ : k.k->size;
+ s64 dirty_sectors = 0;
+ bool stale;
+ int ret;
+
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+ r.e.data_type = data_type;
+ r.e.nr_devs = 0;
+ r.e.nr_required = 1;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ s64 disk_sectors = ptr_disk_sectors(sectors, p);
+
+ if (flags & BTREE_TRIGGER_OVERWRITE)
+ disk_sectors = -disk_sectors;
+
+ ret = bch2_mark_pointer(trans, btree_id, level, k, p, disk_sectors, flags);
+ if (ret < 0)
+ return ret;
+
+ stale = ret > 0;
+
+ if (p.ptr.cached) {
+ if (!stale) {
+ ret = update_cached_sectors(c, k, p.ptr.dev,
+ disk_sectors, journal_seq, true);
+ if (ret) {
+ bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
+ __func__);
+ return ret;
+ }
+ }
+ } else if (!p.has_ec) {
+ dirty_sectors += disk_sectors;
+ r.e.devs[r.e.nr_devs++] = p.ptr.dev;
+ } else {
+ ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type,
+ disk_sectors, flags);
+ if (ret)
+ return ret;
+
+ /*
+ * There may be other dirty pointers in this extent, but
+ * if so they're not required for mounting if we have an
+ * erasure coded pointer in this extent:
+ */
+ r.e.nr_required = 0;
+ }
+ }
+
+ if (r.e.nr_devs) {
+ ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true);
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+int bch2_mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags);
+}
+
+int bch2_mark_stripe(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ bool gc = flags & BTREE_TRIGGER_GC;
+ u64 journal_seq = trans->journal_res.seq;
+ struct bch_fs *c = trans->c;
+ u64 idx = new.k->p.offset;
+ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(old).v : NULL;
+ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(new).v : NULL;
+ unsigned i;
+ int ret;
+
+ BUG_ON(gc && old_s);
+
+ if (!gc) {
+ struct stripe *m = genradix_ptr(&c->stripes, idx);
+
+ if (!m) {
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf1, c, old);
+ bch2_bkey_val_to_text(&buf2, c, new);
+ bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
+ "old %s\n"
+ "new %s", idx, buf1.buf, buf2.buf);
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ bch2_inconsistent_error(c);
+ return -1;
+ }
+
+ if (!new_s) {
+ bch2_stripes_heap_del(c, m, idx);
+
+ memset(m, 0, sizeof(*m));
+ } else {
+ m->sectors = le16_to_cpu(new_s->sectors);
+ m->algorithm = new_s->algorithm;
+ m->nr_blocks = new_s->nr_blocks;
+ m->nr_redundant = new_s->nr_redundant;
+ m->blocks_nonempty = 0;
+
+ for (i = 0; i < new_s->nr_blocks; i++)
+ m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
+
+ if (!old_s)
+ bch2_stripes_heap_insert(c, m, idx);
+ else
+ bch2_stripes_heap_update(c, m, idx);
+ }
+ } else {
+ struct gc_stripe *m =
+ genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
+
+ if (!m) {
+ bch_err(c, "error allocating memory for gc_stripes, idx %llu",
+ idx);
+ return -BCH_ERR_ENOMEM_mark_stripe;
+ }
+ /*
+ * This will be wrong when we bring back runtime gc: we should
+ * be unmarking the old key and then marking the new key
+ */
+ m->alive = true;
+ m->sectors = le16_to_cpu(new_s->sectors);
+ m->nr_blocks = new_s->nr_blocks;
+ m->nr_redundant = new_s->nr_redundant;
+
+ for (i = 0; i < new_s->nr_blocks; i++)
+ m->ptrs[i] = new_s->ptrs[i];
+
+ bch2_bkey_to_replicas(&m->r.e, new);
+
+ /*
+ * gc recalculates this field from stripe ptr
+ * references:
+ */
+ memset(m->block_sectors, 0, sizeof(m->block_sectors));
+
+ for (i = 0; i < new_s->nr_blocks; i++) {
+ ret = mark_stripe_bucket(trans, new, i, flags);
+ if (ret)
+ return ret;
+ }
+
+ ret = update_replicas(c, new, &m->r.e,
+ ((s64) m->sectors * m->nr_redundant),
+ journal_seq, gc);
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, new);
+ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
+ printbuf_exit(&buf);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int __mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_fs_usage *fs_usage;
+ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
+ s64 sectors = (s64) k.k->size;
+
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+ if (flags & BTREE_TRIGGER_OVERWRITE)
+ sectors = -sectors;
+ sectors *= replicas;
+
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
+
+ fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC);
+ replicas = clamp_t(unsigned, replicas, 1,
+ ARRAY_SIZE(fs_usage->persistent_reserved));
+
+ fs_usage->reserved += sectors;
+ fs_usage->persistent_reserved[replicas - 1] += sectors;
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
+
+ return 0;
+}
+
+int bch2_mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags);
+}
+
+static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ u64 start, u64 end,
+ u64 *idx, unsigned flags, size_t r_idx)
+{
+ struct bch_fs *c = trans->c;
+ struct reflink_gc *r;
+ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ u64 next_idx = end;
+ s64 ret = 0;
+ struct printbuf buf = PRINTBUF;
+
+ if (r_idx >= c->reflink_gc_nr)
+ goto not_found;
+
+ r = genradix_ptr(&c->reflink_gc_table, r_idx);
+ next_idx = min(next_idx, r->offset - r->size);
+ if (*idx < next_idx)
+ goto not_found;
+
+ BUG_ON((s64) r->refcount + add < 0);
+
+ r->refcount += add;
+ *idx = r->offset;
+ return 0;
+not_found:
+ if (fsck_err(c, reflink_p_to_missing_reflink_v,
+ "pointer to missing indirect extent\n"
+ " %s\n"
+ " missing range %llu-%llu",
+ (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
+ *idx, next_idx)) {
+ struct bkey_i_error *new;
+
+ new = bch2_trans_kmalloc(trans, sizeof(*new));
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto err;
+
+ bkey_init(&new->k);
+ new->k.type = KEY_TYPE_error;
+ new->k.p = bkey_start_pos(p.k);
+ new->k.p.offset += *idx - start;
+ bch2_key_resize(&new->k, next_idx - *idx);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i,
+ BTREE_TRIGGER_NORUN);
+ }
+
+ *idx = next_idx;
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int __mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ struct reflink_gc *ref;
+ size_t l, r, m;
+ u64 idx = le64_to_cpu(p.v->idx), start = idx;
+ u64 end = le64_to_cpu(p.v->idx) + p.k->size;
+ int ret = 0;
+
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_reflink_p_fix) {
+ idx -= le32_to_cpu(p.v->front_pad);
+ end += le32_to_cpu(p.v->back_pad);
+ }
+
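+	/*
+	 * Binary search the reflink_gc table (sorted by offset) for the first
+	 * entry that could contain idx:
+	 */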
+ l = 0;
+ r = c->reflink_gc_nr;
+ while (l < r) {
+ m = l + (r - l) / 2;
+
+ ref = genradix_ptr(&c->reflink_gc_table, m);
+ if (ref->offset <= idx)
+ l = m + 1;
+ else
+ r = m;
+ }
+
+ while (idx < end && !ret)
+ ret = __bch2_mark_reflink_p(trans, p, start, end,
+ &idx, flags, l++);
+
+ return ret;
+}
+
+int bch2_mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags);
+}
+
+void bch2_trans_fs_usage_revert(struct btree_trans *trans,
+ struct replicas_delta_list *deltas)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_fs_usage *dst;
+ struct replicas_delta *d, *top = (void *) deltas->d + deltas->used;
+ s64 added = 0;
+ unsigned i;
+
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
+ dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+
+ /* revert changes: */
+ for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
+ switch (d->r.data_type) {
+ case BCH_DATA_btree:
+ case BCH_DATA_user:
+ case BCH_DATA_parity:
+ added += d->delta;
+ }
+ BUG_ON(__update_replicas(c, dst, &d->r, -d->delta));
+ }
+
+ dst->nr_inodes -= deltas->nr_inodes;
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+ added -= deltas->persistent_reserved[i];
+ dst->reserved -= deltas->persistent_reserved[i];
+ dst->persistent_reserved[i] -= deltas->persistent_reserved[i];
+ }
+
+ if (added > 0) {
+ trans->disk_res->sectors += added;
+ this_cpu_add(*c->online_reserved, added);
+ }
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
+}
+
+int bch2_trans_fs_usage_apply(struct btree_trans *trans,
+ struct replicas_delta_list *deltas)
+{
+ struct bch_fs *c = trans->c;
+ static int warned_disk_usage = 0;
+ bool warn = false;
+ u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
+ struct replicas_delta *d, *d2;
+ struct replicas_delta *top = (void *) deltas->d + deltas->used;
+ struct bch_fs_usage *dst;
+ s64 added = 0, should_not_have_added;
+ unsigned i;
+
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
+ dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+
+ for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
+ switch (d->r.data_type) {
+ case BCH_DATA_btree:
+ case BCH_DATA_user:
+ case BCH_DATA_parity:
+ added += d->delta;
+ }
+
+ if (__update_replicas(c, dst, &d->r, d->delta))
+ goto need_mark;
+ }
+
+ dst->nr_inodes += deltas->nr_inodes;
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+ added += deltas->persistent_reserved[i];
+ dst->reserved += deltas->persistent_reserved[i];
+ dst->persistent_reserved[i] += deltas->persistent_reserved[i];
+ }
+
+ /*
+ * Not allowed to reduce sectors_available except by getting a
+ * reservation:
+ */
+ should_not_have_added = added - (s64) disk_res_sectors;
+ if (unlikely(should_not_have_added > 0)) {
+ u64 old, new, v = atomic64_read(&c->sectors_available);
+
+ do {
+ old = v;
+ new = max_t(s64, 0, old - should_not_have_added);
+ } while ((v = atomic64_cmpxchg(&c->sectors_available,
+ old, new)) != old);
+
+ added -= should_not_have_added;
+ warn = true;
+ }
+
+ if (added > 0) {
+ trans->disk_res->sectors -= added;
+ this_cpu_sub(*c->online_reserved, added);
+ }
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
+
+ if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
+ bch2_trans_inconsistent(trans,
+			"disk usage increased %lli more than %llu sectors reserved",
+ should_not_have_added, disk_res_sectors);
+ return 0;
+need_mark:
+ /* revert changes: */
+ for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2))
+ BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta));
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
+ return -1;
+}
+
+/* trans_mark: */
+
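+/*
+ * Transactional counterpart of bch2_mark_pointer(): updates the pointer's
+ * bucket in the alloc btree, and the backpointer for non cached pointers, so
+ * the accounting change is journalled atomically with the extent:
+ */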
+static inline int bch2_trans_mark_pointer(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p,
+ unsigned flags)
+{
+ bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a;
+ struct bpos bucket;
+ struct bch_backpointer bp;
+ s64 sectors;
+ int ret;
+
+ bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp);
+ sectors = bp.bucket_len;
+ if (!insert)
+ sectors = -sectors;
+
+ a = bch2_trans_start_alloc_update(trans, &iter, bucket);
+ if (IS_ERR(a))
+ return PTR_ERR(a);
+
+ ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type,
+ a->v.gen, &a->v.data_type,
+ &a->v.dirty_sectors, &a->v.cached_sectors) ?:
+ bch2_trans_update(trans, &iter, &a->k_i, 0);
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ return ret;
+
+ if (!p.ptr.cached) {
+ ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
+ struct extent_ptr_decoded p,
+ s64 sectors, enum bch_data_type data_type)
+{
+ struct btree_iter iter;
+ struct bkey_i_stripe *s;
+ struct bch_replicas_padded r;
+ int ret = 0;
+
+ s = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_stripes, POS(0, p.ec.idx),
+ BTREE_ITER_WITH_UPDATES, stripe);
+ ret = PTR_ERR_OR_ZERO(s);
+ if (unlikely(ret)) {
+ bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
+ "pointer to nonexistent stripe %llu",
+ (u64) p.ec.idx);
+ goto err;
+ }
+
+ if (!bch2_ptr_matches_stripe(&s->v, p)) {
+ bch2_trans_inconsistent(trans,
+ "stripe pointer doesn't match stripe %llu",
+ (u64) p.ec.idx);
+ ret = -EIO;
+ goto err;
+ }
+
+ stripe_blockcount_set(&s->v, p.ec.block,
+ stripe_blockcount_get(&s->v, p.ec.block) +
+ sectors);
+
+ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
+ r.e.data_type = data_type;
+ ret = update_replicas_list(trans, &r.e, sectors);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int __trans_mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct bch_replicas_padded r;
+ enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
+ ? BCH_DATA_btree
+ : BCH_DATA_user;
+ s64 sectors = bkey_is_btree_ptr(k.k)
+ ? btree_sectors(c)
+ : k.k->size;
+ s64 dirty_sectors = 0;
+ bool stale;
+ int ret = 0;
+
+ r.e.data_type = data_type;
+ r.e.nr_devs = 0;
+ r.e.nr_required = 1;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ s64 disk_sectors = ptr_disk_sectors(sectors, p);
+
+ if (flags & BTREE_TRIGGER_OVERWRITE)
+ disk_sectors = -disk_sectors;
+
+ ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags);
+ if (ret < 0)
+ return ret;
+
+ stale = ret > 0;
+
+ if (p.ptr.cached) {
+ if (!stale) {
+ ret = update_cached_sectors_list(trans, p.ptr.dev,
+ disk_sectors);
+ if (ret)
+ return ret;
+ }
+ } else if (!p.has_ec) {
+ dirty_sectors += disk_sectors;
+ r.e.devs[r.e.nr_devs++] = p.ptr.dev;
+ } else {
+ ret = bch2_trans_mark_stripe_ptr(trans, p,
+ disk_sectors, data_type);
+ if (ret)
+ return ret;
+
+ r.e.nr_required = 0;
+ }
+ }
+
+ if (r.e.nr_devs)
+ ret = update_replicas_list(trans, &r.e, dirty_sectors);
+
+ return ret;
+}
+
+int bch2_trans_mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) -
+ (int) bch2_bkey_needs_rebalance(c, old);
+
+ if (mod) {
+ int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0);
+ if (ret)
+ return ret;
+ }
+
+ return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags);
+}
+
+static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
+ struct bkey_s_c_stripe s,
+ unsigned idx, bool deleting)
+{
+ struct bch_fs *c = trans->c;
+ const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a;
+ enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
+ ? BCH_DATA_parity : 0;
+ s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
+ int ret = 0;
+
+ if (deleting)
+ sectors = -sectors;
+
+ a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr));
+ if (IS_ERR(a))
+ return PTR_ERR(a);
+
+ ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type,
+ a->v.gen, a->v.data_type,
+ a->v.dirty_sectors, a->v.cached_sectors);
+ if (ret)
+ goto err;
+
+ if (!deleting) {
+ if (bch2_trans_inconsistent_on(a->v.stripe ||
+ a->v.stripe_redundancy, trans,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_types[a->v.data_type],
+ a->v.dirty_sectors,
+ a->v.stripe, s.k->p.offset)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_types[a->v.data_type],
+ a->v.dirty_sectors,
+ s.k->p.offset)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ a->v.stripe = s.k->p.offset;
+ a->v.stripe_redundancy = s.v->nr_redundant;
+ a->v.data_type = BCH_DATA_stripe;
+ } else {
+ if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset ||
+ a->v.stripe_redundancy != s.v->nr_redundant, trans,
+ "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ s.k->p.offset, a->v.stripe)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ a->v.stripe = 0;
+ a->v.stripe_redundancy = 0;
+ a->v.data_type = alloc_data_type(a->v, BCH_DATA_user);
+ }
+
+ a->v.dirty_sectors += sectors;
+ if (data_type)
+ a->v.data_type = !deleting ? data_type : 0;
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_trans_mark_stripe(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ const struct bch_stripe *old_s = NULL;
+ struct bch_stripe *new_s = NULL;
+ struct bch_replicas_padded r;
+ unsigned i, nr_blocks;
+ int ret = 0;
+
+ if (old.k->type == KEY_TYPE_stripe)
+ old_s = bkey_s_c_to_stripe(old).v;
+ if (new->k.type == KEY_TYPE_stripe)
+ new_s = &bkey_i_to_stripe(new)->v;
+
+ /*
+ * If the pointers aren't changing, we don't need to do anything:
+ */
+ if (new_s && old_s &&
+ new_s->nr_blocks == old_s->nr_blocks &&
+ new_s->nr_redundant == old_s->nr_redundant &&
+ !memcmp(old_s->ptrs, new_s->ptrs,
+ new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
+ return 0;
+
+ BUG_ON(new_s && old_s &&
+ (new_s->nr_blocks != old_s->nr_blocks ||
+ new_s->nr_redundant != old_s->nr_redundant));
+
+ nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
+
+ if (new_s) {
+ s64 sectors = le16_to_cpu(new_s->sectors);
+
+ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new));
+ ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
+ if (ret)
+ return ret;
+ }
+
+ if (old_s) {
+ s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
+
+ bch2_bkey_to_replicas(&r.e, old);
+ ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
+ if (ret)
+ return ret;
+ }
+
+ for (i = 0; i < nr_blocks; i++) {
+ if (new_s && old_s &&
+ !memcmp(&new_s->ptrs[i],
+ &old_s->ptrs[i],
+ sizeof(new_s->ptrs[i])))
+ continue;
+
+ if (new_s) {
+ ret = bch2_trans_mark_stripe_bucket(trans,
+ bkey_i_to_s_c_stripe(new), i, false);
+ if (ret)
+ break;
+ }
+
+ if (old_s) {
+ ret = bch2_trans_mark_stripe_bucket(trans,
+ bkey_s_c_to_stripe(old), i, true);
+ if (ret)
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int __trans_mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
+{
+ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
+ s64 sectors = (s64) k.k->size;
+ struct replicas_delta_list *d;
+ int ret;
+
+ if (flags & BTREE_TRIGGER_OVERWRITE)
+ sectors = -sectors;
+ sectors *= replicas;
+
+ ret = bch2_replicas_deltas_realloc(trans, 0);
+ if (ret)
+ return ret;
+
+ d = trans->fs_usage_deltas;
+ replicas = clamp_t(unsigned, replicas, 1,
+ ARRAY_SIZE(d->persistent_reserved));
+
+ d->persistent_reserved[replicas - 1] += sectors;
+ return 0;
+}
+
+int bch2_trans_mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
+{
+ return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags);
+}
+
+static int trans_mark_reflink_p_segment(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ u64 *idx, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_i *k;
+ __le64 *refcount;
+ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ k = bch2_bkey_get_mut_noupdate(trans, &iter,
+ BTREE_ID_reflink, POS(0, *idx),
+ BTREE_ITER_WITH_UPDATES);
+ ret = PTR_ERR_OR_ZERO(k);
+ if (ret)
+ goto err;
+
+ refcount = bkey_refcount(k);
+ if (!refcount) {
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+ bch2_trans_inconsistent(trans,
+ "nonexistent indirect extent at %llu while marking\n %s",
+ *idx, buf.buf);
+ ret = -EIO;
+ goto err;
+ }
+
+ if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+ bch2_trans_inconsistent(trans,
+ "indirect extent refcount underflow at %llu while marking\n %s",
+ *idx, buf.buf);
+ ret = -EIO;
+ goto err;
+ }
+
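+	/*
+	 * On insert, round front_pad/back_pad out to the boundaries of the
+	 * indirect extent we landed in, so the referenced range always covers
+	 * whole indirect extents:
+	 */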
+ if (flags & BTREE_TRIGGER_INSERT) {
+ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
+ u64 pad;
+
+ pad = max_t(s64, le32_to_cpu(v->front_pad),
+ le64_to_cpu(v->idx) - bkey_start_offset(&k->k));
+ BUG_ON(pad > U32_MAX);
+ v->front_pad = cpu_to_le32(pad);
+
+ pad = max_t(s64, le32_to_cpu(v->back_pad),
+ k->k.p.offset - p.k->size - le64_to_cpu(v->idx));
+ BUG_ON(pad > U32_MAX);
+ v->back_pad = cpu_to_le32(pad);
+ }
+
+ le64_add_cpu(refcount, add);
+
+ bch2_btree_iter_set_pos_to_extent_start(&iter);
+ ret = bch2_trans_update(trans, &iter, k, 0);
+ if (ret)
+ goto err;
+
+ *idx = k->k.p.offset;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int __trans_mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
+{
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ u64 idx, end_idx;
+ int ret = 0;
+
+ idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
+ end_idx = le64_to_cpu(p.v->idx) + p.k->size +
+ le32_to_cpu(p.v->back_pad);
+
+ while (idx < end_idx && !ret)
+ ret = trans_mark_reflink_p_segment(trans, p, &idx, flags);
+ return ret;
+}
+
+int bch2_trans_mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
+{
+ if (flags & BTREE_TRIGGER_INSERT) {
+ struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v;
+
+ v->front_pad = v->back_pad = 0;
+ }
+
+ return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags);
+}
+
+static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
+ struct bch_dev *ca, size_t b,
+ enum bch_data_type type,
+ unsigned sectors)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a;
+ int ret = 0;
+
+ /*
+ * Backup superblock might be past the end of our normal usable space:
+ */
+ if (b >= ca->mi.nbuckets)
+ return 0;
+
+ a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b));
+ if (IS_ERR(a))
+ return PTR_ERR(a);
+
+ if (a->v.data_type && type && a->v.data_type != type) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_bucket_metadata_type_mismatch,
+ "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_types[a->v.data_type],
+ bch2_data_types[type],
+ bch2_data_types[type]);
+ ret = -EIO;
+ goto err;
+ }
+
+ if (a->v.data_type != type ||
+ a->v.dirty_sectors != sectors) {
+ a->v.data_type = type;
+ a->v.dirty_sectors = sectors;
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
+ struct bch_dev *ca, size_t b,
+ enum bch_data_type type,
+ unsigned sectors)
+{
+ return commit_do(trans, NULL, NULL, 0,
+ __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
+}
+
+static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
+ struct bch_dev *ca,
+ u64 start, u64 end,
+ enum bch_data_type type,
+ u64 *bucket, unsigned *bucket_sectors)
+{
+ do {
+ u64 b = sector_to_bucket(ca, start);
+ unsigned sectors =
+ min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
+
+ if (b != *bucket && *bucket_sectors) {
+ int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
+ type, *bucket_sectors);
+ if (ret)
+ return ret;
+
+ *bucket_sectors = 0;
+ }
+
+ *bucket = b;
+ *bucket_sectors += sectors;
+ start += sectors;
+ } while (start < end);
+
+ return 0;
+}
+
+static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
+ struct bch_dev *ca)
+{
+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+ u64 bucket = 0;
+ unsigned i, bucket_sectors = 0;
+ int ret;
+
+ for (i = 0; i < layout->nr_superblocks; i++) {
+ u64 offset = le64_to_cpu(layout->sb_offset[i]);
+
+ if (offset == BCH_SB_SECTOR) {
+ ret = bch2_trans_mark_metadata_sectors(trans, ca,
+ 0, BCH_SB_SECTOR,
+ BCH_DATA_sb, &bucket, &bucket_sectors);
+ if (ret)
+ return ret;
+ }
+
+ ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
+ offset + (1 << layout->sb_max_size_bits),
+ BCH_DATA_sb, &bucket, &bucket_sectors);
+ if (ret)
+ return ret;
+ }
+
+ if (bucket_sectors) {
+ ret = bch2_trans_mark_metadata_bucket(trans, ca,
+ bucket, BCH_DATA_sb, bucket_sectors);
+ if (ret)
+ return ret;
+ }
+
+ for (i = 0; i < ca->journal.nr; i++) {
+ ret = bch2_trans_mark_metadata_bucket(trans, ca,
+ ca->journal.buckets[i],
+ BCH_DATA_journal, ca->mi.bucket_size);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
+{
+ int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca));
+
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_trans_mark_dev_sbs(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_online_member(ca, c, i) {
+ int ret = bch2_trans_mark_dev_sb(c, ca);
+ if (ret) {
+ percpu_ref_put(&ca->ref);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/* Disk reservations: */
+
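+/*
+ * Each CPU caches a batch of sectors from the global sectors_available
+ * counter, so most reservations only touch a percpu counter instead of the
+ * shared atomic:
+ */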
+#define SECTORS_CACHE 1024
+
+int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
+ u64 sectors, int flags)
+{
+ struct bch_fs_pcpu *pcpu;
+ u64 old, v, get;
+ s64 sectors_available;
+ int ret;
+
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
+ pcpu = this_cpu_ptr(c->pcpu);
+
+ if (sectors <= pcpu->sectors_available)
+ goto out;
+
+ v = atomic64_read(&c->sectors_available);
+ do {
+ old = v;
+ get = min((u64) sectors + SECTORS_CACHE, old);
+
+ if (get < sectors) {
+ preempt_enable();
+ goto recalculate;
+ }
+ } while ((v = atomic64_cmpxchg(&c->sectors_available,
+ old, old - get)) != old);
+
+ pcpu->sectors_available += get;
+
+out:
+ pcpu->sectors_available -= sectors;
+ this_cpu_add(*c->online_reserved, sectors);
+ res->sectors += sectors;
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
+ return 0;
+
+recalculate:
+ mutex_lock(&c->sectors_available_lock);
+
+ percpu_u64_set(&c->pcpu->sectors_available, 0);
+ sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
+
+ if (sectors <= sectors_available ||
+ (flags & BCH_DISK_RESERVATION_NOFAIL)) {
+ atomic64_set(&c->sectors_available,
+ max_t(s64, 0, sectors_available - sectors));
+ this_cpu_add(*c->online_reserved, sectors);
+ res->sectors += sectors;
+ ret = 0;
+ } else {
+ atomic64_set(&c->sectors_available, sectors_available);
+ ret = -BCH_ERR_ENOSPC_disk_reservation;
+ }
+
+ mutex_unlock(&c->sectors_available_lock);
+ percpu_up_read(&c->mark_lock);
+
+ return ret;
+}
+
+/* Startup/shutdown: */
+
+static void bucket_gens_free_rcu(struct rcu_head *rcu)
+{
+ struct bucket_gens *buckets =
+ container_of(rcu, struct bucket_gens, rcu);
+
+ kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets);
+}
+
+int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
+{
+ struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
+ unsigned long *buckets_nouse = NULL;
+ bool resize = ca->bucket_gens != NULL;
+ int ret;
+
+ if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
+ GFP_KERNEL|__GFP_ZERO))) {
+ ret = -BCH_ERR_ENOMEM_bucket_gens;
+ goto err;
+ }
+
+ if ((c->opts.buckets_nouse &&
+ !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
+ sizeof(unsigned long),
+ GFP_KERNEL|__GFP_ZERO)))) {
+ ret = -BCH_ERR_ENOMEM_buckets_nouse;
+ goto err;
+ }
+
+ bucket_gens->first_bucket = ca->mi.first_bucket;
+ bucket_gens->nbuckets = nbuckets;
+
+ if (resize) {
+ down_write(&c->gc_lock);
+ down_write(&ca->bucket_lock);
+ percpu_down_write(&c->mark_lock);
+ }
+
+ old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
+
+ if (resize) {
+ size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets);
+
+ memcpy(bucket_gens->b,
+ old_bucket_gens->b,
+ n);
+ if (buckets_nouse)
+ memcpy(buckets_nouse,
+ ca->buckets_nouse,
+ BITS_TO_LONGS(n) * sizeof(unsigned long));
+ }
+
+ rcu_assign_pointer(ca->bucket_gens, bucket_gens);
+ bucket_gens = old_bucket_gens;
+
+ swap(ca->buckets_nouse, buckets_nouse);
+
+ nbuckets = ca->mi.nbuckets;
+
+ if (resize) {
+ percpu_up_write(&c->mark_lock);
+ up_write(&ca->bucket_lock);
+ up_write(&c->gc_lock);
+ }
+
+ ret = 0;
+err:
+ kvpfree(buckets_nouse,
+ BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
+ if (bucket_gens)
+ call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
+
+ return ret;
+}
+
+void bch2_dev_buckets_free(struct bch_dev *ca)
+{
+ unsigned i;
+
+ kvpfree(ca->buckets_nouse,
+ BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
+ kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
+ sizeof(struct bucket_gens) + ca->mi.nbuckets);
+
+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+ free_percpu(ca->usage[i]);
+ kfree(ca->usage_base);
+}
+
+int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
+{
+ unsigned i;
+
+ ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
+ if (!ca->usage_base)
+ return -BCH_ERR_ENOMEM_usage_init;
+
+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
+ ca->usage[i] = alloc_percpu(struct bch_dev_usage);
+ if (!ca->usage[i])
+ return -BCH_ERR_ENOMEM_usage_init;
+ }
+
+ return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
+}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
new file mode 100644
index 000000000000..21f6cb356921
--- /dev/null
+++ b/fs/bcachefs/buckets.h
@@ -0,0 +1,458 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Code for manipulating bucket marks for garbage collection.
+ *
+ * Copyright 2014 Datera, Inc.
+ */
+
+#ifndef _BUCKETS_H
+#define _BUCKETS_H
+
+#include "buckets_types.h"
+#include "extents.h"
+#include "sb-members.h"
+
+static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
+{
+ return div_u64(s, ca->mi.bucket_size);
+}
+
+static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
+{
+ return ((sector_t) b) * ca->mi.bucket_size;
+}
+
+static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
+{
+ u32 remainder;
+
+ div_u64_rem(s, ca->mi.bucket_size, &remainder);
+ return remainder;
+}
+
+static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
+ u32 *offset)
+{
+ return div_u64_rem(s, ca->mi.bucket_size, offset);
+}
+
+#define for_each_bucket(_b, _buckets) \
+ for (_b = (_buckets)->b + (_buckets)->first_bucket; \
+ _b < (_buckets)->b + (_buckets)->nbuckets; _b++)
+
+/*
+ * Ugly hack alert:
+ *
+ * We need to cram a spinlock in a single byte, because that's what we have left
+ * in struct bucket, and we care about the size of these - during fsck, we need
+ * in memory state for every single bucket on every device.
+ *
+ * We used to do
+ * while (xchg(&b->lock, 1)) cpu_relax();
+ * but, it turns out not all architectures support xchg on a single byte.
+ *
+ * So now we use bit_spin_lock(), with fun games since we can't burn a whole
+ * ulong for this - we just need to make sure the lock bit always ends up in the
+ * first byte.
+ */
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define BUCKET_LOCK_BITNR 0
+#else
+#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1)
+#endif
+
+union ulong_byte_assert {
+ ulong ulong;
+ u8 byte;
+};
+
+static inline void bucket_unlock(struct bucket *b)
+{
+ BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
+
+ clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock);
+ wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR);
+}
+
+static inline void bucket_lock(struct bucket *b)
+{
+ wait_on_bit_lock((void *) &b->lock, BUCKET_LOCK_BITNR,
+ TASK_UNINTERRUPTIBLE);
+}
+
+static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
+{
+ return rcu_dereference_check(ca->buckets_gc,
+ !ca->fs ||
+ percpu_rwsem_is_held(&ca->fs->mark_lock) ||
+ lockdep_is_held(&ca->fs->gc_lock) ||
+ lockdep_is_held(&ca->bucket_lock));
+}
+
+static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
+{
+ struct bucket_array *buckets = gc_bucket_array(ca);
+
+ BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
+ return buckets->b + b;
+}
+
+static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
+{
+ return rcu_dereference_check(ca->bucket_gens,
+ !ca->fs ||
+ percpu_rwsem_is_held(&ca->fs->mark_lock) ||
+ lockdep_is_held(&ca->fs->gc_lock) ||
+ lockdep_is_held(&ca->bucket_lock));
+}
+
+static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
+{
+ struct bucket_gens *gens = bucket_gens(ca);
+
+ BUG_ON(b < gens->first_bucket || b >= gens->nbuckets);
+ return gens->b + b;
+}
+
+static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
+ const struct bch_extent_ptr *ptr)
+{
+ return sector_to_bucket(ca, ptr->offset);
+}
+
+static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c,
+ const struct bch_extent_ptr *ptr)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
+}
+
+static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c,
+ const struct bch_extent_ptr *ptr,
+ u32 *bucket_offset)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset));
+}
+
+static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
+ const struct bch_extent_ptr *ptr)
+{
+ return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr));
+}
+
+static inline enum bch_data_type ptr_data_type(const struct bkey *k,
+ const struct bch_extent_ptr *ptr)
+{
+ if (bkey_is_btree_ptr(k))
+ return BCH_DATA_btree;
+
+ return ptr->cached ? BCH_DATA_cached : BCH_DATA_user;
+}
+
+static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
+{
+ EBUG_ON(sectors < 0);
+
+ return crc_is_compressed(p.crc)
+ ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
+ p.crc.uncompressed_size)
+ : sectors;
+}
+
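+/*
+ * Bucket generation numbers are 8 bits and wrap: compare via a signed 8 bit
+ * difference, so that e.g. gen 1 is considered newer than gen 255:
+ */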
+static inline int gen_cmp(u8 a, u8 b)
+{
+ return (s8) (a - b);
+}
+
+static inline int gen_after(u8 a, u8 b)
+{
+ int r = gen_cmp(a, b);
+
+ return r > 0 ? r : 0;
+}
+
+/**
+ * ptr_stale() - check if a pointer points into a bucket that has been
+ * invalidated.
+ */
+static inline u8 ptr_stale(struct bch_dev *ca,
+ const struct bch_extent_ptr *ptr)
+{
+ u8 ret;
+
+ rcu_read_lock();
+ ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/* Device usage: */
+
+void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *);
+static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
+{
+ struct bch_dev_usage ret;
+
+ bch2_dev_usage_read_fast(ca, &ret);
+ return ret;
+}
+
+void bch2_dev_usage_init(struct bch_dev *);
+
+static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
+{
+ s64 reserved = 0;
+
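+	/*
+	 * Reserves are cumulative - note the fallthroughs: each watermark
+	 * also includes the reserve of every case below it in this switch:
+	 */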
+ switch (watermark) {
+ case BCH_WATERMARK_NR:
+ BUG();
+ case BCH_WATERMARK_stripe:
+ reserved += ca->mi.nbuckets >> 6;
+ fallthrough;
+ case BCH_WATERMARK_normal:
+ reserved += ca->mi.nbuckets >> 6;
+ fallthrough;
+ case BCH_WATERMARK_copygc:
+ reserved += ca->nr_btree_reserve;
+ fallthrough;
+ case BCH_WATERMARK_btree:
+ reserved += ca->nr_btree_reserve;
+ fallthrough;
+ case BCH_WATERMARK_btree_copygc:
+ case BCH_WATERMARK_reclaim:
+ break;
+ }
+
+ return reserved;
+}
+
+static inline u64 dev_buckets_free(struct bch_dev *ca,
+ struct bch_dev_usage usage,
+ enum bch_watermark watermark)
+{
+ return max_t(s64, 0,
+ usage.d[BCH_DATA_free].buckets -
+ ca->nr_open_buckets -
+ bch2_dev_buckets_reserved(ca, watermark));
+}
+
+static inline u64 __dev_buckets_available(struct bch_dev *ca,
+ struct bch_dev_usage usage,
+ enum bch_watermark watermark)
+{
+ return max_t(s64, 0,
+ usage.d[BCH_DATA_free].buckets
+ + usage.d[BCH_DATA_cached].buckets
+ + usage.d[BCH_DATA_need_gc_gens].buckets
+ + usage.d[BCH_DATA_need_discard].buckets
+ - ca->nr_open_buckets
+ - bch2_dev_buckets_reserved(ca, watermark));
+}
+
+static inline u64 dev_buckets_available(struct bch_dev *ca,
+ enum bch_watermark watermark)
+{
+ return __dev_buckets_available(ca, bch2_dev_usage_read(ca), watermark);
+}
+
+/* Filesystem usage: */
+
+static inline unsigned __fs_usage_u64s(unsigned nr_replicas)
+{
+ return sizeof(struct bch_fs_usage) / sizeof(u64) + nr_replicas;
+}
+
+static inline unsigned fs_usage_u64s(struct bch_fs *c)
+{
+ return __fs_usage_u64s(READ_ONCE(c->replicas.nr));
+}
+
+static inline unsigned __fs_usage_online_u64s(unsigned nr_replicas)
+{
+ return sizeof(struct bch_fs_usage_online) / sizeof(u64) + nr_replicas;
+}
+
+static inline unsigned fs_usage_online_u64s(struct bch_fs *c)
+{
+ return __fs_usage_online_u64s(READ_ONCE(c->replicas.nr));
+}
+
+static inline unsigned dev_usage_u64s(void)
+{
+ return sizeof(struct bch_dev_usage) / sizeof(u64);
+}
+
+u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *);
+
+struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *);
+
+void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned);
+
+void bch2_fs_usage_to_text(struct printbuf *,
+ struct bch_fs *, struct bch_fs_usage_online *);
+
+u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *);
+
+struct bch_fs_usage_short
+bch2_fs_usage_read_short(struct bch_fs *);
+
+/* key/bucket marking: */
+
+static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
+ unsigned journal_seq,
+ bool gc)
+{
+ percpu_rwsem_assert_held(&c->mark_lock);
+ BUG_ON(!gc && !journal_seq);
+
+ return this_cpu_ptr(gc
+ ? c->usage_gc
+ : c->usage[journal_seq & JOURNAL_BUF_MASK]);
+}
+
+int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned);
+
+void bch2_fs_usage_initialize(struct bch_fs *);
+
+int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
+ size_t, enum bch_data_type, unsigned,
+ struct gc_pos, unsigned);
+
+int bch2_mark_alloc(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_extent(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_stripe(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_reservation(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+
+int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+
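+/*
+ * Run a trigger for both halves of a key update: once for the old key as an
+ * overwrite, then once for the new key as an insert:
+ */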
+#define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
+({ \
+ int ret = 0; \
+ \
+ if (_old.k->type) \
+ ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \
+ if (!ret && _new.k->type) \
+ ret = _fn(_trans, _btree_id, _level, _new, _flags & ~BTREE_TRIGGER_OVERWRITE); \
+ ret; \
+})
+
+#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags) \
+ mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, bkey_i_to_s_c(_new), _flags)
+
+void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
+int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
+
+int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
+ size_t, enum bch_data_type, unsigned);
+int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
+int bch2_trans_mark_dev_sbs(struct bch_fs *);
+
+static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
+{
+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+ u64 b_offset = bucket_to_sector(ca, b);
+ u64 b_end = bucket_to_sector(ca, b + 1);
+ unsigned i;
+
+ if (!b)
+ return true;
+
+ for (i = 0; i < layout->nr_superblocks; i++) {
+ u64 offset = le64_to_cpu(layout->sb_offset[i]);
+ u64 end = offset + (1 << layout->sb_max_size_bits);
+
+ if (!(offset >= b_end || end <= b_offset))
+ return true;
+ }
+
+ return false;
+}
+
+/* disk reservations: */
+
+static inline void bch2_disk_reservation_put(struct bch_fs *c,
+ struct disk_reservation *res)
+{
+ if (res->sectors) {
+ this_cpu_sub(*c->online_reserved, res->sectors);
+ res->sectors = 0;
+ }
+}
+
+#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
+
+int __bch2_disk_reservation_add(struct bch_fs *,
+ struct disk_reservation *,
+ u64, int);
+
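+/*
+ * Fast path: satisfy the reservation from this CPU's cached sectors_available,
+ * falling back to __bch2_disk_reservation_add() to refill from the
+ * filesystem-wide counter:
+ */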
+static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
+ u64 sectors, int flags)
+{
+#ifdef __KERNEL__
+ u64 old, new;
+
+ do {
+ old = this_cpu_read(c->pcpu->sectors_available);
+ if (sectors > old)
+ return __bch2_disk_reservation_add(c, res, sectors, flags);
+
+ new = old - sectors;
+ } while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old);
+
+ this_cpu_add(*c->online_reserved, sectors);
+ res->sectors += sectors;
+ return 0;
+#else
+ return __bch2_disk_reservation_add(c, res, sectors, flags);
+#endif
+}
+
+static inline struct disk_reservation
+bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
+{
+ return (struct disk_reservation) {
+ .sectors = 0,
+#if 0
+ /* not used yet: */
+ .gen = c->capacity_gen,
+#endif
+ .nr_replicas = nr_replicas,
+ };
+}
+
+static inline int bch2_disk_reservation_get(struct bch_fs *c,
+ struct disk_reservation *res,
+ u64 sectors, unsigned nr_replicas,
+ int flags)
+{
+ *res = bch2_disk_reservation_init(c, nr_replicas);
+
+ return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags);
+}
+
+#define RESERVE_FACTOR 6
+
+static inline u64 avail_factor(u64 r)
+{
+ return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
+}
+
+int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64);
+void bch2_dev_buckets_free(struct bch_dev *);
+int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *);
+
+#endif /* _BUCKETS_H */
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
new file mode 100644
index 000000000000..2a9dab9006ef
--- /dev/null
+++ b/fs/bcachefs/buckets_types.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_TYPES_H
+#define _BUCKETS_TYPES_H
+
+#include "bcachefs_format.h"
+#include "util.h"
+
+#define BUCKET_JOURNAL_SEQ_BITS 16
+
+struct bucket {
+ u8 lock;
+ u8 gen_valid:1;
+ u8 data_type:7;
+ u8 gen;
+ u8 stripe_redundancy;
+ u32 stripe;
+ u32 dirty_sectors;
+ u32 cached_sectors;
+};
+
+struct bucket_array {
+ struct rcu_head rcu;
+ u16 first_bucket;
+ size_t nbuckets;
+ struct bucket b[];
+};
+
+struct bucket_gens {
+ struct rcu_head rcu;
+ u16 first_bucket;
+ size_t nbuckets;
+ u8 b[];
+};
+
+struct bch_dev_usage {
+ u64 buckets_ec;
+
+ struct {
+ u64 buckets;
+ u64 sectors; /* _compressed_ sectors: */
+ /*
+ * XXX
+ * Why do we have this? Isn't it just buckets * bucket_size -
+ * sectors?
+ */
+ u64 fragmented;
+ } d[BCH_DATA_NR];
+};
+
+struct bch_fs_usage {
+ /* all fields are in units of 512 byte sectors: */
+ u64 hidden;
+ u64 btree;
+ u64 data;
+ u64 cached;
+ u64 reserved;
+ u64 nr_inodes;
+
+ /* XXX: add stats for compression ratio */
+#if 0
+ u64 uncompressed;
+ u64 compressed;
+#endif
+
+ /* broken out: */
+
+ u64 persistent_reserved[BCH_REPLICAS_MAX];
+ u64 replicas[];
+};
+
+struct bch_fs_usage_online {
+ u64 online_reserved;
+ struct bch_fs_usage u;
+};
+
+struct bch_fs_usage_short {
+ u64 capacity;
+ u64 used;
+ u64 free;
+ u64 nr_inodes;
+};
+
+/*
+ * A reservation for space on disk:
+ */
+struct disk_reservation {
+ u64 sectors;
+ u32 gen;
+ unsigned nr_replicas;
+};
+
+#endif /* _BUCKETS_TYPES_H */
diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c
new file mode 100644
index 000000000000..ec1b636ef78d
--- /dev/null
+++ b/fs/bcachefs/buckets_waiting_for_journal.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "buckets_waiting_for_journal.h"
+#include <linux/hash.h>
+#include <linux/random.h>
+
+static inline struct bucket_hashed *
+bucket_hash(struct buckets_waiting_for_journal_table *t,
+ unsigned hash_seed_idx, u64 dev_bucket)
+{
+ return t->d + hash_64(dev_bucket ^ t->hash_seeds[hash_seed_idx], t->bits);
+}
+
+static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t bits)
+{
+ unsigned i;
+
+ t->bits = bits;
+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++)
+ get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i]));
+ memset(t->d, 0, sizeof(t->d[0]) << t->bits);
+}
+
+bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
+ u64 flushed_seq,
+ unsigned dev, u64 bucket)
+{
+ struct buckets_waiting_for_journal_table *t;
+ u64 dev_bucket = (u64) dev << 56 | bucket;
+ bool ret = false;
+ unsigned i;
+
+ mutex_lock(&b->lock);
+ t = b->t;
+
+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+ struct bucket_hashed *h = bucket_hash(t, i, dev_bucket);
+
+ if (h->dev_bucket == dev_bucket) {
+ ret = h->journal_seq > flushed_seq;
+ break;
+ }
+ }
+
+ mutex_unlock(&b->lock);
+
+ return ret;
+}
+
+static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t,
+ struct bucket_hashed *new,
+ u64 flushed_seq)
+{
+ struct bucket_hashed *last_evicted = NULL;
+ unsigned tries, i;
+
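+	/*
+	 * Cuckoo hashing: each entry has three candidate slots; if they're
+	 * all live, evict one, take its slot, and try to reinsert the evicted
+	 * entry, with a bounded number of tries:
+	 */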
+ for (tries = 0; tries < 10; tries++) {
+ struct bucket_hashed *old, *victim = NULL;
+
+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+ old = bucket_hash(t, i, new->dev_bucket);
+
+ if (old->dev_bucket == new->dev_bucket ||
+ old->journal_seq <= flushed_seq) {
+ *old = *new;
+ return true;
+ }
+
+ if (last_evicted != old)
+ victim = old;
+ }
+
+ /* hashed to same slot 3 times: */
+ if (!victim)
+ break;
+
+ /* Failed to find an empty slot: */
+ swap(*new, *victim);
+ last_evicted = victim;
+ }
+
+ return false;
+}
+
+int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
+ u64 flushed_seq,
+ unsigned dev, u64 bucket,
+ u64 journal_seq)
+{
+ struct buckets_waiting_for_journal_table *t, *n;
+ struct bucket_hashed tmp, new = {
+ .dev_bucket = (u64) dev << 56 | bucket,
+ .journal_seq = journal_seq,
+ };
+ size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0;
+ int ret = 0;
+
+ mutex_lock(&b->lock);
+
+ if (likely(bucket_table_insert(b->t, &new, flushed_seq)))
+ goto out;
+
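+	/*
+	 * Insert failed: count the entries still waiting on the journal, grow
+	 * the table if it's getting full, then rehash:
+	 */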
+ t = b->t;
+ size = 1UL << t->bits;
+ for (i = 0; i < size; i++)
+ nr_elements += t->d[i].journal_seq > flushed_seq;
+
+ new_bits = t->bits + (nr_elements * 3 > size);
+
+ n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL);
+ if (!n) {
+ ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set;
+ goto out;
+ }
+
+retry_rehash:
+ nr_rehashes++;
+ bucket_table_init(n, new_bits);
+
+ tmp = new;
+ BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq));
+
+ for (i = 0; i < 1UL << t->bits; i++) {
+ if (t->d[i].journal_seq <= flushed_seq)
+ continue;
+
+ tmp = t->d[i];
+ if (!bucket_table_insert(n, &tmp, flushed_seq))
+ goto retry_rehash;
+ }
+
+ b->t = n;
+ kvfree(t);
+
+ pr_debug("took %zu rehashes, table at %zu/%lu elements",
+ nr_rehashes, nr_elements, 1UL << b->t->bits);
+out:
+ mutex_unlock(&b->lock);
+
+ return ret;
+}
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c)
+{
+ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+ kvfree(b->t);
+}
+
+#define INITIAL_TABLE_BITS 3
+
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c)
+{
+ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+ mutex_init(&b->lock);
+
+ b->t = kvmalloc(sizeof(*b->t) +
+ (sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL);
+ if (!b->t)
+ return -BCH_ERR_ENOMEM_buckets_waiting_for_journal_init;
+
+ bucket_table_init(b->t, INITIAL_TABLE_BITS);
+ return 0;
+}
diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h
new file mode 100644
index 000000000000..d2ae19cbe18c
--- /dev/null
+++ b/fs/bcachefs/buckets_waiting_for_journal.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_H
+
+#include "buckets_waiting_for_journal_types.h"
+
+bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
+ u64, unsigned, u64);
+int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
+ u64, unsigned, u64, u64);
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *);
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *);
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */
diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h
new file mode 100644
index 000000000000..e593db061d81
--- /dev/null
+++ b/fs/bcachefs/buckets_waiting_for_journal_types.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+
+#include <linux/siphash.h>
+
+struct bucket_hashed {
+ u64 dev_bucket;
+ u64 journal_seq;
+};
+
+struct buckets_waiting_for_journal_table {
+ unsigned bits;
+ u64 hash_seeds[3];
+ struct bucket_hashed d[];
+};
+
+struct buckets_waiting_for_journal {
+ struct mutex lock;
+ struct buckets_waiting_for_journal_table *t;
+};
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
new file mode 100644
index 000000000000..4bb88aefed12
--- /dev/null
+++ b/fs/bcachefs/chardev.c
@@ -0,0 +1,784 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_CHARDEV
+
+#include "bcachefs.h"
+#include "bcachefs_ioctl.h"
+#include "buckets.h"
+#include "chardev.h"
+#include "journal.h"
+#include "move.h"
+#include "replicas.h"
+#include "super.h"
+#include "super-io.h"
+
+#include <linux/anon_inodes.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/ioctl.h>
+#include <linux/kthread.h>
+#include <linux/major.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+/* returns with ref on ca->ref */
+static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
+ unsigned flags)
+{
+ struct bch_dev *ca;
+
+ if (flags & BCH_BY_INDEX) {
+ if (dev >= c->sb.nr_devices)
+ return ERR_PTR(-EINVAL);
+
+ rcu_read_lock();
+ ca = rcu_dereference(c->devs[dev]);
+ if (ca)
+ percpu_ref_get(&ca->ref);
+ rcu_read_unlock();
+
+ if (!ca)
+ return ERR_PTR(-EINVAL);
+ } else {
+ char *path;
+
+ path = strndup_user((const char __user *)
+ (unsigned long) dev, PATH_MAX);
+ if (IS_ERR(path))
+ return ERR_CAST(path);
+
+ ca = bch2_dev_lookup(c, path);
+ kfree(path);
+ }
+
+ return ca;
+}
+
+#if 0
+static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
+{
+ struct bch_ioctl_assemble arg;
+ struct bch_fs *c;
+ u64 *user_devs = NULL;
+ char **devs = NULL;
+ unsigned i;
+ int ret = -EFAULT;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ if (arg.flags || arg.pad)
+ return -EINVAL;
+
+ user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL);
+ if (!user_devs)
+ return -ENOMEM;
+
+ devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
+
+ if (copy_from_user(user_devs, user_arg->devs,
+ sizeof(u64) * arg.nr_devs))
+ goto err;
+
+ for (i = 0; i < arg.nr_devs; i++) {
+ devs[i] = strndup_user((const char __user *)(unsigned long)
+ user_devs[i],
+ PATH_MAX);
+		ret = PTR_ERR_OR_ZERO(devs[i]);
+ if (ret)
+ goto err;
+ }
+
+ c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty());
+ ret = PTR_ERR_OR_ZERO(c);
+ if (!ret)
+ closure_put(&c->cl);
+err:
+ if (devs)
+ for (i = 0; i < arg.nr_devs; i++)
+ kfree(devs[i]);
+ kfree(devs);
+ return ret;
+}
+
+static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg)
+{
+ struct bch_ioctl_incremental arg;
+ const char *err;
+ char *path;
+	char *path;
+	int ret;
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ if (arg.flags || arg.pad)
+ return -EINVAL;
+
+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+ ret = PTR_ERR_OR_ZERO(path);
+ if (ret)
+ return ret;
+
+ err = bch2_fs_open_incremental(path);
+ kfree(path);
+
+ if (err) {
+ pr_err("Could not register bcachefs devices: %s", err);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+#endif
+
+static long bch2_global_ioctl(unsigned cmd, void __user *arg)
+{
+ switch (cmd) {
+#if 0
+ case BCH_IOCTL_ASSEMBLE:
+ return bch2_ioctl_assemble(arg);
+ case BCH_IOCTL_INCREMENTAL:
+ return bch2_ioctl_incremental(arg);
+#endif
+ default:
+ return -ENOTTY;
+ }
+}
+
+static long bch2_ioctl_query_uuid(struct bch_fs *c,
+ struct bch_ioctl_query_uuid __user *user_arg)
+{
+ if (copy_to_user(&user_arg->uuid, &c->sb.user_uuid,
+ sizeof(c->sb.user_uuid)))
+ return -EFAULT;
+ return 0;
+}
+
+#if 0
+static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (arg.flags || arg.pad)
+ return -EINVAL;
+
+ return bch2_fs_start(c);
+}
+
+static long bch2_ioctl_stop(struct bch_fs *c)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ bch2_fs_stop(c);
+ return 0;
+}
+#endif
+
+static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+ char *path;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (arg.flags || arg.pad)
+ return -EINVAL;
+
+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+ ret = PTR_ERR_OR_ZERO(path);
+ if (ret)
+ return ret;
+
+ ret = bch2_dev_add(c, path);
+ kfree(path);
+
+ return ret;
+}
+
+static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+ struct bch_dev *ca;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+ BCH_FORCE_IF_METADATA_LOST|
+ BCH_FORCE_IF_DEGRADED|
+ BCH_BY_INDEX)) ||
+ arg.pad)
+ return -EINVAL;
+
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ return bch2_dev_remove(c, ca, arg.flags);
+}
+
+static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+ char *path;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (arg.flags || arg.pad)
+ return -EINVAL;
+
+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+ ret = PTR_ERR_OR_ZERO(path);
+ if (ret)
+ return ret;
+
+ ret = bch2_dev_online(c, path);
+ kfree(path);
+ return ret;
+}
+
+static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+ struct bch_dev *ca;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+ BCH_FORCE_IF_METADATA_LOST|
+ BCH_FORCE_IF_DEGRADED|
+ BCH_BY_INDEX)) ||
+ arg.pad)
+ return -EINVAL;
+
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ ret = bch2_dev_offline(c, ca, arg.flags);
+ percpu_ref_put(&ca->ref);
+ return ret;
+}
+
+static long bch2_ioctl_disk_set_state(struct bch_fs *c,
+ struct bch_ioctl_disk_set_state arg)
+{
+ struct bch_dev *ca;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+ BCH_FORCE_IF_METADATA_LOST|
+ BCH_FORCE_IF_DEGRADED|
+ BCH_BY_INDEX)) ||
+ arg.pad[0] || arg.pad[1] || arg.pad[2] ||
+ arg.new_state >= BCH_MEMBER_STATE_NR)
+ return -EINVAL;
+
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags);
+ if (ret)
+ bch_err(c, "Error setting device state: %s", bch2_err_str(ret));
+
+ percpu_ref_put(&ca->ref);
+ return ret;
+}
+
+struct bch_data_ctx {
+ struct bch_fs *c;
+ struct bch_ioctl_data arg;
+ struct bch_move_stats stats;
+
+ int ret;
+
+ struct task_struct *thread;
+};
+
+static int bch2_data_thread(void *arg)
+{
+ struct bch_data_ctx *ctx = arg;
+
+ ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
+
+ ctx->stats.data_type = U8_MAX;
+ return 0;
+}
+
+static int bch2_data_job_release(struct inode *inode, struct file *file)
+{
+ struct bch_data_ctx *ctx = file->private_data;
+
+ kthread_stop(ctx->thread);
+ put_task_struct(ctx->thread);
+ kfree(ctx);
+ return 0;
+}
+
+static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ struct bch_data_ctx *ctx = file->private_data;
+ struct bch_fs *c = ctx->c;
+ struct bch_ioctl_data_event e = {
+ .type = BCH_DATA_EVENT_PROGRESS,
+ .p.data_type = ctx->stats.data_type,
+ .p.btree_id = ctx->stats.pos.btree,
+ .p.pos = ctx->stats.pos.pos,
+ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
+ .p.sectors_total = bch2_fs_usage_read_short(c).used,
+ };
+
+ if (len < sizeof(e))
+ return -EINVAL;
+
+ if (copy_to_user(buf, &e, sizeof(e)))
+ return -EFAULT;
+
+ return sizeof(e);
+}
+
+static const struct file_operations bcachefs_data_ops = {
+ .release = bch2_data_job_release,
+ .read = bch2_data_job_read,
+ .llseek = no_llseek,
+};
+
+static long bch2_ioctl_data(struct bch_fs *c,
+ struct bch_ioctl_data arg)
+{
+ struct bch_data_ctx *ctx = NULL;
+ struct file *file = NULL;
+ unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK;
+ int ret, fd = -1;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (arg.op >= BCH_DATA_OP_NR || arg.flags)
+ return -EINVAL;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->c = c;
+ ctx->arg = arg;
+
+ ctx->thread = kthread_create(bch2_data_thread, ctx,
+ "bch-data/%s", c->name);
+ if (IS_ERR(ctx->thread)) {
+ ret = PTR_ERR(ctx->thread);
+ goto err;
+ }
+
+ ret = get_unused_fd_flags(flags);
+ if (ret < 0)
+ goto err;
+ fd = ret;
+
+ file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags);
+ if (IS_ERR(file)) {
+ ret = PTR_ERR(file);
+ goto err;
+ }
+
+ fd_install(fd, file);
+
+ get_task_struct(ctx->thread);
+ wake_up_process(ctx->thread);
+
+ return fd;
+err:
+ if (fd >= 0)
+ put_unused_fd(fd);
+ if (!IS_ERR_OR_NULL(ctx->thread))
+ kthread_stop(ctx->thread);
+ kfree(ctx);
+ return ret;
+}
+
+static long bch2_ioctl_fs_usage(struct bch_fs *c,
+ struct bch_ioctl_fs_usage __user *user_arg)
+{
+ struct bch_ioctl_fs_usage *arg = NULL;
+ struct bch_replicas_usage *dst_e, *dst_end;
+ struct bch_fs_usage_online *src;
+ u32 replica_entries_bytes;
+ unsigned i;
+ int ret = 0;
+
+ if (!test_bit(BCH_FS_STARTED, &c->flags))
+ return -EINVAL;
+
+ if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
+ return -EFAULT;
+
+ arg = kzalloc(size_add(sizeof(*arg), replica_entries_bytes), GFP_KERNEL);
+ if (!arg)
+ return -ENOMEM;
+
+ src = bch2_fs_usage_read(c);
+ if (!src) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ arg->capacity = c->capacity;
+ arg->used = bch2_fs_sectors_used(c, src);
+ arg->online_reserved = src->online_reserved;
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++)
+ arg->persistent_reserved[i] = src->u.persistent_reserved[i];
+
+ dst_e = arg->replicas;
+ dst_end = (void *) arg->replicas + replica_entries_bytes;
+
+ for (i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry *src_e =
+ cpu_replicas_entry(&c->replicas, i);
+
+ /* check that we have enough space for one replicas entry */
+ if (dst_e + 1 > dst_end) {
+ ret = -ERANGE;
+ break;
+ }
+
+ dst_e->sectors = src->u.replicas[i];
+ dst_e->r = *src_e;
+
+ /* recheck after setting nr_devs: */
+ if (replicas_usage_next(dst_e) > dst_end) {
+ ret = -ERANGE;
+ break;
+ }
+
+ memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs);
+
+ dst_e = replicas_usage_next(dst_e);
+ }
+
+ arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas;
+
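+ /*
+ * bch2_fs_usage_read() is assumed to have taken c->mark_lock for reading;
+ * drop it now that we're done with src:
+ */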
+ percpu_up_read(&c->mark_lock);
+ kfree(src);
+
+ if (ret)
+ goto err;
+ if (copy_to_user(user_arg, arg,
+ sizeof(*arg) + arg->replica_entries_bytes))
+ ret = -EFAULT;
+err:
+ kfree(arg);
+ return ret;
+}
+
+static long bch2_ioctl_dev_usage(struct bch_fs *c,
+ struct bch_ioctl_dev_usage __user *user_arg)
+{
+ struct bch_ioctl_dev_usage arg;
+ struct bch_dev_usage src;
+ struct bch_dev *ca;
+ unsigned i;
+
+ if (!test_bit(BCH_FS_STARTED, &c->flags))
+ return -EINVAL;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ if ((arg.flags & ~BCH_BY_INDEX) ||
+ arg.pad[0] ||
+ arg.pad[1] ||
+ arg.pad[2])
+ return -EINVAL;
+
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ src = bch2_dev_usage_read(ca);
+
+ arg.state = ca->mi.state;
+ arg.bucket_size = ca->mi.bucket_size;
+ arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
+ arg.buckets_ec = src.buckets_ec;
+
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ arg.d[i].buckets = src.d[i].buckets;
+ arg.d[i].sectors = src.d[i].sectors;
+ arg.d[i].fragmented = src.d[i].fragmented;
+ }
+
+ percpu_ref_put(&ca->ref);
+
+ if (copy_to_user(user_arg, &arg, sizeof(arg)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static long bch2_ioctl_read_super(struct bch_fs *c,
+ struct bch_ioctl_read_super arg)
+{
+ struct bch_dev *ca = NULL;
+ struct bch_sb *sb;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) ||
+ arg.pad)
+ return -EINVAL;
+
+ mutex_lock(&c->sb_lock);
+
+ if (arg.flags & BCH_READ_DEV) {
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+
+ if (IS_ERR(ca)) {
+ ret = PTR_ERR(ca);
+ goto err;
+ }
+
+ sb = ca->disk_sb.sb;
+ } else {
+ sb = c->disk_sb.sb;
+ }
+
+ if (vstruct_bytes(sb) > arg.size) {
+ ret = -ERANGE;
+ goto err;
+ }
+
+ if (copy_to_user((void __user *)(unsigned long)arg.sb, sb,
+ vstruct_bytes(sb)))
+ ret = -EFAULT;
+err:
+ if (!IS_ERR_OR_NULL(ca))
+ percpu_ref_put(&ca->ref);
+ mutex_unlock(&c->sb_lock);
+ return ret;
+}
+
+static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
+ struct bch_ioctl_disk_get_idx arg)
+{
+ dev_t dev = huge_decode_dev(arg.dev);
+ struct bch_dev *ca;
+ unsigned i;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!dev)
+ return -EINVAL;
+
+ for_each_online_member(ca, c, i)
+ if (ca->dev == dev) {
+ percpu_ref_put(&ca->io_ref);
+ return i;
+ }
+
+ return -BCH_ERR_ENOENT_dev_idx_not_found;
+}
+
+static long bch2_ioctl_disk_resize(struct bch_fs *c,
+ struct bch_ioctl_disk_resize arg)
+{
+ struct bch_dev *ca;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~BCH_BY_INDEX) ||
+ arg.pad)
+ return -EINVAL;
+
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ ret = bch2_dev_resize(c, ca, arg.nbuckets);
+
+ percpu_ref_put(&ca->ref);
+ return ret;
+}
+
+static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
+ struct bch_ioctl_disk_resize_journal arg)
+{
+ struct bch_dev *ca;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if ((arg.flags & ~BCH_BY_INDEX) ||
+ arg.pad)
+ return -EINVAL;
+
+ if (arg.nbuckets > U32_MAX)
+ return -EINVAL;
+
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets);
+
+ percpu_ref_put(&ca->ref);
+ return ret;
+}
+
+#define BCH_IOCTL(_name, _argtype) \
+do { \
+ _argtype i; \
+ \
+ if (copy_from_user(&i, arg, sizeof(i))) \
+ return -EFAULT; \
+ ret = bch2_ioctl_##_name(c, i); \
+ goto out; \
+} while (0)
+
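+/*
+ * The first switch below handles commands that are valid even before the
+ * filesystem is fully started; everything after the BCH_FS_STARTED check
+ * requires a running filesystem:
+ */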
+long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
+{
+ long ret;
+
+ switch (cmd) {
+ case BCH_IOCTL_QUERY_UUID:
+ return bch2_ioctl_query_uuid(c, arg);
+ case BCH_IOCTL_FS_USAGE:
+ return bch2_ioctl_fs_usage(c, arg);
+ case BCH_IOCTL_DEV_USAGE:
+ return bch2_ioctl_dev_usage(c, arg);
+#if 0
+ case BCH_IOCTL_START:
+ BCH_IOCTL(start, struct bch_ioctl_start);
+ case BCH_IOCTL_STOP:
+ return bch2_ioctl_stop(c);
+#endif
+ case BCH_IOCTL_READ_SUPER:
+ BCH_IOCTL(read_super, struct bch_ioctl_read_super);
+ case BCH_IOCTL_DISK_GET_IDX:
+ BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
+ }
+
+ if (!test_bit(BCH_FS_STARTED, &c->flags))
+ return -EINVAL;
+
+ switch (cmd) {
+ case BCH_IOCTL_DISK_ADD:
+ BCH_IOCTL(disk_add, struct bch_ioctl_disk);
+ case BCH_IOCTL_DISK_REMOVE:
+ BCH_IOCTL(disk_remove, struct bch_ioctl_disk);
+ case BCH_IOCTL_DISK_ONLINE:
+ BCH_IOCTL(disk_online, struct bch_ioctl_disk);
+ case BCH_IOCTL_DISK_OFFLINE:
+ BCH_IOCTL(disk_offline, struct bch_ioctl_disk);
+ case BCH_IOCTL_DISK_SET_STATE:
+ BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
+ case BCH_IOCTL_DATA:
+ BCH_IOCTL(data, struct bch_ioctl_data);
+ case BCH_IOCTL_DISK_RESIZE:
+ BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
+ case BCH_IOCTL_DISK_RESIZE_JOURNAL:
+ BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
+
+ default:
+ return -ENOTTY;
+ }
+out:
+ if (ret < 0)
+ ret = bch2_err_class(ret);
+ return ret;
+}
+
+static DEFINE_IDR(bch_chardev_minor);
+
+static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v)
+{
+ unsigned minor = iminor(file_inode(filp));
+ struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL;
+ void __user *arg = (void __user *) v;
+
+ return c
+ ? bch2_fs_ioctl(c, cmd, arg)
+ : bch2_global_ioctl(cmd, arg);
+}
+
+static const struct file_operations bch_chardev_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = bch2_chardev_ioctl,
+ .open = nonseekable_open,
+};
+
+static int bch_chardev_major;
+static struct class *bch_chardev_class;
+static struct device *bch_chardev;
+
+void bch2_fs_chardev_exit(struct bch_fs *c)
+{
+ if (!IS_ERR_OR_NULL(c->chardev))
+ device_unregister(c->chardev);
+ if (c->minor >= 0)
+ idr_remove(&bch_chardev_minor, c->minor);
+}
+
+int bch2_fs_chardev_init(struct bch_fs *c)
+{
+ c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL);
+ if (c->minor < 0)
+ return c->minor;
+
+ c->chardev = device_create(bch_chardev_class, NULL,
+ MKDEV(bch_chardev_major, c->minor), c,
+ "bcachefs%u-ctl", c->minor);
+ if (IS_ERR(c->chardev))
+ return PTR_ERR(c->chardev);
+
+ return 0;
+}
+
+void bch2_chardev_exit(void)
+{
+ if (!IS_ERR_OR_NULL(bch_chardev_class))
+ device_destroy(bch_chardev_class,
+ MKDEV(bch_chardev_major, U8_MAX));
+ if (!IS_ERR_OR_NULL(bch_chardev_class))
+ class_destroy(bch_chardev_class);
+ if (bch_chardev_major > 0)
+ unregister_chrdev(bch_chardev_major, "bcachefs");
+}
+
+int __init bch2_chardev_init(void)
+{
+ bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops);
+ if (bch_chardev_major < 0)
+ return bch_chardev_major;
+
+ bch_chardev_class = class_create("bcachefs");
+ if (IS_ERR(bch_chardev_class))
+ return PTR_ERR(bch_chardev_class);
+
+ bch_chardev = device_create(bch_chardev_class, NULL,
+ MKDEV(bch_chardev_major, U8_MAX),
+ NULL, "bcachefs-ctl");
+ if (IS_ERR(bch_chardev))
+ return PTR_ERR(bch_chardev);
+
+ return 0;
+}
+
+#endif /* NO_BCACHEFS_CHARDEV */
diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h
new file mode 100644
index 000000000000..0f563ca53c36
--- /dev/null
+++ b/fs/bcachefs/chardev.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CHARDEV_H
+#define _BCACHEFS_CHARDEV_H
+
+#ifndef NO_BCACHEFS_FS
+
+long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *);
+
+void bch2_fs_chardev_exit(struct bch_fs *);
+int bch2_fs_chardev_init(struct bch_fs *);
+
+void bch2_chardev_exit(void);
+int __init bch2_chardev_init(void);
+
+#else
+
+static inline long bch2_fs_ioctl(struct bch_fs *c,
+ unsigned cmd, void __user * arg)
+{
+ return -ENOTTY;
+}
+
+static inline void bch2_fs_chardev_exit(struct bch_fs *c) {}
+static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; }
+
+static inline void bch2_chardev_exit(void) {}
+static inline int __init bch2_chardev_init(void) { return 0; }
+
+#endif /* NO_BCACHEFS_FS */
+
+#endif /* _BCACHEFS_CHARDEV_H */
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
new file mode 100644
index 000000000000..3c761ad6b1c8
--- /dev/null
+++ b/fs/bcachefs/checksum.c
@@ -0,0 +1,804 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "checksum.h"
+#include "errcode.h"
+#include "super.h"
+#include "super-io.h"
+
+#include <linux/crc32c.h>
+#include <linux/crypto.h>
+#include <linux/xxhash.h>
+#include <linux/key.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <crypto/algapi.h>
+#include <crypto/chacha.h>
+#include <crypto/hash.h>
+#include <crypto/poly1305.h>
+#include <crypto/skcipher.h>
+#include <keys/user-type.h>
+
+/*
+ * bch2_checksum_state is an abstraction of the checksum state calculated over
+ * different pages: it lets data be fed in page by page without the checksum
+ * algorithm losing its intermediate state.
+ *
+ * For simple checksum algorithms (like crc), a seed value is all that needs to
+ * be carried; hash-like algorithms need their full state stored.
+ */
+
+struct bch2_checksum_state {
+ union {
+ u64 seed;
+ struct xxh64_state h64state;
+ };
+ unsigned int type;
+};
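+
+/*
+ * Typical use of the helpers below (this is the pattern bch2_checksum()
+ * follows):
+ *
+ *	struct bch2_checksum_state state;
+ *
+ *	state.type = type;
+ *	bch2_checksum_init(&state);
+ *	bch2_checksum_update(&state, data, len);
+ *	csum = bch2_checksum_final(&state);
+ */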
+
+static void bch2_checksum_init(struct bch2_checksum_state *state)
+{
+ switch (state->type) {
+ case BCH_CSUM_none:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_crc64:
+ state->seed = 0;
+ break;
+ case BCH_CSUM_crc32c_nonzero:
+ state->seed = U32_MAX;
+ break;
+ case BCH_CSUM_crc64_nonzero:
+ state->seed = U64_MAX;
+ break;
+ case BCH_CSUM_xxhash:
+ xxh64_reset(&state->h64state, 0);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static u64 bch2_checksum_final(const struct bch2_checksum_state *state)
+{
+ switch (state->type) {
+ case BCH_CSUM_none:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_crc64:
+ return state->seed;
+ case BCH_CSUM_crc32c_nonzero:
+ return state->seed ^ U32_MAX;
+ case BCH_CSUM_crc64_nonzero:
+ return state->seed ^ U64_MAX;
+ case BCH_CSUM_xxhash:
+ return xxh64_digest(&state->h64state);
+ default:
+ BUG();
+ }
+}
+
+static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len)
+{
+ switch (state->type) {
+ case BCH_CSUM_none:
+ return;
+ case BCH_CSUM_crc32c_nonzero:
+ case BCH_CSUM_crc32c:
+ state->seed = crc32c(state->seed, data, len);
+ break;
+ case BCH_CSUM_crc64_nonzero:
+ case BCH_CSUM_crc64:
+ state->seed = crc64_be(state->seed, data, len);
+ break;
+ case BCH_CSUM_xxhash:
+ xxh64_update(&state->h64state, data, len);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
+ struct nonce nonce,
+ struct scatterlist *sg, size_t len)
+{
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ int ret;
+
+ skcipher_request_set_sync_tfm(req, tfm);
+ skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
+
+ ret = crypto_skcipher_encrypt(req);
+ if (ret)
+ pr_err("got error %i from crypto_skcipher_encrypt()", ret);
+
+ return ret;
+}
+
+static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
+ struct nonce nonce,
+ void *buf, size_t len)
+{
+ if (!is_vmalloc_addr(buf)) {
+ struct scatterlist sg;
+
+ sg_init_table(&sg, 1);
+ sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf));
+ return do_encrypt_sg(tfm, nonce, &sg, len);
+ } else {
+ unsigned pages = buf_pages(buf, len);
+ struct scatterlist *sg;
+ size_t orig_len = len;
+ int ret, i;
+
+ sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL);
+ if (!sg)
+ return -BCH_ERR_ENOMEM_do_encrypt;
+
+ sg_init_table(sg, pages);
+
+ for (i = 0; i < pages; i++) {
+ unsigned offset = offset_in_page(buf);
+ unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset);
+
+ sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset);
+ buf += pg_len;
+ len -= pg_len;
+ }
+
+ ret = do_encrypt_sg(tfm, nonce, sg, orig_len);
+ kfree(sg);
+ return ret;
+ }
+}
+
+int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
+ void *buf, size_t len)
+{
+ struct crypto_sync_skcipher *chacha20 =
+ crypto_alloc_sync_skcipher("chacha20", 0, 0);
+ int ret;
+
+ ret = PTR_ERR_OR_ZERO(chacha20);
+ if (ret) {
+ pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret));
+ return ret;
+ }
+
+ ret = crypto_skcipher_setkey(&chacha20->base,
+ (void *) key, sizeof(*key));
+ if (ret) {
+ pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret));
+ goto err;
+ }
+
+ ret = do_encrypt(chacha20, nonce, buf, len);
+err:
+ crypto_free_sync_skcipher(chacha20);
+ return ret;
+}
+
+static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
+ struct nonce nonce)
+{
+ u8 key[POLY1305_KEY_SIZE];
+ int ret;
+
+ nonce.d[3] ^= BCH_NONCE_POLY;
+
+ memset(key, 0, sizeof(key));
+ ret = do_encrypt(c->chacha20, nonce, key, sizeof(key));
+ if (ret)
+ return ret;
+
+ desc->tfm = c->poly1305;
+ crypto_shash_init(desc);
+ crypto_shash_update(desc, key, sizeof(key));
+ return 0;
+}
+
+struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
+ struct nonce nonce, const void *data, size_t len)
+{
+ switch (type) {
+ case BCH_CSUM_none:
+ case BCH_CSUM_crc32c_nonzero:
+ case BCH_CSUM_crc64_nonzero:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_xxhash:
+ case BCH_CSUM_crc64: {
+ struct bch2_checksum_state state;
+
+ state.type = type;
+
+ bch2_checksum_init(&state);
+ bch2_checksum_update(&state, data, len);
+
+ return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
+ }
+
+ case BCH_CSUM_chacha20_poly1305_80:
+ case BCH_CSUM_chacha20_poly1305_128: {
+ SHASH_DESC_ON_STACK(desc, c->poly1305);
+ u8 digest[POLY1305_DIGEST_SIZE];
+ struct bch_csum ret = { 0 };
+
+ gen_poly_key(c, desc, nonce);
+
+ crypto_shash_update(desc, data, len);
+ crypto_shash_final(desc, digest);
+
+ memcpy(&ret, digest, bch_crc_bytes[type]);
+ return ret;
+ }
+ default:
+ BUG();
+ }
+}
+
+int bch2_encrypt(struct bch_fs *c, unsigned type,
+ struct nonce nonce, void *data, size_t len)
+{
+ if (!bch2_csum_type_is_encryption(type))
+ return 0;
+
+ return do_encrypt(c->chacha20, nonce, data, len);
+}
+
+static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
+ struct nonce nonce, struct bio *bio,
+ struct bvec_iter *iter)
+{
+ struct bio_vec bv;
+
+ switch (type) {
+ case BCH_CSUM_none:
+ return (struct bch_csum) { 0 };
+ case BCH_CSUM_crc32c_nonzero:
+ case BCH_CSUM_crc64_nonzero:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_xxhash:
+ case BCH_CSUM_crc64: {
+ struct bch2_checksum_state state;
+
+ state.type = type;
+ bch2_checksum_init(&state);
+
+#ifdef CONFIG_HIGHMEM
+ __bio_for_each_segment(bv, bio, *iter, *iter) {
+ void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
+
+ bch2_checksum_update(&state, p, bv.bv_len);
+ kunmap_local(p);
+ }
+#else
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
+ bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset,
+ bv.bv_len);
+#endif
+ return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
+ }
+
+ case BCH_CSUM_chacha20_poly1305_80:
+ case BCH_CSUM_chacha20_poly1305_128: {
+ SHASH_DESC_ON_STACK(desc, c->poly1305);
+ u8 digest[POLY1305_DIGEST_SIZE];
+ struct bch_csum ret = { 0 };
+
+ gen_poly_key(c, desc, nonce);
+
+#ifdef CONFIG_HIGHMEM
+ __bio_for_each_segment(bv, bio, *iter, *iter) {
+ void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
+
+ crypto_shash_update(desc, p, bv.bv_len);
+ kunmap_local(p);
+ }
+#else
+ __bio_for_each_bvec(bv, bio, *iter, *iter)
+ crypto_shash_update(desc,
+ page_address(bv.bv_page) + bv.bv_offset,
+ bv.bv_len);
+#endif
+ crypto_shash_final(desc, digest);
+
+ memcpy(&ret, digest, bch_crc_bytes[type]);
+ return ret;
+ }
+ default:
+ BUG();
+ }
+}
+
+struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
+ struct nonce nonce, struct bio *bio)
+{
+ struct bvec_iter iter = bio->bi_iter;
+
+ return __bch2_checksum_bio(c, type, nonce, bio, &iter);
+}
+
+int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
+ struct nonce nonce, struct bio *bio)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ struct scatterlist sgl[16], *sg = sgl;
+ size_t bytes = 0;
+ int ret = 0;
+
+ if (!bch2_csum_type_is_encryption(type))
+ return 0;
+
+ sg_init_table(sgl, ARRAY_SIZE(sgl));
+
+ bio_for_each_segment(bv, bio, iter) {
+ if (sg == sgl + ARRAY_SIZE(sgl)) {
+ sg_mark_end(sg - 1);
+
+ ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+ if (ret)
+ return ret;
+
+ nonce = nonce_add(nonce, bytes);
+ bytes = 0;
+
+ sg_init_table(sgl, ARRAY_SIZE(sgl));
+ sg = sgl;
+ }
+
+ sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
+ bytes += bv.bv_len;
+ }
+
+ sg_mark_end(sg - 1);
+ return do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+}
+
+struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
+ struct bch_csum b, size_t b_len)
+{
+ struct bch2_checksum_state state;
+
+ state.type = type;
+ bch2_checksum_init(&state);
+ state.seed = le64_to_cpu(a.lo);
+
+ BUG_ON(!bch2_checksum_mergeable(type));
+
+ while (b_len) {
+ unsigned page_len = min_t(unsigned, b_len, PAGE_SIZE);
+
+ bch2_checksum_update(&state,
+ page_address(ZERO_PAGE(0)), page_len);
+ b_len -= page_len;
+ }
+ a.lo = cpu_to_le64(bch2_checksum_final(&state));
+ a.lo ^= b.lo;
+ a.hi ^= b.hi;
+ return a;
+}
+
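+/*
+ * Re-checksum a bio in up to three pieces (len_a, len_b, and the remainder),
+ * first verifying that the combined result still matches crc_old, then
+ * returning new crcs for the first two pieces:
+ */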
+int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
+ struct bversion version,
+ struct bch_extent_crc_unpacked crc_old,
+ struct bch_extent_crc_unpacked *crc_a,
+ struct bch_extent_crc_unpacked *crc_b,
+ unsigned len_a, unsigned len_b,
+ unsigned new_csum_type)
+{
+ struct bvec_iter iter = bio->bi_iter;
+ struct nonce nonce = extent_nonce(version, crc_old);
+ struct bch_csum merged = { 0 };
+ struct crc_split {
+ struct bch_extent_crc_unpacked *crc;
+ unsigned len;
+ unsigned csum_type;
+ struct bch_csum csum;
+ } splits[3] = {
+ { crc_a, len_a, new_csum_type, { 0 }},
+ { crc_b, len_b, new_csum_type, { 0 } },
+ { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } },
+ }, *i;
+ bool mergeable = crc_old.csum_type == new_csum_type &&
+ bch2_checksum_mergeable(new_csum_type);
+ unsigned crc_nonce = crc_old.nonce;
+
+ BUG_ON(len_a + len_b > bio_sectors(bio));
+ BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
+ BUG_ON(crc_is_compressed(crc_old));
+ BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
+ bch2_csum_type_is_encryption(new_csum_type));
+
+ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
+ iter.bi_size = i->len << 9;
+ if (mergeable || i->crc)
+ i->csum = __bch2_checksum_bio(c, i->csum_type,
+ nonce, bio, &iter);
+ else
+ bio_advance_iter(bio, &iter, i->len << 9);
+ nonce = nonce_add(nonce, i->len << 9);
+ }
+
+ if (mergeable)
+ for (i = splits; i < splits + ARRAY_SIZE(splits); i++)
+ merged = bch2_checksum_merge(new_csum_type, merged,
+ i->csum, i->len << 9);
+ else
+ merged = bch2_checksum_bio(c, crc_old.csum_type,
+ extent_nonce(version, crc_old), bio);
+
+ if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
+ bch_err(c, "checksum error in %s() (memory corruption or bug?)\n"
+ "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)",
+ __func__,
+ crc_old.csum.hi,
+ crc_old.csum.lo,
+ merged.hi,
+ merged.lo,
+ bch2_csum_types[crc_old.csum_type],
+ bch2_csum_types[new_csum_type]);
+ return -EIO;
+ }
+
+ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
+ if (i->crc)
+ *i->crc = (struct bch_extent_crc_unpacked) {
+ .csum_type = i->csum_type,
+ .compression_type = crc_old.compression_type,
+ .compressed_size = i->len,
+ .uncompressed_size = i->len,
+ .offset = 0,
+ .live_size = i->len,
+ .nonce = crc_nonce,
+ .csum = i->csum,
+ };
+
+ if (bch2_csum_type_is_encryption(new_csum_type))
+ crc_nonce += i->len;
+ }
+
+ return 0;
+}
+
+/* BCH_SB_FIELD_crypt: */
+
+static int bch2_sb_crypt_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+ if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
+ prt_printf(err, "wrong size (got %zu should be %zu)",
+ vstruct_bytes(&crypt->field), sizeof(*crypt));
+ return -BCH_ERR_invalid_sb_crypt;
+ }
+
+ if (BCH_CRYPT_KDF_TYPE(crypt)) {
+ prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
+ return -BCH_ERR_invalid_sb_crypt;
+ }
+
+ return 0;
+}
+
+static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+ prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt));
+ prt_newline(out);
+ prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt));
+ prt_newline(out);
+ prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt));
+ prt_newline(out);
+ prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt));
+ prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
+ .validate = bch2_sb_crypt_validate,
+ .to_text = bch2_sb_crypt_to_text,
+};
+
+#ifdef __KERNEL__
+static int __bch2_request_key(char *key_description, struct bch_key *key)
+{
+ struct key *keyring_key;
+ const struct user_key_payload *ukp;
+ int ret;
+
+ keyring_key = request_key(&key_type_user, key_description, NULL);
+ if (IS_ERR(keyring_key))
+ return PTR_ERR(keyring_key);
+
+ down_read(&keyring_key->sem);
+ ukp = dereference_key_locked(keyring_key);
+ if (ukp->datalen == sizeof(*key)) {
+ memcpy(key, ukp->data, ukp->datalen);
+ ret = 0;
+ } else {
+ ret = -EINVAL;
+ }
+ up_read(&keyring_key->sem);
+ key_put(keyring_key);
+
+ return ret;
+}
+#else
+#include <keyutils.h>
+
+static int __bch2_request_key(char *key_description, struct bch_key *key)
+{
+ key_serial_t key_id;
+
+ key_id = request_key("user", key_description, NULL,
+ KEY_SPEC_SESSION_KEYRING);
+ if (key_id >= 0)
+ goto got_key;
+
+ key_id = request_key("user", key_description, NULL,
+ KEY_SPEC_USER_KEYRING);
+ if (key_id >= 0)
+ goto got_key;
+
+ key_id = request_key("user", key_description, NULL,
+ KEY_SPEC_USER_SESSION_KEYRING);
+ if (key_id >= 0)
+ goto got_key;
+
+ return -errno;
+got_key:
+
+ if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key))
+ return -1;
+
+ return 0;
+}
+
+#include "../crypto.h"
+#endif
+
+int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+{
+ struct printbuf key_description = PRINTBUF;
+ int ret;
+
+ prt_printf(&key_description, "bcachefs:");
+ pr_uuid(&key_description, sb->user_uuid.b);
+
+ ret = __bch2_request_key(key_description.buf, key);
+ printbuf_exit(&key_description);
+
+#ifndef __KERNEL__
+ if (ret) {
+ char *passphrase = read_passphrase("Enter passphrase: ");
+ struct bch_encrypted_key sb_key;
+
+ bch2_passphrase_check(sb, passphrase,
+ key, &sb_key);
+ ret = 0;
+ }
+#endif
+
+ /* stash with memfd, pass memfd fd to mount */
+
+ return ret;
+}
+
+#ifndef __KERNEL__
+int bch2_revoke_key(struct bch_sb *sb)
+{
+ key_serial_t key_id;
+ struct printbuf key_description = PRINTBUF;
+
+ prt_printf(&key_description, "bcachefs:");
+ pr_uuid(&key_description, sb->user_uuid.b);
+
+ key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING);
+ printbuf_exit(&key_description);
+ if (key_id < 0)
+ return errno;
+
+ keyctl_revoke(key_id);
+
+ return 0;
+}
+#endif
+
+int bch2_decrypt_sb_key(struct bch_fs *c,
+ struct bch_sb_field_crypt *crypt,
+ struct bch_key *key)
+{
+ struct bch_encrypted_key sb_key = crypt->key;
+ struct bch_key user_key;
+ int ret = 0;
+
+ /* is key encrypted? */
+ if (!bch2_key_is_encrypted(&sb_key))
+ goto out;
+
+ ret = bch2_request_key(c->disk_sb.sb, &user_key);
+ if (ret) {
+ bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
+ goto err;
+ }
+
+ /* decrypt real key: */
+ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
+ &sb_key, sizeof(sb_key));
+ if (ret)
+ goto err;
+
+ if (bch2_key_is_encrypted(&sb_key)) {
+ bch_err(c, "incorrect encryption key");
+ ret = -EINVAL;
+ goto err;
+ }
+out:
+ *key = sb_key.key;
+err:
+ memzero_explicit(&sb_key, sizeof(sb_key));
+ memzero_explicit(&user_key, sizeof(user_key));
+ return ret;
+}
+
+static int bch2_alloc_ciphers(struct bch_fs *c)
+{
+ int ret;
+
+ if (!c->chacha20)
+ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
+ ret = PTR_ERR_OR_ZERO(c->chacha20);
+
+ if (ret) {
+ bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret));
+ return ret;
+ }
+
+ if (!c->poly1305)
+ c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
+ ret = PTR_ERR_OR_ZERO(c->poly1305);
+
+ if (ret) {
+ bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret));
+ return ret;
+ }
+
+ return 0;
+}
+
+int bch2_disable_encryption(struct bch_fs *c)
+{
+ struct bch_sb_field_crypt *crypt;
+ struct bch_key key;
+ int ret = -EINVAL;
+
+ mutex_lock(&c->sb_lock);
+
+ crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
+ if (!crypt)
+ goto out;
+
+ /* is key encrypted? */
+ ret = 0;
+ if (bch2_key_is_encrypted(&crypt->key))
+ goto out;
+
+ ret = bch2_decrypt_sb_key(c, crypt, &key);
+ if (ret)
+ goto out;
+
+ crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC);
+ crypt->key.key = key;
+
+ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
+ bch2_write_super(c);
+out:
+ mutex_unlock(&c->sb_lock);
+
+ return ret;
+}
+
+int bch2_enable_encryption(struct bch_fs *c, bool keyed)
+{
+ struct bch_encrypted_key key;
+ struct bch_key user_key;
+ struct bch_sb_field_crypt *crypt;
+ int ret = -EINVAL;
+
+ mutex_lock(&c->sb_lock);
+
+ /* Do we already have an encryption key? */
+ if (bch2_sb_field_get(c->disk_sb.sb, crypt))
+ goto err;
+
+ ret = bch2_alloc_ciphers(c);
+ if (ret)
+ goto err;
+
+ key.magic = cpu_to_le64(BCH_KEY_MAGIC);
+ get_random_bytes(&key.key, sizeof(key.key));
+
+ if (keyed) {
+ ret = bch2_request_key(c->disk_sb.sb, &user_key);
+ if (ret) {
+ bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret));
+ goto err;
+ }
+
+ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
+ &key, sizeof(key));
+ if (ret)
+ goto err;
+ }
+
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
+ (void *) &key.key, sizeof(key.key));
+ if (ret)
+ goto err;
+
+ crypt = bch2_sb_field_resize(&c->disk_sb, crypt,
+ sizeof(*crypt) / sizeof(u64));
+ if (!crypt) {
+ ret = -BCH_ERR_ENOSPC_sb_crypt;
+ goto err;
+ }
+
+ crypt->key = key;
+
+ /* write superblock */
+ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1);
+ bch2_write_super(c);
+err:
+ mutex_unlock(&c->sb_lock);
+ memzero_explicit(&user_key, sizeof(user_key));
+ memzero_explicit(&key, sizeof(key));
+ return ret;
+}
+
+void bch2_fs_encryption_exit(struct bch_fs *c)
+{
+ if (!IS_ERR_OR_NULL(c->poly1305))
+ crypto_free_shash(c->poly1305);
+ if (!IS_ERR_OR_NULL(c->chacha20))
+ crypto_free_sync_skcipher(c->chacha20);
+ if (!IS_ERR_OR_NULL(c->sha256))
+ crypto_free_shash(c->sha256);
+}
+
+int bch2_fs_encryption_init(struct bch_fs *c)
+{
+ struct bch_sb_field_crypt *crypt;
+ struct bch_key key;
+ int ret = 0;
+
+ c->sha256 = crypto_alloc_shash("sha256", 0, 0);
+ ret = PTR_ERR_OR_ZERO(c->sha256);
+ if (ret) {
+ bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret));
+ goto out;
+ }
+
+ crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
+ if (!crypt)
+ goto out;
+
+ ret = bch2_alloc_ciphers(c);
+ if (ret)
+ goto out;
+
+ ret = bch2_decrypt_sb_key(c, crypt, &key);
+ if (ret)
+ goto out;
+
+ ret = crypto_skcipher_setkey(&c->chacha20->base,
+ (void *) &key.key, sizeof(key.key));
+ if (ret)
+ goto out;
+out:
+ memzero_explicit(&key, sizeof(key));
+ return ret;
+}
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
new file mode 100644
index 000000000000..13998388c545
--- /dev/null
+++ b/fs/bcachefs/checksum.h
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CHECKSUM_H
+#define _BCACHEFS_CHECKSUM_H
+
+#include "bcachefs.h"
+#include "extents_types.h"
+#include "super-io.h"
+
+#include <linux/crc64.h>
+#include <crypto/chacha.h>
+
+static inline bool bch2_checksum_mergeable(unsigned type)
+{
+ switch (type) {
+ case BCH_CSUM_none:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_crc64:
+ return true;
+ default:
+ return false;
+ }
+}
+
+struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum,
+ struct bch_csum, size_t);
+
+#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28)
+#define BCH_NONCE_BTREE cpu_to_le32(2 << 28)
+#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28)
+#define BCH_NONCE_PRIO cpu_to_le32(4 << 28)
+#define BCH_NONCE_POLY cpu_to_le32(1 << 31)
+
+struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
+ const void *, size_t);
+
+/*
+ * This is used for various on disk data structures - bch_sb, prio_set, bset,
+ * jset: The checksum is _always_ the first field of these structs
+ */
+#define csum_vstruct(_c, _type, _nonce, _i) \
+({ \
+ const void *_start = ((const void *) (_i)) + sizeof((_i)->csum);\
+ \
+ bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\
+})
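+
+/*
+ * e.g. (names hypothetical) for any such struct pointed to by i, with ->csum
+ * as its first field:
+ *
+ *	i->csum = csum_vstruct(c, csum_type, nonce, i);
+ *
+ * checksums everything from just past ->csum up to vstruct_end(i).
+ */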
+
+int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
+int bch2_request_key(struct bch_sb *, struct bch_key *);
+#ifndef __KERNEL__
+int bch2_revoke_key(struct bch_sb *);
+#endif
+
+int bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
+ void *data, size_t);
+
+struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
+ struct nonce, struct bio *);
+
+int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
+ struct bch_extent_crc_unpacked,
+ struct bch_extent_crc_unpacked *,
+ struct bch_extent_crc_unpacked *,
+ unsigned, unsigned, unsigned);
+
+int __bch2_encrypt_bio(struct bch_fs *, unsigned,
+ struct nonce, struct bio *);
+
+static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
+ struct nonce nonce, struct bio *bio)
+{
+ return bch2_csum_type_is_encryption(type)
+ ? __bch2_encrypt_bio(c, type, nonce, bio)
+ : 0;
+}
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
+
+int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
+ struct bch_key *);
+
+int bch2_disable_encryption(struct bch_fs *);
+int bch2_enable_encryption(struct bch_fs *, bool);
+
+void bch2_fs_encryption_exit(struct bch_fs *);
+int bch2_fs_encryption_init(struct bch_fs *);
+
+static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
+ bool data)
+{
+ switch (type) {
+ case BCH_CSUM_OPT_none:
+ return BCH_CSUM_none;
+ case BCH_CSUM_OPT_crc32c:
+ return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero;
+ case BCH_CSUM_OPT_crc64:
+ return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero;
+ case BCH_CSUM_OPT_xxhash:
+ return BCH_CSUM_xxhash;
+ default:
+ BUG();
+ }
+}
+
+static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
+ struct bch_io_opts opts)
+{
+ if (opts.nocow)
+ return 0;
+
+ if (c->sb.encryption_type)
+ return c->opts.wide_macs
+ ? BCH_CSUM_chacha20_poly1305_128
+ : BCH_CSUM_chacha20_poly1305_80;
+
+ return bch2_csum_opt_to_type(opts.data_checksum, true);
+}
+
+static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
+{
+ if (c->sb.encryption_type)
+ return BCH_CSUM_chacha20_poly1305_128;
+
+ return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
+}
+
+static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
+ unsigned type)
+{
+ if (type >= BCH_CSUM_NR)
+ return false;
+
+ if (bch2_csum_type_is_encryption(type) && !c->chacha20)
+ return false;
+
+ return true;
+}
+
+/* returns true if not equal */
+static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
+{
+ /*
+ * XXX: need some way of preventing the compiler from optimizing this
+ * into a form that isn't constant time..
+ */
+ return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0;
+}
+
+/* for skipping ahead and encrypting/decrypting at an offset: */
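+/* (e.g. extent_nonce() below passes crc.nonce << 9 to skip whole sectors) */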
+static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
+{
+ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
+
+ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
+ return nonce;
+}
+
+static inline struct nonce null_nonce(void)
+{
+ struct nonce ret;
+
+ memset(&ret, 0, sizeof(ret));
+ return ret;
+}
+
+static inline struct nonce extent_nonce(struct bversion version,
+ struct bch_extent_crc_unpacked crc)
+{
+ unsigned compression_type = crc_is_compressed(crc)
+ ? crc.compression_type
+ : 0;
+ unsigned size = compression_type ? crc.uncompressed_size : 0;
+ struct nonce nonce = (struct nonce) {{
+ [0] = cpu_to_le32(size << 22),
+ [1] = cpu_to_le32(version.lo),
+ [2] = cpu_to_le32(version.lo >> 32),
+ [3] = cpu_to_le32(version.hi|
+ (compression_type << 24))^BCH_NONCE_EXTENT,
+ }};
+
+ return nonce_add(nonce, crc.nonce << 9);
+}
+
+static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key)
+{
+ return le64_to_cpu(key->magic) != BCH_KEY_MAGIC;
+}
+
+static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb)
+{
+ __le64 magic = __bch2_sb_magic(sb);
+
+ return (struct nonce) {{
+ [0] = 0,
+ [1] = 0,
+ [2] = ((__le32 *) &magic)[0],
+ [3] = ((__le32 *) &magic)[1],
+ }};
+}
+
+static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c)
+{
+ __le64 magic = bch2_sb_magic(c);
+
+ return (struct nonce) {{
+ [0] = 0,
+ [1] = 0,
+ [2] = ((__le32 *) &magic)[0],
+ [3] = ((__le32 *) &magic)[1],
+ }};
+}
+
+#endif /* _BCACHEFS_CHECKSUM_H */
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
new file mode 100644
index 000000000000..f41889093a2c
--- /dev/null
+++ b/fs/bcachefs/clock.c
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "clock.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/preempt.h>
+
+static inline long io_timer_cmp(io_timer_heap *h,
+ struct io_timer *l,
+ struct io_timer *r)
+{
+ return l->expire - r->expire;
+}
+
+void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
+{
+ size_t i;
+
+ spin_lock(&clock->timer_lock);
+
+ if (time_after_eq((unsigned long) atomic64_read(&clock->now),
+ timer->expire)) {
+ spin_unlock(&clock->timer_lock);
+ timer->fn(timer);
+ return;
+ }
+
+ for (i = 0; i < clock->timers.used; i++)
+ if (clock->timers.data[i] == timer)
+ goto out;
+
+ BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL));
+out:
+ spin_unlock(&clock->timer_lock);
+}
+
+void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
+{
+ size_t i;
+
+ spin_lock(&clock->timer_lock);
+
+ for (i = 0; i < clock->timers.used; i++)
+ if (clock->timers.data[i] == timer) {
+ heap_del(&clock->timers, i, io_timer_cmp, NULL);
+ break;
+ }
+
+ spin_unlock(&clock->timer_lock);
+}
+
+struct io_clock_wait {
+ struct io_timer io_timer;
+ struct timer_list cpu_timer;
+ struct task_struct *task;
+ int expired;
+};
+
+static void io_clock_wait_fn(struct io_timer *timer)
+{
+ struct io_clock_wait *wait = container_of(timer,
+ struct io_clock_wait, io_timer);
+
+ wait->expired = 1;
+ wake_up_process(wait->task);
+}
+
+static void io_clock_cpu_timeout(struct timer_list *timer)
+{
+ struct io_clock_wait *wait = container_of(timer,
+ struct io_clock_wait, cpu_timer);
+
+ wait->expired = 1;
+ wake_up_process(wait->task);
+}
+
+void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
+{
+ struct io_clock_wait wait;
+
+ /* XXX: calculate sleep time rigorously */
+ wait.io_timer.expire = until;
+ wait.io_timer.fn = io_clock_wait_fn;
+ wait.task = current;
+ wait.expired = 0;
+ bch2_io_timer_add(clock, &wait.io_timer);
+
+ schedule();
+
+ bch2_io_timer_del(clock, &wait.io_timer);
+}
+
+void bch2_kthread_io_clock_wait(struct io_clock *clock,
+ unsigned long io_until,
+ unsigned long cpu_timeout)
+{
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
+ struct io_clock_wait wait;
+
+ wait.io_timer.expire = io_until;
+ wait.io_timer.fn = io_clock_wait_fn;
+ wait.task = current;
+ wait.expired = 0;
+ bch2_io_timer_add(clock, &wait.io_timer);
+
+ timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
+
+ if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
+ mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (kthread && kthread_should_stop())
+ break;
+
+ if (wait.expired)
+ break;
+
+ schedule();
+ try_to_freeze();
+ }
+
+ __set_current_state(TASK_RUNNING);
+ del_timer_sync(&wait.cpu_timer);
+ destroy_timer_on_stack(&wait.cpu_timer);
+ bch2_io_timer_del(clock, &wait.io_timer);
+}
+
+static struct io_timer *get_expired_timer(struct io_clock *clock,
+ unsigned long now)
+{
+ struct io_timer *ret = NULL;
+
+ spin_lock(&clock->timer_lock);
+
+ if (clock->timers.used &&
+ time_after_eq(now, clock->timers.data[0]->expire))
+ heap_pop(&clock->timers, ret, io_timer_cmp, NULL);
+
+ spin_unlock(&clock->timer_lock);
+
+ return ret;
+}
+
+void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
+{
+ struct io_timer *timer;
+ unsigned long now = atomic64_add_return(sectors, &clock->now);
+
+ while ((timer = get_expired_timer(clock, now)))
+ timer->fn(timer);
+}
+
+void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
+{
+ unsigned long now;
+ unsigned i;
+
+ out->atomic++;
+ spin_lock(&clock->timer_lock);
+ now = atomic64_read(&clock->now);
+
+ for (i = 0; i < clock->timers.used; i++)
+ prt_printf(out, "%ps:\t%li\n",
+ clock->timers.data[i]->fn,
+ clock->timers.data[i]->expire - now);
+ spin_unlock(&clock->timer_lock);
+ --out->atomic;
+}
+
+void bch2_io_clock_exit(struct io_clock *clock)
+{
+ free_heap(&clock->timers);
+ free_percpu(clock->pcpu_buf);
+}
+
+int bch2_io_clock_init(struct io_clock *clock)
+{
+ atomic64_set(&clock->now, 0);
+ spin_lock_init(&clock->timer_lock);
+
+ clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
+
+ clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
+ if (!clock->pcpu_buf)
+ return -BCH_ERR_ENOMEM_io_clock_init;
+
+ if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL))
+ return -BCH_ERR_ENOMEM_io_clock_init;
+
+ return 0;
+}
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
new file mode 100644
index 000000000000..70a0f7436c84
--- /dev/null
+++ b/fs/bcachefs/clock.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CLOCK_H
+#define _BCACHEFS_CLOCK_H
+
+void bch2_io_timer_add(struct io_clock *, struct io_timer *);
+void bch2_io_timer_del(struct io_clock *, struct io_timer *);
+void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
+ unsigned long);
+
+void __bch2_increment_clock(struct io_clock *, unsigned);
+
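+/*
+ * Sectors are batched in a percpu counter and only folded into clock->now (via
+ * __bch2_increment_clock()) once a cpu has accumulated IO_CLOCK_PCPU_SECTORS,
+ * so the clock may lag by up to clock->max_slop sectors:
+ */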
+static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors,
+ int rw)
+{
+ struct io_clock *clock = &c->io_clock[rw];
+
+ if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >=
+ IO_CLOCK_PCPU_SECTORS))
+ __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0));
+}
+
+void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
+
+#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
+({ \
+ long __ret = timeout; \
+ might_sleep(); \
+ if (!___wait_cond_timeout(condition)) \
+ __ret = __wait_event_timeout(wq, condition, timeout); \
+ __ret; \
+})
+
+void bch2_io_timers_to_text(struct printbuf *, struct io_clock *);
+
+void bch2_io_clock_exit(struct io_clock *);
+int bch2_io_clock_init(struct io_clock *);
+
+#endif /* _BCACHEFS_CLOCK_H */
diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h
new file mode 100644
index 000000000000..5fae0012d808
--- /dev/null
+++ b/fs/bcachefs/clock_types.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CLOCK_TYPES_H
+#define _BCACHEFS_CLOCK_TYPES_H
+
+#include "util.h"
+
+#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3)
+
+/*
+ * Clocks/timers in units of sectors of IO:
+ *
+ * Note - they use percpu batching, so they're only approximate.
+ */
+
+struct io_timer;
+typedef void (*io_timer_fn)(struct io_timer *);
+
+struct io_timer {
+ io_timer_fn fn;
+ unsigned long expire;
+};
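+
+/*
+ * Illustrative use (hypothetical callback), arming a timer to fire after
+ * another 1024 sectors of IO on @clock:
+ *
+ *	static void my_fn(struct io_timer *t) { ... }
+ *
+ *	struct io_timer t = {
+ *		.fn	= my_fn,
+ *		.expire	= atomic64_read(&clock->now) + 1024,
+ *	};
+ *	bch2_io_timer_add(clock, &t);
+ */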
+
+/* Amount to buffer up on a percpu counter */
+#define IO_CLOCK_PCPU_SECTORS 128
+
+typedef HEAP(struct io_timer *) io_timer_heap;
+
+struct io_clock {
+ atomic64_t now;
+ u16 __percpu *pcpu_buf;
+ unsigned max_slop;
+
+ spinlock_t timer_lock;
+ io_timer_heap timers;
+};
+
+#endif /* _BCACHEFS_CLOCK_TYPES_H */
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
new file mode 100644
index 000000000000..51af8ea230ed
--- /dev/null
+++ b/fs/bcachefs/compress.c
@@ -0,0 +1,732 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "checksum.h"
+#include "compress.h"
+#include "extents.h"
+#include "super-io.h"
+
+#include <linux/lz4.h>
+#include <linux/zlib.h>
+#include <linux/zstd.h>
+
+/* Bounce buffer: */
+struct bbuf {
+ void *b;
+ enum {
+ BB_NONE,
+ BB_VMAP,
+ BB_KMALLOC,
+ BB_MEMPOOL,
+ } type;
+ int rw;
+};
+
+static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
+{
+ void *b;
+
+ BUG_ON(size > c->opts.encoded_extent_max);
+
+ b = kmalloc(size, GFP_NOFS|__GFP_NOWARN);
+ if (b)
+ return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw };
+
+ b = mempool_alloc(&c->compression_bounce[rw], GFP_NOFS);
+ if (b)
+ return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
+
+ BUG();
+}
+
+static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ void *expected_start = NULL;
+
+ __bio_for_each_bvec(bv, bio, iter, start) {
+ if (expected_start &&
+ expected_start != page_address(bv.bv_page) + bv.bv_offset)
+ return false;
+
+ expected_start = page_address(bv.bv_page) +
+ bv.bv_offset + bv.bv_len;
+ }
+
+ return true;
+}
+
+static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
+ struct bvec_iter start, int rw)
+{
+ struct bbuf ret;
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ unsigned nr_pages = 0;
+ struct page *stack_pages[16];
+ struct page **pages = NULL;
+ void *data;
+
+ BUG_ON(start.bi_size > c->opts.encoded_extent_max);
+
+ if (!PageHighMem(bio_iter_page(bio, start)) &&
+ bio_phys_contig(bio, start))
+ return (struct bbuf) {
+ .b = page_address(bio_iter_page(bio, start)) +
+ bio_iter_offset(bio, start),
+ .type = BB_NONE, .rw = rw
+ };
+
+ /* check if we can map the pages contiguously: */
+ __bio_for_each_segment(bv, bio, iter, start) {
+ if (iter.bi_size != start.bi_size &&
+ bv.bv_offset)
+ goto bounce;
+
+ if (bv.bv_len < iter.bi_size &&
+ bv.bv_offset + bv.bv_len < PAGE_SIZE)
+ goto bounce;
+
+ nr_pages++;
+ }
+
+ BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages);
+
+ pages = nr_pages > ARRAY_SIZE(stack_pages)
+ ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS)
+ : stack_pages;
+ if (!pages)
+ goto bounce;
+
+ nr_pages = 0;
+ __bio_for_each_segment(bv, bio, iter, start)
+ pages[nr_pages++] = bv.bv_page;
+
+ data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (pages != stack_pages)
+ kfree(pages);
+
+ if (data)
+ return (struct bbuf) {
+ .b = data + bio_iter_offset(bio, start),
+ .type = BB_VMAP, .rw = rw
+ };
+bounce:
+ ret = __bounce_alloc(c, start.bi_size, rw);
+
+ if (rw == READ)
+ memcpy_from_bio(ret.b, bio, start);
+
+ return ret;
+}
+
+static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw)
+{
+ return __bio_map_or_bounce(c, bio, bio->bi_iter, rw);
+}
+
+static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf)
+{
+ switch (buf.type) {
+ case BB_NONE:
+ break;
+ case BB_VMAP:
+ vunmap((void *) ((unsigned long) buf.b & PAGE_MASK));
+ break;
+ case BB_KMALLOC:
+ kfree(buf.b);
+ break;
+ case BB_MEMPOOL:
+ mempool_free(buf.b, &c->compression_bounce[buf.rw]);
+ break;
+ }
+}
+
+static inline void zlib_set_workspace(z_stream *strm, void *workspace)
+{
+#ifdef __KERNEL__
+ strm->workspace = workspace;
+#endif
+}
+
+static int __bio_uncompress(struct bch_fs *c, struct bio *src,
+ void *dst_data, struct bch_extent_crc_unpacked crc)
+{
+ struct bbuf src_data = { NULL };
+ size_t src_len = src->bi_iter.bi_size;
+ size_t dst_len = crc.uncompressed_size << 9;
+ void *workspace;
+ int ret;
+
+ src_data = bio_map_or_bounce(c, src, READ);
+
+ switch (crc.compression_type) {
+ case BCH_COMPRESSION_TYPE_lz4_old:
+ case BCH_COMPRESSION_TYPE_lz4:
+ ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
+ src_len, dst_len, dst_len);
+ if (ret != dst_len)
+ goto err;
+ break;
+ case BCH_COMPRESSION_TYPE_gzip: {
+ z_stream strm = {
+ .next_in = src_data.b,
+ .avail_in = src_len,
+ .next_out = dst_data,
+ .avail_out = dst_len,
+ };
+
+ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS);
+
+ zlib_set_workspace(&strm, workspace);
+ zlib_inflateInit2(&strm, -MAX_WBITS);
+ ret = zlib_inflate(&strm, Z_FINISH);
+
+ mempool_free(workspace, &c->decompress_workspace);
+
+ if (ret != Z_STREAM_END)
+ goto err;
+ break;
+ }
+ case BCH_COMPRESSION_TYPE_zstd: {
+ ZSTD_DCtx *ctx;
+ size_t real_src_len = le32_to_cpup(src_data.b);
+
+ if (real_src_len > src_len - 4)
+ goto err;
+
+ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS);
+ ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
+
+ ret = zstd_decompress_dctx(ctx,
+ dst_data, dst_len,
+ src_data.b + 4, real_src_len);
+
+ mempool_free(workspace, &c->decompress_workspace);
+
+ if (ret != dst_len)
+ goto err;
+ break;
+ }
+ default:
+ BUG();
+ }
+ ret = 0;
+out:
+ bio_unmap_or_unbounce(c, src_data);
+ return ret;
+err:
+ ret = -EIO;
+ goto out;
+}
+
+int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
+ struct bch_extent_crc_unpacked *crc)
+{
+ struct bbuf data = { NULL };
+ size_t dst_len = crc->uncompressed_size << 9;
+
+ /* bio must own its pages: */
+ BUG_ON(!bio->bi_vcnt);
+ BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
+
+ if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max ||
+ crc->compressed_size << 9 > c->opts.encoded_extent_max) {
+ bch_err(c, "error rewriting existing data: extent too big");
+ return -EIO;
+ }
+
+ data = __bounce_alloc(c, dst_len, WRITE);
+
+ if (__bio_uncompress(c, bio, data.b, *crc)) {
+ if (!c->opts.no_data_io)
+ bch_err(c, "error rewriting existing data: decompression error");
+ bio_unmap_or_unbounce(c, data);
+ return -EIO;
+ }
+
+ /*
+ * XXX: don't have a good way to assert that the bio was allocated with
+ * enough space, we depend on bch2_move_extent doing the right thing
+ */
+ bio->bi_iter.bi_size = crc->live_size << 9;
+
+ memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));
+
+ crc->csum_type = 0;
+ crc->compression_type = 0;
+ crc->compressed_size = crc->live_size;
+ crc->uncompressed_size = crc->live_size;
+ crc->offset = 0;
+ crc->csum = (struct bch_csum) { 0, 0 };
+
+ bio_unmap_or_unbounce(c, data);
+ return 0;
+}
+
+int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
+ struct bio *dst, struct bvec_iter dst_iter,
+ struct bch_extent_crc_unpacked crc)
+{
+ struct bbuf dst_data = { NULL };
+ size_t dst_len = crc.uncompressed_size << 9;
+ int ret;
+
+ if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max ||
+ crc.compressed_size << 9 > c->opts.encoded_extent_max)
+ return -EIO;
+
+ dst_data = dst_len == dst_iter.bi_size
+ ? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
+ : __bounce_alloc(c, dst_len, WRITE);
+
+ ret = __bio_uncompress(c, src, dst_data.b, crc);
+ if (ret)
+ goto err;
+
+ if (dst_data.type != BB_NONE &&
+ dst_data.type != BB_VMAP)
+ memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9));
+err:
+ bio_unmap_or_unbounce(c, dst_data);
+ return ret;
+}
+
+static int attempt_compress(struct bch_fs *c,
+ void *workspace,
+ void *dst, size_t dst_len,
+ void *src, size_t src_len,
+ struct bch_compression_opt compression)
+{
+ enum bch_compression_type compression_type =
+ __bch2_compression_opt_to_type[compression.type];
+
+ switch (compression_type) {
+ case BCH_COMPRESSION_TYPE_lz4:
+ if (compression.level < LZ4HC_MIN_CLEVEL) {
+ int len = src_len;
+ int ret = LZ4_compress_destSize(
+ src, dst,
+ &len, dst_len,
+ workspace);
+ if (len < src_len)
+ return -len;
+
+ return ret;
+ } else {
+ int ret = LZ4_compress_HC(
+ src, dst,
+ src_len, dst_len,
+ compression.level,
+ workspace);
+
+ return ret ?: -1;
+ }
+ case BCH_COMPRESSION_TYPE_gzip: {
+ z_stream strm = {
+ .next_in = src,
+ .avail_in = src_len,
+ .next_out = dst,
+ .avail_out = dst_len,
+ };
+
+ zlib_set_workspace(&strm, workspace);
+ zlib_deflateInit2(&strm,
+ compression.level
+ ? clamp_t(unsigned, compression.level,
+ Z_BEST_SPEED, Z_BEST_COMPRESSION)
+ : Z_DEFAULT_COMPRESSION,
+ Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
+ Z_DEFAULT_STRATEGY);
+
+ if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
+ return 0;
+
+ if (zlib_deflateEnd(&strm) != Z_OK)
+ return 0;
+
+ return strm.total_out;
+ }
+ case BCH_COMPRESSION_TYPE_zstd: {
+ /*
+ * rescale: zstd's max compression level is 22, our max level is 15
+ * (so e.g. our level 10 maps to zstd level 15, our level 15 to 22)
+ */
+ unsigned level = min((compression.level * 3) / 2, zstd_max_clevel());
+ ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max);
+ ZSTD_CCtx *ctx = zstd_init_cctx(workspace, c->zstd_workspace_size);
+
+ /*
+ * ZSTD requires that when we decompress we pass in the exact
+ * compressed size - rounding it up to the nearest sector
+ * doesn't work, so we use the first 4 bytes of the buffer for
+ * that.
+ *
+ * Additionally, the ZSTD code seems to have a bug where it will
+ * write just past the end of the buffer - so subtract a fudge
+ * factor (7 bytes) from the dst buffer size to account for
+ * that.
+ */
+ size_t len = zstd_compress_cctx(ctx,
+ dst + 4, dst_len - 4 - 7,
+ src, src_len,
+ &params);
+ if (zstd_is_error(len))
+ return 0;
+
+ *((__le32 *) dst) = cpu_to_le32(len);
+ return len + 4;
+ }
+ default:
+ BUG();
+ }
+}
+
+static unsigned __bio_compress(struct bch_fs *c,
+ struct bio *dst, size_t *dst_len,
+ struct bio *src, size_t *src_len,
+ struct bch_compression_opt compression)
+{
+ struct bbuf src_data = { NULL }, dst_data = { NULL };
+ void *workspace;
+ enum bch_compression_type compression_type =
+ __bch2_compression_opt_to_type[compression.type];
+ unsigned pad;
+ int ret = 0;
+
+ BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR);
+ BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type]));
+
+ /* If it's only one block, don't bother trying to compress: */
+ if (src->bi_iter.bi_size <= c->opts.block_size)
+ return BCH_COMPRESSION_TYPE_incompressible;
+
+ dst_data = bio_map_or_bounce(c, dst, WRITE);
+ src_data = bio_map_or_bounce(c, src, READ);
+
+ workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOFS);
+
+ *src_len = src->bi_iter.bi_size;
+ *dst_len = dst->bi_iter.bi_size;
+
+ /*
+ * XXX: this algorithm sucks when the compression code doesn't tell us
+ * how much would fit, like LZ4 does:
+ */
+ while (1) {
+ if (*src_len <= block_bytes(c)) {
+ ret = -1;
+ break;
+ }
+
+ ret = attempt_compress(c, workspace,
+ dst_data.b, *dst_len,
+ src_data.b, *src_len,
+ compression);
+ if (ret > 0) {
+ *dst_len = ret;
+ ret = 0;
+ break;
+ }
+
+ /* Didn't fit: should we retry with a smaller amount? */
+ if (*src_len <= *dst_len) {
+ ret = -1;
+ break;
+ }
+
+ /*
+ * If ret is negative, it's a hint as to how much data would fit
+ */
+ BUG_ON(-ret >= *src_len);
+
+ if (ret < 0)
+ *src_len = -ret;
+ else
+ *src_len -= (*src_len - *dst_len) / 2;
+ *src_len = round_down(*src_len, block_bytes(c));
+ }
+
+ mempool_free(workspace, &c->compress_workspace[compression_type]);
+
+ if (ret)
+ goto err;
+
+ /* Didn't get smaller: */
+ if (round_up(*dst_len, block_bytes(c)) >= *src_len)
+ goto err;
+
+ pad = round_up(*dst_len, block_bytes(c)) - *dst_len;
+
+ memset(dst_data.b + *dst_len, 0, pad);
+ *dst_len += pad;
+
+ if (dst_data.type != BB_NONE &&
+ dst_data.type != BB_VMAP)
+ memcpy_to_bio(dst, dst->bi_iter, dst_data.b);
+
+ BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
+ BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
+ BUG_ON(*dst_len & (block_bytes(c) - 1));
+ BUG_ON(*src_len & (block_bytes(c) - 1));
+ ret = compression_type;
+out:
+ bio_unmap_or_unbounce(c, src_data);
+ bio_unmap_or_unbounce(c, dst_data);
+ return ret;
+err:
+ ret = BCH_COMPRESSION_TYPE_incompressible;
+ goto out;
+}
+
+unsigned bch2_bio_compress(struct bch_fs *c,
+ struct bio *dst, size_t *dst_len,
+ struct bio *src, size_t *src_len,
+ unsigned compression_opt)
+{
+ unsigned orig_dst = dst->bi_iter.bi_size;
+ unsigned orig_src = src->bi_iter.bi_size;
+ unsigned compression_type;
+
+ /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
+ src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
+ c->opts.encoded_extent_max);
+ /* Don't generate a bigger output than input: */
+ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+
+ compression_type =
+ __bio_compress(c, dst, dst_len, src, src_len,
+ bch2_compression_decode(compression_opt));
+
+ dst->bi_iter.bi_size = orig_dst;
+ src->bi_iter.bi_size = orig_src;
+ return compression_type;
+}
+
+static int __bch2_fs_compress_init(struct bch_fs *, u64);
+
+#define BCH_FEATURE_none 0
+
+static const unsigned bch2_compression_opt_to_feature[] = {
+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
+ BCH_COMPRESSION_OPTS()
+#undef x
+};
+
+#undef BCH_FEATURE_none
+
+static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
+{
+ int ret = 0;
+
+ if ((c->sb.features & f) == f)
+ return 0;
+
+ mutex_lock(&c->sb_lock);
+
+ if ((c->sb.features & f) == f) {
+ mutex_unlock(&c->sb_lock);
+ return 0;
+ }
+
+ ret = __bch2_fs_compress_init(c, c->sb.features|f);
+ if (ret) {
+ mutex_unlock(&c->sb_lock);
+ return ret;
+ }
+
+ c->disk_sb.sb->features[0] |= cpu_to_le64(f);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+}
+
+int bch2_check_set_has_compressed_data(struct bch_fs *c,
+ unsigned compression_opt)
+{
+ unsigned compression_type = bch2_compression_decode(compression_opt).type;
+
+ BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));
+
+ return compression_type
+ ? __bch2_check_set_has_compressed_data(c,
+ 1ULL << bch2_compression_opt_to_feature[compression_type])
+ : 0;
+}
+
+void bch2_fs_compress_exit(struct bch_fs *c)
+{
+ unsigned i;
+
+ mempool_exit(&c->decompress_workspace);
+ for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
+ mempool_exit(&c->compress_workspace[i]);
+ mempool_exit(&c->compression_bounce[WRITE]);
+ mempool_exit(&c->compression_bounce[READ]);
+}
+
+static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
+{
+ size_t decompress_workspace_size = 0;
+ ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
+ c->opts.encoded_extent_max);
+
+ /*
+ * ZSTD is lying: if we allocate the size of the workspace it says it
+ * requires, it returns memory allocation errors
+ */
+ c->zstd_workspace_size = zstd_cctx_workspace_bound(&params.cParams);
+
+ struct {
+ unsigned feature;
+ enum bch_compression_type type;
+ size_t compress_workspace;
+ size_t decompress_workspace;
+ } compression_types[] = {
+ { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4,
+ max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS),
+ 0 },
+ { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip,
+ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
+ zlib_inflate_workspacesize(), },
+ { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd,
+ c->zstd_workspace_size,
+ zstd_dctx_workspace_bound() },
+ }, *i;
+ bool have_compressed = false;
+
+ for (i = compression_types;
+ i < compression_types + ARRAY_SIZE(compression_types);
+ i++)
+ have_compressed |= (features & (1 << i->feature)) != 0;
+
+ if (!have_compressed)
+ return 0;
+
+ if (!mempool_initialized(&c->compression_bounce[READ]) &&
+ mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
+ 1, c->opts.encoded_extent_max))
+ return -BCH_ERR_ENOMEM_compression_bounce_read_init;
+
+ if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
+ mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
+ 1, c->opts.encoded_extent_max))
+ return -BCH_ERR_ENOMEM_compression_bounce_write_init;
+
+ for (i = compression_types;
+ i < compression_types + ARRAY_SIZE(compression_types);
+ i++) {
+ decompress_workspace_size =
+ max(decompress_workspace_size, i->decompress_workspace);
+
+ if (!(features & (1 << i->feature)))
+ continue;
+
+ if (mempool_initialized(&c->compress_workspace[i->type]))
+ continue;
+
+ if (mempool_init_kvpmalloc_pool(
+ &c->compress_workspace[i->type],
+ 1, i->compress_workspace))
+ return -BCH_ERR_ENOMEM_compression_workspace_init;
+ }
+
+ if (!mempool_initialized(&c->decompress_workspace) &&
+ mempool_init_kvpmalloc_pool(&c->decompress_workspace,
+ 1, decompress_workspace_size))
+ return -BCH_ERR_ENOMEM_decompression_workspace_init;
+
+ return 0;
+}
+
+static u64 compression_opt_to_feature(unsigned v)
+{
+ unsigned type = bch2_compression_decode(v).type;
+
+ return BIT_ULL(bch2_compression_opt_to_feature[type]);
+}
+
+int bch2_fs_compress_init(struct bch_fs *c)
+{
+ u64 f = c->sb.features;
+
+ f |= compression_opt_to_feature(c->opts.compression);
+ f |= compression_opt_to_feature(c->opts.background_compression);
+
+ return __bch2_fs_compress_init(c, f);
+}
+
+int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res,
+ struct printbuf *err)
+{
+ char *val = kstrdup(_val, GFP_KERNEL);
+ char *p = val, *type_str, *level_str;
+ struct bch_compression_opt opt = { 0 };
+ int ret;
+
+ if (!val)
+ return -ENOMEM;
+
+ type_str = strsep(&p, ":");
+ level_str = p;
+
+ ret = match_string(bch2_compression_opts, -1, type_str);
+ if (ret < 0 && err)
+ prt_str(err, "invalid compression type");
+ if (ret < 0)
+ goto err;
+
+ opt.type = ret;
+
+ if (level_str) {
+ unsigned level;
+
+ ret = kstrtouint(level_str, 10, &level);
+ if (!ret && !opt.type && level)
+ ret = -EINVAL;
+ if (!ret && level > 15)
+ ret = -EINVAL;
+ if (ret < 0 && err)
+ prt_str(err, "invalid compression level");
+ if (ret < 0)
+ goto err;
+
+ opt.level = level;
+ }
+
+ *res = bch2_compression_encode(opt);
+err:
+ kfree(val);
+ return ret;
+}
+
+void bch2_compression_opt_to_text(struct printbuf *out, u64 v)
+{
+ struct bch_compression_opt opt = bch2_compression_decode(v);
+
+ if (opt.type < BCH_COMPRESSION_OPT_NR)
+ prt_str(out, bch2_compression_opts[opt.type]);
+ else
+ prt_printf(out, "(unknown compression opt %u)", opt.type);
+ if (opt.level)
+ prt_printf(out, ":%u", opt.level);
+}
+
+void bch2_opt_compression_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_sb *sb,
+ u64 v)
+{
+ return bch2_compression_opt_to_text(out, v);
+}
+
+int bch2_opt_compression_validate(u64 v, struct printbuf *err)
+{
+ if (!bch2_compression_opt_valid(v)) {
+ prt_printf(err, "invalid compression opt %llu", v);
+ return -BCH_ERR_invalid_sb_opt_compression;
+ }
+
+ return 0;
+}
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
new file mode 100644
index 000000000000..607fd5e232c9
--- /dev/null
+++ b/fs/bcachefs/compress.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_COMPRESS_H
+#define _BCACHEFS_COMPRESS_H
+
+#include "extents_types.h"
+
+static const unsigned __bch2_compression_opt_to_type[] = {
+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
+ BCH_COMPRESSION_OPTS()
+#undef x
+};
+
+struct bch_compression_opt {
+ u8 type:4,
+ level:4;
+};
+
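+/*
+ * A compression option is packed into 8 bits: the low 4 bits are the type,
+ * the high 4 bits the level:
+ */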
+static inline struct bch_compression_opt __bch2_compression_decode(unsigned v)
+{
+ return (struct bch_compression_opt) {
+ .type = v & 15,
+ .level = v >> 4,
+ };
+}
+
+static inline bool bch2_compression_opt_valid(unsigned v)
+{
+ struct bch_compression_opt opt = __bch2_compression_decode(v);
+
+ return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level);
+}
+
+static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
+{
+ return bch2_compression_opt_valid(v)
+ ? __bch2_compression_decode(v)
+ : (struct bch_compression_opt) { 0 };
+}
+
+static inline unsigned bch2_compression_encode(struct bch_compression_opt opt)
+{
+ return opt.type | (opt.level << 4);
+}
+
+static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
+{
+ return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
+}
+
+int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
+ struct bch_extent_crc_unpacked *);
+int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
+ struct bvec_iter, struct bch_extent_crc_unpacked);
+unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
+ struct bio *, size_t *, unsigned);
+
+int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
+void bch2_fs_compress_exit(struct bch_fs *);
+int bch2_fs_compress_init(struct bch_fs *);
+
+void bch2_compression_opt_to_text(struct printbuf *, u64);
+
+int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
+void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+int bch2_opt_compression_validate(u64, struct printbuf *);
+
+#define bch2_opt_compression (struct bch_opt_fn) { \
+ .parse = bch2_opt_compression_parse, \
+ .to_text = bch2_opt_compression_to_text, \
+ .validate = bch2_opt_compression_validate, \
+}
+
+#endif /* _BCACHEFS_COMPRESS_H */
diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/counters.c
new file mode 100644
index 000000000000..02a996e06a64
--- /dev/null
+++ b/fs/bcachefs/counters.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "super-io.h"
+#include "counters.h"
+
+/* BCH_SB_FIELD_counters */
+
+static const char * const bch2_counter_names[] = {
+#define x(t, n, ...) (#t),
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ NULL
+};
+
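+/* Number of counter slots in the (variable length) superblock field: */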
+static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs)
+{
+ if (!ctrs)
+ return 0;
+
+ return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0];
+}
+
+static int bch2_sb_counters_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ return 0;
+}
+
+static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_counters *ctrs = field_to_type(f, counters);
+ unsigned int i;
+ unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+
+ for (i = 0; i < nr; i++) {
+ if (i < BCH_COUNTER_NR)
+ prt_printf(out, "%s ", bch2_counter_names[i]);
+ else
+ prt_printf(out, "(unknown)");
+
+ prt_tab(out);
+ prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i]));
+ prt_newline(out);
+ }
+}
+
+int bch2_sb_counters_to_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
+ unsigned int i;
+ unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+ u64 val = 0;
+
+ for (i = 0; i < BCH_COUNTER_NR; i++)
+ c->counters_on_mount[i] = 0;
+
+ for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) {
+ val = le64_to_cpu(ctrs->d[i]);
+ percpu_u64_set(&c->counters[i], val);
+ c->counters_on_mount[i] = val;
+ }
+ return 0;
+}
+
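+/*
+ * Write the current in-memory percpu counters back into the superblock field,
+ * growing it to hold BCH_COUNTER_NR entries if necessary:
+ */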
+int bch2_sb_counters_from_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
+ struct bch_sb_field_counters *ret;
+ unsigned int i;
+ unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+
+ if (nr < BCH_COUNTER_NR) {
+ ret = bch2_sb_field_resize(&c->disk_sb, counters,
+ sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
+
+ if (ret) {
+ ctrs = ret;
+ nr = bch2_sb_counter_nr_entries(ctrs);
+ }
+ }
+
+ for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++)
+ ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
+ return 0;
+}
+
+void bch2_fs_counters_exit(struct bch_fs *c)
+{
+ free_percpu(c->counters);
+}
+
+int bch2_fs_counters_init(struct bch_fs *c)
+{
+ c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64));
+ if (!c->counters)
+ return -BCH_ERR_ENOMEM_fs_counters_init;
+
+ return bch2_sb_counters_to_cpu(c);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_counters = {
+ .validate = bch2_sb_counters_validate,
+ .to_text = bch2_sb_counters_to_text,
+};
diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/counters.h
new file mode 100644
index 000000000000..4778aa19bf34
--- /dev/null
+++ b/fs/bcachefs/counters.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_COUNTERS_H
+#define _BCACHEFS_COUNTERS_H
+
+#include "bcachefs.h"
+#include "super-io.h"
+
+
+int bch2_sb_counters_to_cpu(struct bch_fs *);
+int bch2_sb_counters_from_cpu(struct bch_fs *);
+
+void bch2_fs_counters_exit(struct bch_fs *);
+int bch2_fs_counters_init(struct bch_fs *);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
+
+#endif /* _BCACHEFS_COUNTERS_H */
diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c
new file mode 100644
index 000000000000..ac35b8b705ae
--- /dev/null
+++ b/fs/bcachefs/darray.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/log2.h>
+#include <linux/slab.h>
+#include "darray.h"
+
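+/*
+ * Grow the backing allocation to at least new_size elements, rounded up to a
+ * power of two; existing elements are copied over, and the old buffer is
+ * freed unless it was the inline preallocated storage:
+ */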
+int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
+{
+ if (new_size > d->size) {
+ new_size = roundup_pow_of_two(new_size);
+
+ void *data = kvmalloc_array(new_size, element_size, gfp);
+ if (!data)
+ return -ENOMEM;
+
+ memcpy(data, d->data, d->size * element_size);
+ if (d->data != d->preallocated)
+ kvfree(d->data);
+ d->data = data;
+ d->size = new_size;
+ }
+
+ return 0;
+}
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
new file mode 100644
index 000000000000..e367c625f057
--- /dev/null
+++ b/fs/bcachefs/darray.h
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DARRAY_H
+#define _BCACHEFS_DARRAY_H
+
+/*
+ * Dynamic arrays:
+ *
+ * Inspired by CCAN's darray
+ */
+
+#include <linux/slab.h>
+
+#define DARRAY_PREALLOCATED(_type, _nr) \
+struct { \
+ size_t nr, size; \
+ _type *data; \
+ _type preallocated[_nr]; \
+}
+
+#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
+
+typedef DARRAY(char) darray_char;
+
+int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t);
+
+static inline int __darray_resize(darray_char *d, size_t element_size,
+ size_t new_size, gfp_t gfp)
+{
+ return unlikely(new_size > d->size)
+ ? __bch2_darray_resize(d, element_size, new_size, gfp)
+ : 0;
+}
+
+#define darray_resize_gfp(_d, _new_size, _gfp) \
+ unlikely(__darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp))
+
+#define darray_resize(_d, _new_size) \
+ darray_resize_gfp(_d, _new_size, GFP_KERNEL)
+
+static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, gfp_t gfp)
+{
+ return __darray_resize(d, t_size, d->nr + more, gfp);
+}
+
+#define darray_make_room_gfp(_d, _more, _gfp) \
+ __darray_make_room((darray_char *) (_d), sizeof((_d)->data[0]), (_more), _gfp)
+
+#define darray_make_room(_d, _more) \
+ darray_make_room_gfp(_d, _more, GFP_KERNEL)
+
+#define darray_room(_d) ((_d).size - (_d).nr)
+
+#define darray_top(_d) ((_d).data[(_d).nr])
+
+#define darray_push_gfp(_d, _item, _gfp) \
+({ \
+ int _ret = darray_make_room_gfp((_d), 1, _gfp); \
+ \
+ if (!_ret) \
+ (_d)->data[(_d)->nr++] = (_item); \
+ _ret; \
+})
+
+#define darray_push(_d, _item) darray_push_gfp(_d, _item, GFP_KERNEL)
+
+#define darray_pop(_d) ((_d)->data[--(_d)->nr])
+
+#define darray_first(_d) ((_d).data[0])
+#define darray_last(_d) ((_d).data[(_d).nr - 1])
+
+#define darray_insert_item(_d, pos, _item) \
+({ \
+ size_t _pos = (pos); \
+ int _ret = darray_make_room((_d), 1); \
+ \
+ if (!_ret) \
+ array_insert_item((_d)->data, (_d)->nr, _pos, (_item)); \
+ _ret; \
+})
+
+#define darray_remove_item(_d, _pos) \
+ array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
+
+#define darray_for_each(_d, _i) \
+ for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++)
+
+#define darray_for_each_reverse(_d, _i) \
+ for (_i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i)
+
+#define darray_init(_d) \
+do { \
+ (_d)->nr = 0; \
+ (_d)->size = ARRAY_SIZE((_d)->preallocated); \
+ (_d)->data = (_d)->size ? (_d)->preallocated : NULL; \
+} while (0)
+
+#define darray_exit(_d) \
+do { \
+ if (!ARRAY_SIZE((_d)->preallocated) || \
+ (_d)->data != (_d)->preallocated) \
+ kvfree((_d)->data); \
+ darray_init(_d); \
+} while (0)
+
+#endif /* _BCACHEFS_DARRAY_H */
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
new file mode 100644
index 000000000000..37d6ecae8c30
--- /dev/null
+++ b/fs/bcachefs/data_update.c
@@ -0,0 +1,653 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "data_update.h"
+#include "ec.h"
+#include "error.h"
+#include "extents.h"
+#include "io_write.h"
+#include "keylist.h"
+#include "move.h"
+#include "nocow_locking.h"
+#include "rebalance.h"
+#include "subvolume.h"
+#include "trace.h"
+
+static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k)
+{
+ if (trace_move_extent_finish_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ trace_move_extent_finish(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+
+static void trace_move_extent_fail2(struct data_update *m,
+ struct bkey_s_c new,
+ struct bkey_s_c wrote,
+ struct bkey_i *insert,
+ const char *msg)
+{
+ struct bch_fs *c = m->op.c;
+ struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
+ const union bch_extent_entry *entry;
+ struct bch_extent_ptr *ptr;
+ struct extent_ptr_decoded p;
+ struct printbuf buf = PRINTBUF;
+ unsigned i, rewrites_found = 0;
+
+ if (!trace_move_extent_fail_enabled())
+ return;
+
+ prt_str(&buf, msg);
+
+ if (insert) {
+ i = 0;
+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
+ if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+ (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
+ !ptr->cached)
+ rewrites_found |= 1U << i;
+ i++;
+ }
+ }
+
+ prt_printf(&buf, "\nrewrite ptrs: %u%u%u%u",
+ (m->data_opts.rewrite_ptrs & (1 << 0)) != 0,
+ (m->data_opts.rewrite_ptrs & (1 << 1)) != 0,
+ (m->data_opts.rewrite_ptrs & (1 << 2)) != 0,
+ (m->data_opts.rewrite_ptrs & (1 << 3)) != 0);
+
+ prt_printf(&buf, "\nrewrites found: %u%u%u%u",
+ (rewrites_found & (1 << 0)) != 0,
+ (rewrites_found & (1 << 1)) != 0,
+ (rewrites_found & (1 << 2)) != 0,
+ (rewrites_found & (1 << 3)) != 0);
+
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, new);
+
+ prt_str(&buf, "\nwrote: ");
+ bch2_bkey_val_to_text(&buf, c, wrote);
+
+ if (insert) {
+ prt_str(&buf, "\ninsert: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+ }
+
+ trace_move_extent_fail(c, buf.buf);
+ printbuf_exit(&buf);
+}
+
+static int __bch2_data_update_index_update(struct btree_trans *trans,
+ struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct btree_iter iter;
+ struct data_update *m =
+ container_of(op, struct data_update, op);
+ struct keylist *keys = &op->insert_keys;
+ struct bkey_buf _new, _insert;
+ int ret = 0;
+
+ bch2_bkey_buf_init(&_new);
+ bch2_bkey_buf_init(&_insert);
+ bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
+
+ bch2_trans_iter_init(trans, &iter, m->btree_id,
+ bkey_start_pos(&bch2_keylist_front(keys)->k),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+ while (1) {
+ struct bkey_s_c k;
+ struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
+ struct bkey_i *insert = NULL;
+ struct bkey_i_extent *new;
+ const union bch_extent_entry *entry_c;
+ union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct bch_extent_ptr *ptr;
+ const struct bch_extent_ptr *ptr_c;
+ struct bpos next_pos;
+ bool should_check_enospc;
+ s64 i_sectors_delta = 0, disk_sectors_delta = 0;
+ unsigned rewrites_found = 0, durability, i;
+
+ bch2_trans_begin(trans);
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ new = bkey_i_to_extent(bch2_keylist_front(keys));
+
+ if (!bch2_extents_match(k, old)) {
+ trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i),
+ NULL, "no match:");
+ goto nowork;
+ }
+
+ bkey_reassemble(_insert.k, k);
+ insert = _insert.k;
+
+ bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
+ new = bkey_i_to_extent(_new.k);
+ bch2_cut_front(iter.pos, &new->k_i);
+
+ bch2_cut_front(iter.pos, insert);
+ bch2_cut_back(new->k.p, insert);
+ bch2_cut_back(insert->k.p, &new->k_i);
+
+ /*
+ * @old: extent that we read from
+ * @insert: key that we're going to update, initialized from
+ * extent currently in btree - same as @old unless we raced with
+ * other updates
+ * @new: extent with new pointers that we'll be adding to @insert
+ *
+ * First, drop rewrite_ptrs from @new:
+ */
+ i = 0;
+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
+ if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+ (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
+ !ptr->cached) {
+ bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
+ rewrites_found |= 1U << i;
+ }
+ i++;
+ }
+
+ if (m->data_opts.rewrite_ptrs &&
+ !rewrites_found &&
+ bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
+ trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
+ goto nowork;
+ }
+
+ /*
+ * A replica that we just wrote might conflict with a replica
+ * that we want to keep, due to racing with another move:
+ */
+restart_drop_conflicting_replicas:
+ extent_for_each_ptr(extent_i_to_s(new), ptr)
+ if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) &&
+ !ptr_c->cached) {
+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr);
+ goto restart_drop_conflicting_replicas;
+ }
+
+ if (!bkey_val_u64s(&new->k)) {
+ trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
+ goto nowork;
+ }
+
+ /* Now, drop pointers that conflict with what we just wrote: */
+ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
+ if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev)))
+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
+
+ durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) +
+ bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));
+
+ /* Now, drop excess replicas: */
+restart_drop_extra_replicas:
+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
+ unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
+
+ if (!p.ptr.cached &&
+ durability - ptr_durability >= m->op.opts.data_replicas) {
+ durability -= ptr_durability;
+
+ bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
+ goto restart_drop_extra_replicas;
+ }
+ }
+
+ /* Finally, add the pointers we just wrote: */
+ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
+ bch2_extent_ptr_decoded_append(insert, &p);
+
+ bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
+ bch2_extent_normalize(c, bkey_i_to_s(insert));
+
+ ret = bch2_sum_sector_overwrites(trans, &iter, insert,
+ &should_check_enospc,
+ &i_sectors_delta,
+ &disk_sectors_delta);
+ if (ret)
+ goto err;
+
+ if (disk_sectors_delta > (s64) op->res.sectors) {
+ ret = bch2_disk_reservation_add(c, &op->res,
+ disk_sectors_delta - op->res.sectors,
+ !should_check_enospc
+ ? BCH_DISK_RESERVATION_NOFAIL : 0);
+ if (ret)
+ goto out;
+ }
+
+ next_pos = insert->k.p;
+
+ /*
+ * Check for nonce offset inconsistency:
+ * This is debug code - we've been seeing this bug rarely, and
+ * it's been hard to reproduce, so this should give us some more
+ * information when it does occur:
+ */
+ struct printbuf err = PRINTBUF;
+ int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err);
+ printbuf_exit(&err);
+
+ if (invalid) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "about to insert invalid key in data update path");
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+ prt_str(&buf, "\nk: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+
+ bch2_fatal_error(c);
+ goto out;
+ }
+
+ ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
+ k.k->p, bkey_start_pos(&insert->k)) ?:
+ bch2_insert_snapshot_whiteouts(trans, m->btree_id,
+ k.k->p, insert->k.p) ?:
+ bch2_bkey_set_needs_rebalance(c, insert,
+ op->opts.background_target,
+ op->opts.background_compression) ?:
+ bch2_trans_update(trans, &iter, insert,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_commit(trans, &op->res,
+ NULL,
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL|
+ m->data_opts.btree_insert_flags);
+ if (!ret) {
+ bch2_btree_iter_set_pos(&iter, next_pos);
+
+ this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size);
+ trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i));
+ }
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
+ if (ret)
+ break;
+next:
+ while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) {
+ bch2_keylist_pop_front(keys);
+ if (bch2_keylist_empty(keys))
+ goto out;
+ }
+ continue;
+nowork:
+ if (m->stats) {
+ BUG_ON(k.k->p.offset <= iter.pos.offset);
+ atomic64_inc(&m->stats->keys_raced);
+ atomic64_add(k.k->p.offset - iter.pos.offset,
+ &m->stats->sectors_raced);
+ }
+
+ this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]);
+
+ bch2_btree_iter_advance(&iter);
+ goto next;
+ }
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_bkey_buf_exit(&_insert, c);
+ bch2_bkey_buf_exit(&_new, c);
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
+ return ret;
+}
+
+int bch2_data_update_index_update(struct bch_write_op *op)
+{
+ return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
+}
+
+void bch2_data_update_read_done(struct data_update *m,
+ struct bch_extent_crc_unpacked crc)
+{
+ /* write bio must own pages: */
+ BUG_ON(!m->op.wbio.bio.bi_vcnt);
+
+ m->op.crc = crc;
+ m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9;
+
+ closure_call(&m->op.cl, bch2_write, NULL, NULL);
+}
+
+void bch2_data_update_exit(struct data_update *update)
+{
+ struct bch_fs *c = update->op.c;
+ struct bkey_ptrs_c ptrs =
+ bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k));
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (c->opts.nocow_enabled)
+ bch2_bucket_nocow_unlock(&c->nocow_locks,
+ PTR_BUCKET_POS(c, ptr), 0);
+ percpu_ref_put(&bch_dev_bkey_exists(c, ptr->dev)->ref);
+ }
+
+ bch2_bkey_buf_exit(&update->k, c);
+ bch2_disk_reservation_put(c, &update->op.res);
+ bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
+}
+
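+/*
+ * Unwritten extents have no data to read or write: just allocate new space
+ * and insert new unwritten pointers through the normal index update path:
+ */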
+static void bch2_update_unwritten_extent(struct btree_trans *trans,
+ struct data_update *update)
+{
+ struct bch_fs *c = update->op.c;
+ struct bio *bio = &update->op.wbio.bio;
+ struct bkey_i_extent *e;
+ struct write_point *wp;
+ struct bch_extent_ptr *ptr;
+ struct closure cl;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ closure_init_stack(&cl);
+ bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys);
+
+ while (bio_sectors(bio)) {
+ unsigned sectors = bio_sectors(bio);
+
+ bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
+ BTREE_ITER_SLOTS);
+ ret = lockrestart_do(trans, ({
+ k = bch2_btree_iter_peek_slot(&iter);
+ bkey_err(k);
+ }));
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k)))
+ break;
+
+ e = bkey_extent_init(update->op.insert_keys.top);
+ e->k.p = update->op.pos;
+
+ ret = bch2_alloc_sectors_start_trans(trans,
+ update->op.target,
+ false,
+ update->op.write_point,
+ &update->op.devs_have,
+ update->op.nr_replicas,
+ update->op.nr_replicas,
+ update->op.watermark,
+ 0, &cl, &wp);
+ if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) {
+ bch2_trans_unlock(trans);
+ closure_sync(&cl);
+ continue;
+ }
+
+ if (ret)
+ return;
+
+ sectors = min(sectors, wp->sectors_free);
+
+ bch2_key_resize(&e->k, sectors);
+
+ bch2_open_bucket_get(c, wp, &update->op.open_buckets);
+ bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
+ bch2_alloc_sectors_done(c, wp);
+
+ bio_advance(bio, sectors << 9);
+ update->op.pos.offset += sectors;
+
+ extent_for_each_ptr(extent_i_to_s(e), ptr)
+ ptr->unwritten = true;
+ bch2_keylist_push(&update->op.insert_keys);
+
+ ret = __bch2_data_update_index_update(trans, &update->op);
+
+ bch2_open_buckets_put(c, &update->op.open_buckets);
+
+ if (ret)
+ break;
+ }
+
+ if (closure_nr_remaining(&cl) != 1) {
+ bch2_trans_unlock(trans);
+ closure_sync(&cl);
+ }
+}
+
+int bch2_extent_drop_ptrs(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct data_update_opts data_opts)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i *n;
+ int ret;
+
+ n = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ while (data_opts.kill_ptrs) {
+ unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
+ struct bch_extent_ptr *ptr;
+
+ bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
+ data_opts.kill_ptrs ^= 1U << drop;
+ }
+
+ /*
+ * If the new extent no longer has any pointers, bch2_extent_normalize()
+ * will do the appropriate thing with it (turning it into a
+ * KEY_TYPE_error key, or just a discard if it was a cached extent)
+ */
+ bch2_extent_normalize(c, bkey_i_to_s(n));
+
+ /*
+ * Since we're not inserting through an extent iterator
+ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+ * we aren't using the extent overwrite path to delete, we're
+ * just using the normal key deletion path:
+ */
+ if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ n->k.size = 0;
+
+ return bch2_trans_relock(trans) ?:
+ bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+}
+
+int bch2_data_update_init(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct moving_context *ctxt,
+ struct data_update *m,
+ struct write_point_specifier wp,
+ struct bch_io_opts io_opts,
+ struct data_update_opts data_opts,
+ enum btree_id btree_id,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ const struct bch_extent_ptr *ptr;
+ unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
+ unsigned ptrs_locked = 0;
+ int ret = 0;
+
+ bch2_bkey_buf_init(&m->k);
+ bch2_bkey_buf_reassemble(&m->k, c, k);
+ m->btree_id = btree_id;
+ m->data_opts = data_opts;
+ m->ctxt = ctxt;
+ m->stats = ctxt ? ctxt->stats : NULL;
+
+ bch2_write_op_init(&m->op, c, io_opts);
+ m->op.pos = bkey_start_pos(k.k);
+ m->op.version = k.k->version;
+ m->op.target = data_opts.target;
+ m->op.write_point = wp;
+ m->op.nr_replicas = 0;
+ m->op.flags |= BCH_WRITE_PAGES_STABLE|
+ BCH_WRITE_PAGES_OWNED|
+ BCH_WRITE_DATA_ENCODED|
+ BCH_WRITE_MOVE|
+ m->data_opts.write_flags;
+ m->op.compression_opt = io_opts.background_compression ?: io_opts.compression;
+ m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref);
+
+ unsigned durability_have = 0, durability_removing = 0;
+
+ i = 0;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ bool locked;
+
+ if (((1U << i) & m->data_opts.rewrite_ptrs)) {
+ BUG_ON(p.ptr.cached);
+
+ if (crc_is_compressed(p.crc))
+ reserve_sectors += k.k->size;
+
+ m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
+ durability_removing += bch2_extent_ptr_desired_durability(c, &p);
+ } else if (!p.ptr.cached &&
+ !((1U << i) & m->data_opts.kill_ptrs)) {
+ bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
+ durability_have += bch2_extent_ptr_durability(c, &p);
+ }
+
+ /*
+ * op->csum_type is normally initialized from the fs/file's
+ * current options - but if an extent is encrypted, we require
+ * that it stays encrypted:
+ */
+ if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
+ m->op.nonce = p.crc.nonce + p.crc.offset;
+ m->op.csum_type = p.crc.csum_type;
+ }
+
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
+ m->op.incompressible = true;
+
+ if (c->opts.nocow_enabled) {
+ if (ctxt) {
+ move_ctxt_wait_event(ctxt,
+ (locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
+ PTR_BUCKET_POS(c, &p.ptr), 0)) ||
+ (!atomic_read(&ctxt->read_sectors) &&
+ !atomic_read(&ctxt->write_sectors)));
+
+ if (!locked)
+ bch2_bucket_nocow_lock(&c->nocow_locks,
+ PTR_BUCKET_POS(c, &p.ptr), 0);
+ } else {
+ if (!bch2_bucket_nocow_trylock(&c->nocow_locks,
+ PTR_BUCKET_POS(c, &p.ptr), 0)) {
+ ret = -BCH_ERR_nocow_lock_blocked;
+ goto err;
+ }
+ }
+ ptrs_locked |= (1U << i);
+ }
+
+ i++;
+ }
+
+ /*
+ * If current extent durability is less than io_opts.data_replicas,
+ * we're not trying to rereplicate the extent up to data_replicas here -
+ * unless extra_replicas was specified
+ *
+ * Increasing replication is an explicit operation triggered by
+ * rereplicate, currently, so that users don't get an unexpected -ENOSPC
+ */
+ if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) &&
+ durability_have >= io_opts.data_replicas) {
+ m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
+ m->data_opts.rewrite_ptrs = 0;
+ /* if iter == NULL, it's just a promote */
+ if (iter)
+ ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts);
+ goto done;
+ }
+
+ m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) +
+ m->data_opts.extra_replicas;
+ m->op.nr_replicas_required = m->op.nr_replicas;
+
+ BUG_ON(!m->op.nr_replicas);
+
+ if (reserve_sectors) {
+ ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
+ m->data_opts.extra_replicas
+ ? 0
+ : BCH_DISK_RESERVATION_NOFAIL);
+ if (ret)
+ goto err;
+ }
+
+ if (bkey_extent_is_unwritten(k)) {
+ bch2_update_unwritten_extent(trans, m);
+ goto done;
+ }
+
+ return 0;
+err:
+ i = 0;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if ((1U << i) & ptrs_locked)
+ bch2_bucket_nocow_unlock(&c->nocow_locks,
+ PTR_BUCKET_POS(c, &p.ptr), 0);
+ percpu_ref_put(&bch_dev_bkey_exists(c, p.ptr.dev)->ref);
+ i++;
+ }
+
+ bch2_bkey_buf_exit(&m->k, c);
+ bch2_bio_free_pages_pool(c, &m->op.wbio.bio);
+ return ret;
+done:
+ bch2_data_update_exit(m);
+ return ret ?: -BCH_ERR_data_update_done;
+}
+
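+/*
+ * Cached pointers can't be rewritten, only dropped: move any cached pointers
+ * in rewrite_ptrs over to kill_ptrs:
+ */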
+void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+ unsigned i = 0;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) {
+ opts->kill_ptrs |= 1U << i;
+ opts->rewrite_ptrs ^= 1U << i;
+ }
+
+ i++;
+ }
+}
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
new file mode 100644
index 000000000000..991095bbd469
--- /dev/null
+++ b/fs/bcachefs/data_update.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _BCACHEFS_DATA_UPDATE_H
+#define _BCACHEFS_DATA_UPDATE_H
+
+#include "bkey_buf.h"
+#include "io_write_types.h"
+
+struct moving_context;
+
+struct data_update_opts {
+ unsigned rewrite_ptrs;
+ unsigned kill_ptrs;
+ u16 target;
+ u8 extra_replicas;
+ unsigned btree_insert_flags;
+ unsigned write_flags;
+};
+
+struct data_update {
+ /* extent being updated: */
+ enum btree_id btree_id;
+ struct bkey_buf k;
+ struct data_update_opts data_opts;
+ struct moving_context *ctxt;
+ struct bch_move_stats *stats;
+ struct bch_write_op op;
+};
+
+int bch2_data_update_index_update(struct bch_write_op *);
+
+void bch2_data_update_read_done(struct data_update *,
+ struct bch_extent_crc_unpacked);
+
+int bch2_extent_drop_ptrs(struct btree_trans *,
+ struct btree_iter *,
+ struct bkey_s_c,
+ struct data_update_opts);
+
+void bch2_data_update_exit(struct data_update *);
+int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
+ struct moving_context *,
+ struct data_update *,
+ struct write_point_specifier,
+ struct bch_io_opts, struct data_update_opts,
+ enum btree_id, struct bkey_s_c);
+void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
+
+#endif /* _BCACHEFS_DATA_UPDATE_H */
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
new file mode 100644
index 000000000000..57c5128db173
--- /dev/null
+++ b/fs/bcachefs/debug.c
@@ -0,0 +1,954 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Assorted bcachefs debug code
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "debug.h"
+#include "error.h"
+#include "extents.h"
+#include "fsck.h"
+#include "inode.h"
+#include "super.h"
+
+#include <linux/console.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+
+static struct dentry *bch_debug;
+
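+/*
+ * Read a btree node back from a single replica and compare it against the
+ * in-memory version; returns true if verification failed:
+ */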
+static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
+ struct extent_ptr_decoded pick)
+{
+ struct btree *v = c->verify_data;
+ struct btree_node *n_ondisk = c->verify_ondisk;
+ struct btree_node *n_sorted = c->verify_data->data;
+ struct bset *sorted, *inmemory = &b->data->keys;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ struct bio *bio;
+ bool failed = false, saw_error = false;
+
+ if (!bch2_dev_get_ioref(ca, READ))
+ return false;
+
+ bio = bio_alloc_bioset(ca->disk_sb.bdev,
+ buf_pages(n_sorted, btree_bytes(c)),
+ REQ_OP_READ|REQ_META,
+ GFP_NOFS,
+ &c->btree_bio);
+ bio->bi_iter.bi_sector = pick.ptr.offset;
+ bch2_bio_map(bio, n_sorted, btree_bytes(c));
+
+ submit_bio_wait(bio);
+
+ bio_put(bio);
+ percpu_ref_put(&ca->io_ref);
+
+ memcpy(n_ondisk, n_sorted, btree_bytes(c));
+
+ v->written = 0;
+ if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error)
+ return false;
+
+ n_sorted = c->verify_data->data;
+ sorted = &n_sorted->keys;
+
+ if (inmemory->u64s != sorted->u64s ||
+ memcmp(inmemory->start,
+ sorted->start,
+ vstruct_end(inmemory) - (void *) inmemory->start)) {
+ unsigned offset = 0, sectors;
+ struct bset *i;
+ unsigned j;
+
+ console_lock();
+
+ printk(KERN_ERR "*** in memory:\n");
+ bch2_dump_bset(c, b, inmemory, 0);
+
+ printk(KERN_ERR "*** read back in:\n");
+ bch2_dump_bset(c, v, sorted, 0);
+
+ while (offset < v->written) {
+ if (!offset) {
+ i = &n_ondisk->keys;
+ sectors = vstruct_blocks(n_ondisk, c->block_bits) <<
+ c->block_bits;
+ } else {
+ struct btree_node_entry *bne =
+ (void *) n_ondisk + (offset << 9);
+ i = &bne->keys;
+
+ sectors = vstruct_blocks(bne, c->block_bits) <<
+ c->block_bits;
+ }
+
+ printk(KERN_ERR "*** on disk block %u:\n", offset);
+ bch2_dump_bset(c, b, i, offset);
+
+ offset += sectors;
+ }
+
+ for (j = 0; j < le16_to_cpu(inmemory->u64s); j++)
+ if (inmemory->_data[j] != sorted->_data[j])
+ break;
+
+ console_unlock();
+ bch_err(c, "verify failed at key %u", j);
+
+ failed = true;
+ }
+
+ if (v->written != b->written) {
+ bch_err(c, "written wrong: expected %u, got %u",
+ b->written, v->written);
+ failed = true;
+ }
+
+ return failed;
+}
+
+void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
+{
+ struct bkey_ptrs_c ptrs;
+ struct extent_ptr_decoded p;
+ const union bch_extent_entry *entry;
+ struct btree *v;
+ struct bset *inmemory = &b->data->keys;
+ struct bkey_packed *k;
+ bool failed = false;
+
+ if (c->opts.nochanges)
+ return;
+
+ bch2_btree_node_io_lock(b);
+ mutex_lock(&c->verify_lock);
+
+ if (!c->verify_ondisk) {
+ c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+ if (!c->verify_ondisk)
+ goto out;
+ }
+
+ if (!c->verify_data) {
+ c->verify_data = __bch2_btree_node_mem_alloc(c);
+ if (!c->verify_data)
+ goto out;
+
+ list_del_init(&c->verify_data->list);
+ }
+
+ BUG_ON(b->nsets != 1);
+
+ for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k))
+ if (k->type == KEY_TYPE_btree_ptr_v2)
+ ((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0;
+
+ v = c->verify_data;
+ bkey_copy(&v->key, &b->key);
+ v->c.level = b->c.level;
+ v->c.btree_id = b->c.btree_id;
+ bch2_btree_keys_init(v);
+
+ ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
+ bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry)
+ failed |= bch2_btree_verify_replica(c, b, p);
+
+ if (failed) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf);
+ printbuf_exit(&buf);
+ }
+out:
+ mutex_unlock(&c->verify_lock);
+ bch2_btree_node_io_unlock(b);
+}
+
+void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
+ const struct btree *b)
+{
+ struct btree_node *n_ondisk = NULL;
+ struct extent_ptr_decoded pick;
+ struct bch_dev *ca;
+ struct bio *bio = NULL;
+ unsigned offset = 0;
+ int ret;
+
+ if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) {
+ prt_printf(out, "error getting device to read from: invalid device\n");
+ return;
+ }
+
+ ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ if (!bch2_dev_get_ioref(ca, READ)) {
+ prt_printf(out, "error getting device to read from: not online\n");
+ return;
+ }
+
+ n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+ if (!n_ondisk) {
+ prt_printf(out, "memory allocation failure\n");
+ goto out;
+ }
+
+ bio = bio_alloc_bioset(ca->disk_sb.bdev,
+ buf_pages(n_ondisk, btree_bytes(c)),
+ REQ_OP_READ|REQ_META,
+ GFP_NOFS,
+ &c->btree_bio);
+ bio->bi_iter.bi_sector = pick.ptr.offset;
+ bch2_bio_map(bio, n_ondisk, btree_bytes(c));
+
+ ret = submit_bio_wait(bio);
+ if (ret) {
+ prt_printf(out, "IO error reading btree node: %s\n", bch2_err_str(ret));
+ goto out;
+ }
+
+ while (offset < btree_sectors(c)) {
+ struct bset *i;
+ struct nonce nonce;
+ struct bch_csum csum;
+ struct bkey_packed *k;
+ unsigned sectors;
+
+ if (!offset) {
+ i = &n_ondisk->keys;
+
+ if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) {
+ prt_printf(out, "unknown checksum type at offset %u: %llu\n",
+ offset, BSET_CSUM_TYPE(i));
+ goto out;
+ }
+
+ nonce = btree_nonce(i, offset << 9);
+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk);
+
+ if (bch2_crc_cmp(csum, n_ondisk->csum)) {
+ prt_printf(out, "invalid checksum\n");
+ goto out;
+ }
+
+ bset_encrypt(c, i, offset << 9);
+
+ sectors = vstruct_sectors(n_ondisk, c->block_bits);
+ } else {
+ struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9);
+
+ i = &bne->keys;
+
+ if (i->seq != n_ondisk->keys.seq)
+ break;
+
+ if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) {
+ prt_printf(out, "unknown checksum type at offset %u: %llu\n",
+ offset, BSET_CSUM_TYPE(i));
+ goto out;
+ }
+
+ nonce = btree_nonce(i, offset << 9);
+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+
+ if (bch2_crc_cmp(csum, bne->csum)) {
+ prt_printf(out, "invalid checksum");
+ goto out;
+ }
+
+ bset_encrypt(c, i, offset << 9);
+
+ sectors = vstruct_sectors(bne, c->block_bits);
+ }
+
+ prt_printf(out, " offset %u version %u, journal seq %llu\n",
+ offset,
+ le16_to_cpu(i->version),
+ le64_to_cpu(i->journal_seq));
+ offset += sectors;
+
+ printbuf_indent_add(out, 4);
+
+ for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) {
+ struct bkey u;
+
+ bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u));
+ prt_newline(out);
+ }
+
+ printbuf_indent_sub(out, 4);
+ }
+out:
+ if (bio)
+ bio_put(bio);
+ kvpfree(n_ondisk, btree_bytes(c));
+ percpu_ref_put(&ca->io_ref);
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+/* XXX: bch_fs refcounting */
+
+struct dump_iter {
+ struct bch_fs *c;
+ enum btree_id id;
+ struct bpos from;
+ struct bpos prev_node;
+ u64 iter;
+
+ struct printbuf buf;
+
+ char __user *ubuf; /* destination user buffer */
+ size_t size; /* size of requested read */
+ ssize_t ret; /* bytes read so far */
+};
+
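+/*
+ * Copy as much buffered output to userspace as will fit; returns 0 if the
+ * caller should keep generating output, i->ret once the user buffer is full,
+ * or -EFAULT if the copy failed:
+ */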
+static ssize_t flush_buf(struct dump_iter *i)
+{
+ if (i->buf.pos) {
+ size_t bytes = min_t(size_t, i->buf.pos, i->size);
+ int copied = bytes - copy_to_user(i->ubuf, i->buf.buf, bytes);
+
+ i->ret += copied;
+ i->ubuf += copied;
+ i->size -= copied;
+ i->buf.pos -= copied;
+ memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos);
+
+ if (copied != bytes)
+ return -EFAULT;
+ }
+
+ return i->size ? 0 : i->ret;
+}
+
+static int bch2_dump_open(struct inode *inode, struct file *file)
+{
+ struct btree_debug *bd = inode->i_private;
+ struct dump_iter *i;
+
+ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
+ if (!i)
+ return -ENOMEM;
+
+ file->private_data = i;
+ i->from = POS_MIN;
+ i->iter = 0;
+ i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]);
+ i->id = bd->id;
+ i->buf = PRINTBUF;
+
+ return 0;
+}
+
+static int bch2_dump_release(struct inode *inode, struct file *file)
+{
+ struct dump_iter *i = file->private_data;
+
+ printbuf_exit(&i->buf);
+ kfree(i);
+ return 0;
+}
+
+static ssize_t bch2_read_btree(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ ssize_t ret;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ ret = flush_buf(i);
+ if (ret)
+ return ret;
+
+ trans = bch2_trans_get(i->c);
+ ret = for_each_btree_key2(trans, iter, i->id, i->from,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ bch2_bkey_val_to_text(&i->buf, i->c, k);
+ prt_newline(&i->buf);
+ drop_locks_do(trans, flush_buf(i));
+ }));
+ i->from = iter.pos;
+
+ bch2_trans_put(trans);
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations btree_debug_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_read_btree,
+};
+
+static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct btree *b;
+ ssize_t ret;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ ret = flush_buf(i);
+ if (ret)
+ return ret;
+
+ if (bpos_eq(SPOS_MAX, i->from))
+ return i->ret;
+
+ trans = bch2_trans_get(i->c);
+retry:
+ bch2_trans_begin(trans);
+
+ for_each_btree_node(trans, iter, i->id, i->from, 0, b, ret) {
+ bch2_btree_node_to_text(&i->buf, i->c, b);
+ i->from = !bpos_eq(SPOS_MAX, b->key.k.p)
+ ? bpos_successor(b->key.k.p)
+ : b->key.k.p;
+
+ ret = drop_locks_do(trans, flush_buf(i));
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_put(trans);
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations btree_format_debug_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_read_btree_formats,
+};
+
+static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ ssize_t ret;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ ret = flush_buf(i);
+ if (ret)
+ return ret;
+
+ trans = bch2_trans_get(i->c);
+
+ ret = for_each_btree_key2(trans, iter, i->id, i->from,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ struct btree_path_level *l = &iter.path->l[0];
+ struct bkey_packed *_k =
+ bch2_btree_node_iter_peek(&l->iter, l->b);
+
+ if (bpos_gt(l->b->key.k.p, i->prev_node)) {
+ bch2_btree_node_to_text(&i->buf, i->c, l->b);
+ i->prev_node = l->b->key.k.p;
+ }
+
+ bch2_bfloat_to_text(&i->buf, l->b, _k);
+ drop_locks_do(trans, flush_buf(i));
+ }));
+ i->from = iter.pos;
+
+ bch2_trans_put(trans);
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations bfloat_failed_debug_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_read_bfloat_failed,
+};
+
+static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
+ struct btree *b)
+{
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
+
+ prt_printf(out, "%px btree=%s l=%u ",
+ b,
+ bch2_btree_id_str(b->c.btree_id),
+ b->c.level);
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+ prt_newline(out);
+
+ prt_printf(out, "flags: ");
+ prt_tab(out);
+ prt_bitflags(out, bch2_btree_node_flags, b->flags);
+ prt_newline(out);
+
+ prt_printf(out, "pcpu read locks: ");
+ prt_tab(out);
+ prt_printf(out, "%u", b->c.lock.readers != NULL);
+ prt_newline(out);
+
+ prt_printf(out, "written:");
+ prt_tab(out);
+ prt_printf(out, "%u", b->written);
+ prt_newline(out);
+
+ prt_printf(out, "writes blocked:");
+ prt_tab(out);
+ prt_printf(out, "%u", !list_empty_careful(&b->write_blocked));
+ prt_newline(out);
+
+ prt_printf(out, "will make reachable:");
+ prt_tab(out);
+ prt_printf(out, "%lx", b->will_make_reachable);
+ prt_newline(out);
+
+ prt_printf(out, "journal pin %px:", &b->writes[0].journal);
+ prt_tab(out);
+ prt_printf(out, "%llu", b->writes[0].journal.seq);
+ prt_newline(out);
+
+ prt_printf(out, "journal pin %px:", &b->writes[1].journal);
+ prt_tab(out);
+ prt_printf(out, "%llu", b->writes[1].journal.seq);
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+}
+
+static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ bool done = false;
+ ssize_t ret = 0;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ do {
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct btree *b;
+
+ ret = flush_buf(i);
+ if (ret)
+ return ret;
+
+ rcu_read_lock();
+ i->buf.atomic++;
+ tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
+ &c->btree_cache.table);
+ if (i->iter < tbl->size) {
+ rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
+ bch2_cached_btree_node_to_text(&i->buf, c, b);
+ i->iter++;
+ } else {
+ done = true;
+ }
+ --i->buf.atomic;
+ rcu_read_unlock();
+ } while (!done);
+
+ if (i->buf.allocation_failure)
+ ret = -ENOMEM;
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations cached_btree_nodes_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_cached_btree_nodes_read,
+};
+
+#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
+static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ struct btree_trans *trans;
+ ssize_t ret = 0;
+ u32 seq;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+restart:
+ seqmutex_lock(&c->btree_trans_lock);
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ if (trans->locking_wait.task->pid <= i->iter)
+ continue;
+
+ closure_get(&trans->ref);
+ seq = seqmutex_seq(&c->btree_trans_lock);
+ seqmutex_unlock(&c->btree_trans_lock);
+
+ ret = flush_buf(i);
+ if (ret) {
+ closure_put(&trans->ref);
+ goto unlocked;
+ }
+
+ bch2_btree_trans_to_text(&i->buf, trans);
+
+ prt_printf(&i->buf, "backtrace:");
+ prt_newline(&i->buf);
+ printbuf_indent_add(&i->buf, 2);
+ bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task);
+ printbuf_indent_sub(&i->buf, 2);
+ prt_newline(&i->buf);
+
+ i->iter = trans->locking_wait.task->pid;
+
+ closure_put(&trans->ref);
+
+ if (!seqmutex_relock(&c->btree_trans_lock, seq))
+ goto restart;
+ }
+ seqmutex_unlock(&c->btree_trans_lock);
+unlocked:
+ if (i->buf.allocation_failure)
+ ret = -ENOMEM;
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations btree_transactions_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_btree_transactions_read,
+};
+#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */
+
+static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ bool done = false;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ do {
+ err = flush_buf(i);
+ if (err)
+ return err;
+
+ if (!i->size)
+ break;
+
+ done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
+ i->iter++;
+ } while (!done);
+
+ if (i->buf.allocation_failure)
+ return -ENOMEM;
+
+ return i->ret;
+}
+
+static const struct file_operations journal_pins_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_journal_pins_read,
+};
+
+static int lock_held_stats_open(struct inode *inode, struct file *file)
+{
+ struct bch_fs *c = inode->i_private;
+ struct dump_iter *i;
+
+ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
+
+ if (!i)
+ return -ENOMEM;
+
+ i->iter = 0;
+ i->c = c;
+ i->buf = PRINTBUF;
+ file->private_data = i;
+
+ return 0;
+}
+
+static int lock_held_stats_release(struct inode *inode, struct file *file)
+{
+ struct dump_iter *i = file->private_data;
+
+ printbuf_exit(&i->buf);
+ kfree(i);
+
+ return 0;
+}
+
+static ssize_t lock_held_stats_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ while (1) {
+ struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter];
+
+ err = flush_buf(i);
+ if (err)
+ return err;
+
+ if (!i->size)
+ break;
+
+ if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) ||
+ !bch2_btree_transaction_fns[i->iter])
+ break;
+
+ prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]);
+ prt_newline(&i->buf);
+ printbuf_indent_add(&i->buf, 2);
+
+ mutex_lock(&s->lock);
+
+ prt_printf(&i->buf, "Max mem used: %u", s->max_mem);
+ prt_newline(&i->buf);
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
+ prt_printf(&i->buf, "Lock hold times:");
+ prt_newline(&i->buf);
+
+ printbuf_indent_add(&i->buf, 2);
+ bch2_time_stats_to_text(&i->buf, &s->lock_hold_times);
+ printbuf_indent_sub(&i->buf, 2);
+ }
+
+ if (s->max_paths_text) {
+ prt_printf(&i->buf, "Maximum allocated btree paths (%u):", s->nr_max_paths);
+ prt_newline(&i->buf);
+
+ printbuf_indent_add(&i->buf, 2);
+ prt_str_indented(&i->buf, s->max_paths_text);
+ printbuf_indent_sub(&i->buf, 2);
+ }
+
+ mutex_unlock(&s->lock);
+
+ printbuf_indent_sub(&i->buf, 2);
+ prt_newline(&i->buf);
+ i->iter++;
+ }
+
+ if (i->buf.allocation_failure)
+ return -ENOMEM;
+
+ return i->ret;
+}
+
+static const struct file_operations lock_held_stats_op = {
+ .owner = THIS_MODULE,
+ .open = lock_held_stats_open,
+ .release = lock_held_stats_release,
+ .read = lock_held_stats_read,
+};
+
+static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ struct btree_trans *trans;
+ ssize_t ret = 0;
+ u32 seq;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ if (i->iter)
+ goto out;
+restart:
+ seqmutex_lock(&c->btree_trans_lock);
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ if (trans->locking_wait.task->pid <= i->iter)
+ continue;
+
+ closure_get(&trans->ref);
+ seq = seqmutex_seq(&c->btree_trans_lock);
+ seqmutex_unlock(&c->btree_trans_lock);
+
+ ret = flush_buf(i);
+ if (ret) {
+ closure_put(&trans->ref);
+ goto out;
+ }
+
+ bch2_check_for_deadlock(trans, &i->buf);
+
+ i->iter = trans->locking_wait.task->pid;
+
+ closure_put(&trans->ref);
+
+ if (!seqmutex_relock(&c->btree_trans_lock, seq))
+ goto restart;
+ }
+ seqmutex_unlock(&c->btree_trans_lock);
+out:
+ if (i->buf.allocation_failure)
+ ret = -ENOMEM;
+
+ if (!ret)
+ ret = flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations btree_deadlock_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_btree_deadlock_read,
+};
+
+void bch2_fs_debug_exit(struct bch_fs *c)
+{
+ if (!IS_ERR_OR_NULL(c->fs_debug_dir))
+ debugfs_remove_recursive(c->fs_debug_dir);
+}
+
+void bch2_fs_debug_init(struct bch_fs *c)
+{
+ struct btree_debug *bd;
+ char name[100];
+
+ if (IS_ERR_OR_NULL(bch_debug))
+ return;
+
+ snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
+ c->fs_debug_dir = debugfs_create_dir(name, bch_debug);
+ if (IS_ERR_OR_NULL(c->fs_debug_dir))
+ return;
+
+ debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
+ c->btree_debug, &cached_btree_nodes_ops);
+
+#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
+ debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
+ c->btree_debug, &btree_transactions_ops);
+#endif
+
+ debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
+ c->btree_debug, &journal_pins_ops);
+
+ debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
+ c, &lock_held_stats_op);
+
+ debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
+ c->btree_debug, &btree_deadlock_ops);
+
+ c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
+ if (IS_ERR_OR_NULL(c->btree_debug_dir))
+ return;
+
+ for (bd = c->btree_debug;
+ bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
+ bd++) {
+ bd->id = bd - c->btree_debug;
+ debugfs_create_file(bch2_btree_id_str(bd->id),
+ 0400, c->btree_debug_dir, bd,
+ &btree_debug_ops);
+
+ snprintf(name, sizeof(name), "%s-formats",
+ bch2_btree_id_str(bd->id));
+
+ debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
+ &btree_format_debug_ops);
+
+ snprintf(name, sizeof(name), "%s-bfloat-failed",
+ bch2_btree_id_str(bd->id));
+
+ debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
+ &bfloat_failed_debug_ops);
+ }
+}
+
+#endif
+
+void bch2_debug_exit(void)
+{
+ if (!IS_ERR_OR_NULL(bch_debug))
+ debugfs_remove_recursive(bch_debug);
+}
+
+int __init bch2_debug_init(void)
+{
+ int ret = 0;
+
+ bch_debug = debugfs_create_dir("bcachefs", NULL);
+ return ret;
+}
diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h
new file mode 100644
index 000000000000..2c37143b5fd1
--- /dev/null
+++ b/fs/bcachefs/debug.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DEBUG_H
+#define _BCACHEFS_DEBUG_H
+
+#include "bcachefs.h"
+
+struct bio;
+struct btree;
+struct bch_fs;
+
+void __bch2_btree_verify(struct bch_fs *, struct btree *);
+void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *,
+ const struct btree *);
+
+static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
+{
+ if (bch2_verify_btree_ondisk)
+ __bch2_btree_verify(c, b);
+}
+
+#ifdef CONFIG_DEBUG_FS
+void bch2_fs_debug_exit(struct bch_fs *);
+void bch2_fs_debug_init(struct bch_fs *);
+#else
+static inline void bch2_fs_debug_exit(struct bch_fs *c) {}
+static inline void bch2_fs_debug_init(struct bch_fs *c) {}
+#endif
+
+void bch2_debug_exit(void);
+int bch2_debug_init(void);
+
+#endif /* _BCACHEFS_DEBUG_H */
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
new file mode 100644
index 000000000000..2bfff0da7000
--- /dev/null
+++ b/fs/bcachefs/dirent.c
@@ -0,0 +1,580 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "extents.h"
+#include "dirent.h"
+#include "fs.h"
+#include "keylist.h"
+#include "str_hash.h"
+#include "subvolume.h"
+
+#include <linux/dcache.h>
+
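+/*
+ * Dirent names are NUL padded out to a multiple of 8 bytes; strip trailing
+ * NULs from the last u64 to get the real name length:
+ */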
+static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
+{
+ unsigned bkey_u64s = bkey_val_u64s(d.k);
+ unsigned bkey_bytes = bkey_u64s * sizeof(u64);
+ u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1];
+#if CPU_BIG_ENDIAN
+ unsigned trailing_nuls = last_u64 ? __builtin_ctzll(last_u64) / 8 : 64 / 8;
+#else
+ unsigned trailing_nuls = last_u64 ? __builtin_clzll(last_u64) / 8 : 64 / 8;
+#endif
+
+ return bkey_bytes -
+ offsetof(struct bch_dirent, d_name) -
+ trailing_nuls;
+}
+
+struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d)
+{
+ return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
+}
+
+static u64 bch2_dirent_hash(const struct bch_hash_info *info,
+ const struct qstr *name)
+{
+ struct bch_str_hash_ctx ctx;
+
+ bch2_str_hash_init(&ctx, info);
+ bch2_str_hash_update(&ctx, info, name->name, name->len);
+
+ /* [0,2) reserved for dots */
+ return max_t(u64, bch2_str_hash_end(&ctx, info), 2);
+}
+
+static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
+{
+ return bch2_dirent_hash(info, key);
+}
+
+static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
+{
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ struct qstr name = bch2_dirent_get_name(d);
+
+ return bch2_dirent_hash(info, &name);
+}
+
+static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
+{
+ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
+ const struct qstr l_name = bch2_dirent_get_name(l);
+ const struct qstr *r_name = _r;
+
+ return l_name.len - r_name->len ?: memcmp(l_name.name, r_name->name, l_name.len);
+}
+
+static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
+{
+ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
+ struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
+ const struct qstr l_name = bch2_dirent_get_name(l);
+ const struct qstr r_name = bch2_dirent_get_name(r);
+
+ return l_name.len - r_name.len ?: memcmp(l_name.name, r_name.name, l_name.len);
+}
+
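+/*
+ * A dirent pointing to a subvolume is only visible from its parent
+ * subvolume:
+ */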
+static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k)
+{
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+
+ if (d.v->d_type == DT_SUBVOL)
+ return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol;
+ return true;
+}
+
+const struct bch_hash_desc bch2_dirent_hash_desc = {
+ .btree_id = BTREE_ID_dirents,
+ .key_type = KEY_TYPE_dirent,
+ .hash_key = dirent_hash_key,
+ .hash_bkey = dirent_hash_bkey,
+ .cmp_key = dirent_cmp_key,
+ .cmp_bkey = dirent_cmp_bkey,
+ .is_visible = dirent_is_visible,
+};
+
+int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ struct qstr d_name = bch2_dirent_get_name(d);
+ int ret = 0;
+
+ bkey_fsck_err_on(!d_name.len, c, err,
+ dirent_empty_name,
+ "empty name");
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), c, err,
+ dirent_val_too_big,
+ "value too big (%zu > %u)",
+ bkey_val_u64s(k.k), dirent_val_u64s(d_name.len));
+
+ /*
+ * Check new keys don't exceed the max length
+ * (older keys may be larger.)
+ */
+ bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX, c, err,
+ dirent_name_too_long,
+ "dirent name too big (%u > %u)",
+ d_name.len, BCH_NAME_MAX);
+
+ bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), c, err,
+ dirent_name_embedded_nul,
+ "dirent has stray data after name's NUL");
+
+ bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) ||
+ (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), c, err,
+ dirent_name_dot_or_dotdot,
+ "invalid name");
+
+ bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), c, err,
+ dirent_name_has_slash,
+ "name with /");
+
+ bkey_fsck_err_on(d.v->d_type != DT_SUBVOL &&
+ le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, err,
+ dirent_to_itself,
+ "dirent points to own directory");
+fsck_err:
+ return ret;
+}
+
+void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ struct qstr d_name = bch2_dirent_get_name(d);
+
+ prt_printf(out, "%.*s -> %llu type %s",
+ d_name.len,
+ d_name.name,
+ d.v->d_type != DT_SUBVOL
+ ? le64_to_cpu(d.v->d_inum)
+ : le32_to_cpu(d.v->d_child_subvol),
+ bch2_d_type_str(d.v->d_type));
+}
+
+static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
+ subvol_inum dir, u8 type,
+ const struct qstr *name, u64 dst)
+{
+ struct bkey_i_dirent *dirent;
+ unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
+
+ if (name->len > BCH_NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ BUG_ON(u64s > U8_MAX);
+
+ dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+ if (IS_ERR(dirent))
+ return dirent;
+
+ bkey_dirent_init(&dirent->k_i);
+ dirent->k.u64s = u64s;
+
+ if (type != DT_SUBVOL) {
+ dirent->v.d_inum = cpu_to_le64(dst);
+ } else {
+ dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol);
+ dirent->v.d_child_subvol = cpu_to_le32(dst);
+ }
+
+ dirent->v.d_type = type;
+
+ memcpy(dirent->v.d_name, name->name, name->len);
+ memset(dirent->v.d_name + name->len, 0,
+ bkey_val_bytes(&dirent->k) -
+ offsetof(struct bch_dirent, d_name) -
+ name->len);
+
+ EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);
+
+ return dirent;
+}
+
+int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
+ const struct bch_hash_info *hash_info,
+ u8 type, const struct qstr *name, u64 dst_inum,
+ u64 *dir_offset, int flags)
+{
+ struct bkey_i_dirent *dirent;
+ int ret;
+
+ dirent = dirent_create_key(trans, dir, type, name, dst_inum);
+ ret = PTR_ERR_OR_ZERO(dirent);
+ if (ret)
+ return ret;
+
+ ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
+ dir, &dirent->k_i, flags);
+ *dir_offset = dirent->k.p.offset;
+
+ return ret;
+}
+
+static void dirent_copy_target(struct bkey_i_dirent *dst,
+ struct bkey_s_c_dirent src)
+{
+ dst->v.d_inum = src.v->d_inum;
+ dst->v.d_type = src.v->d_type;
+}
+
+int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
+ struct bkey_s_c_dirent d, subvol_inum *target)
+{
+ struct bch_subvolume s;
+ int ret = 0;
+
+ if (d.v->d_type == DT_SUBVOL &&
+ le32_to_cpu(d.v->d_parent_subvol) != dir.subvol)
+ return 1;
+
+ if (likely(d.v->d_type != DT_SUBVOL)) {
+ target->subvol = dir.subvol;
+ target->inum = le64_to_cpu(d.v->d_inum);
+ } else {
+ target->subvol = le32_to_cpu(d.v->d_child_subvol);
+
+ ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s);
+
+ target->inum = le64_to_cpu(s.inode);
+ }
+
+ return ret;
+}
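+
+/*
+ * Return convention (see the callers below): 0 with *target filled in on
+ * success, a negative error on failure, or 1 when @d is a subvolume dirent
+ * that isn't visible from @dir's subvolume - readdir skips such entries and
+ * lookup maps the result to -ENOENT.
+ */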
+
+int bch2_dirent_rename(struct btree_trans *trans,
+ subvol_inum src_dir, struct bch_hash_info *src_hash,
+ subvol_inum dst_dir, struct bch_hash_info *dst_hash,
+ const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
+ const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
+ enum bch_rename_mode mode)
+{
+ struct btree_iter src_iter = { NULL };
+ struct btree_iter dst_iter = { NULL };
+ struct bkey_s_c old_src, old_dst = bkey_s_c_null;
+ struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
+ struct bpos dst_pos =
+ POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
+ unsigned src_type = 0, dst_type = 0, src_update_flags = 0;
+ int ret = 0;
+
+ if (src_dir.subvol != dst_dir.subvol)
+ return -EXDEV;
+
+ memset(src_inum, 0, sizeof(*src_inum));
+ memset(dst_inum, 0, sizeof(*dst_inum));
+
+ /* Lookup src: */
+ ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
+ src_hash, src_dir, src_name,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto out;
+
+ old_src = bch2_btree_iter_peek_slot(&src_iter);
+ ret = bkey_err(old_src);
+ if (ret)
+ goto out;
+
+ ret = bch2_dirent_read_target(trans, src_dir,
+ bkey_s_c_to_dirent(old_src), src_inum);
+ if (ret)
+ goto out;
+
+ src_type = bkey_s_c_to_dirent(old_src).v->d_type;
+
+ if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE)
+ return -EOPNOTSUPP;
+
+ /* Lookup dst: */
+ if (mode == BCH_RENAME) {
+ /*
+ * Note that we're _not_ checking if the target already exists -
+ * we're relying on the VFS to do that check for us for
+ * correctness:
+ */
+ ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
+ dst_hash, dst_dir, dst_name);
+ if (ret)
+ goto out;
+ } else {
+ ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
+ dst_hash, dst_dir, dst_name,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto out;
+
+ old_dst = bch2_btree_iter_peek_slot(&dst_iter);
+ ret = bkey_err(old_dst);
+ if (ret)
+ goto out;
+
+ ret = bch2_dirent_read_target(trans, dst_dir,
+ bkey_s_c_to_dirent(old_dst), dst_inum);
+ if (ret)
+ goto out;
+
+ dst_type = bkey_s_c_to_dirent(old_dst).v->d_type;
+
+ if (dst_type == DT_SUBVOL)
+ return -EOPNOTSUPP;
+ }
+
+ if (mode != BCH_RENAME_EXCHANGE)
+ *src_offset = dst_iter.pos.offset;
+
+ /* Create new dst key: */
+ new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0);
+ ret = PTR_ERR_OR_ZERO(new_dst);
+ if (ret)
+ goto out;
+
+ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
+ new_dst->k.p = dst_iter.pos;
+
+ /* Create new src key: */
+ if (mode == BCH_RENAME_EXCHANGE) {
+ new_src = dirent_create_key(trans, src_dir, 0, src_name, 0);
+ ret = PTR_ERR_OR_ZERO(new_src);
+ if (ret)
+ goto out;
+
+ dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
+ new_src->k.p = src_iter.pos;
+ } else {
+ new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+ ret = PTR_ERR_OR_ZERO(new_src);
+ if (ret)
+ goto out;
+
+ bkey_init(&new_src->k);
+ new_src->k.p = src_iter.pos;
+
+ if (bkey_le(dst_pos, src_iter.pos) &&
+ bkey_lt(src_iter.pos, dst_iter.pos)) {
+ /*
+ * We have a hash collision for the new dst key,
+ * and new_src - the key we're deleting - is between
+ * new_dst's hashed slot and the slot we're going to be
+ * inserting it into - oops. This will break the hash
+ * table if we don't deal with it:
+ */
+ if (mode == BCH_RENAME) {
+ /*
+ * If we're not overwriting, we can just insert
+ * new_dst at the src position:
+ */
+ new_src = new_dst;
+ new_src->k.p = src_iter.pos;
+ goto out_set_src;
+ } else {
+				/*
+				 * If we're overwriting, we can't insert new_dst
+				 * at a different slot because it has to
+				 * overwrite old_dst - just make sure to use a
+				 * whiteout when deleting src:
+				 */
+ new_src->k.type = KEY_TYPE_hash_whiteout;
+ }
+ } else {
+ /* Check if we need a whiteout to delete src: */
+ ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
+ src_hash, &src_iter);
+ if (ret < 0)
+ goto out;
+
+ if (ret)
+ new_src->k.type = KEY_TYPE_hash_whiteout;
+ }
+ }
+
+ ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
+ if (ret)
+ goto out;
+out_set_src:
+
+ /*
+ * If we're deleting a subvolume, we need to really delete the dirent,
+ * not just emit a whiteout in the current snapshot:
+ */
+ if (src_type == DT_SUBVOL) {
+ bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(&src_iter);
+ if (ret)
+ goto out;
+
+ new_src->k.p = src_iter.pos;
+ src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE;
+ }
+
+ ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
+ if (ret)
+ goto out;
+
+ if (mode == BCH_RENAME_EXCHANGE)
+ *src_offset = new_src->k.p.offset;
+ *dst_offset = new_dst->k.p.offset;
+out:
+ bch2_trans_iter_exit(trans, &src_iter);
+ bch2_trans_iter_exit(trans, &dst_iter);
+ return ret;
+}
+
+int __bch2_dirent_lookup_trans(struct btree_trans *trans,
+ struct btree_iter *iter,
+ subvol_inum dir,
+ const struct bch_hash_info *hash_info,
+ const struct qstr *name, subvol_inum *inum,
+ unsigned flags)
+{
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent d;
+ u32 snapshot;
+ int ret;
+
+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
+ hash_info, dir, name, flags);
+ if (ret)
+ return ret;
+
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ d = bkey_s_c_to_dirent(k);
+
+ ret = bch2_dirent_read_target(trans, dir, d, inum);
+ if (ret > 0)
+ ret = -ENOENT;
+err:
+ if (ret)
+ bch2_trans_iter_exit(trans, iter);
+
+ return ret;
+}
+
+u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
+ const struct bch_hash_info *hash_info,
+ const struct qstr *name, subvol_inum *inum)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ int ret;
+retry:
+ bch2_trans_begin(trans);
+
+ ret = __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info,
+ name, inum, 0);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+ if (!ret)
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
+}
+
+int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
+ SPOS(dir, 0, snapshot),
+ POS(dir, U64_MAX), 0, k, ret)
+ if (k.k->type == KEY_TYPE_dirent) {
+ ret = -ENOTEMPTY;
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
+{
+ u32 snapshot;
+
+ return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?:
+ bch2_empty_dir_snapshot(trans, dir.inum, snapshot);
+}
+
+int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent dirent;
+ subvol_inum target;
+ u32 snapshot;
+ struct bkey_buf sk;
+ struct qstr name;
+ int ret;
+
+ bch2_bkey_buf_init(&sk);
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
+ SPOS(inum.inum, ctx->pos, snapshot),
+ POS(inum.inum, U64_MAX), 0, k, ret) {
+ if (k.k->type != KEY_TYPE_dirent)
+ continue;
+
+ dirent = bkey_s_c_to_dirent(k);
+
+ ret = bch2_dirent_read_target(trans, inum, dirent, &target);
+ if (ret < 0)
+ break;
+ if (ret)
+ continue;
+
+ /* dir_emit() can fault and block: */
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ dirent = bkey_i_to_s_c_dirent(sk.k);
+ bch2_trans_unlock(trans);
+
+ name = bch2_dirent_get_name(dirent);
+
+ ctx->pos = dirent.k->p.offset;
+ if (!dir_emit(ctx, name.name,
+ name.len,
+ target.inum,
+ vfs_d_type(dirent.v->d_type)))
+ break;
+ ctx->pos = dirent.k->p.offset + 1;
+
+		/*
+		 * read_target looks up subvolumes; each lookup takes a btree
+		 * path, so a directory with many subvolumes in it can overflow
+		 * the transaction's paths:
+		 */
+ ret = btree_trans_too_many_iters(trans);
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&sk, c);
+
+ return ret;
+}
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
new file mode 100644
index 000000000000..1e3431990abd
--- /dev/null
+++ b/fs/bcachefs/dirent.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DIRENT_H
+#define _BCACHEFS_DIRENT_H
+
+#include "str_hash.h"
+
+enum bkey_invalid_flags;
+extern const struct bch_hash_desc bch2_dirent_hash_desc;
+
+int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_dirent ((struct bkey_ops) { \
+ .key_invalid = bch2_dirent_invalid, \
+ .val_to_text = bch2_dirent_to_text, \
+ .min_val_size = 16, \
+})
+
+struct qstr;
+struct file;
+struct dir_context;
+struct bch_fs;
+struct bch_hash_info;
+struct bch_inode_info;
+
+struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d);
+
+static inline unsigned dirent_val_u64s(unsigned len)
+{
+ return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len,
+ sizeof(u64));
+}
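+
+/*
+ * Example (illustrative, assuming offsetof(struct bch_dirent, d_name) == 9):
+ * dirent_val_u64s(3)  == DIV_ROUND_UP(9 + 3,  8) == 2 u64s (16 bytes),
+ * dirent_val_u64s(16) == DIV_ROUND_UP(9 + 16, 8) == 4 u64s (32 bytes);
+ * the value is always padded out to a whole number of u64s with NULs.
+ */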
+
+int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
+ struct bkey_s_c_dirent, subvol_inum *);
+
+int bch2_dirent_create(struct btree_trans *, subvol_inum,
+ const struct bch_hash_info *, u8,
+ const struct qstr *, u64, u64 *, int);
+
+static inline unsigned vfs_d_type(unsigned type)
+{
+ return type == DT_SUBVOL ? DT_DIR : type;
+}
+
+enum bch_rename_mode {
+ BCH_RENAME,
+ BCH_RENAME_OVERWRITE,
+ BCH_RENAME_EXCHANGE,
+};
+
+int bch2_dirent_rename(struct btree_trans *,
+ subvol_inum, struct bch_hash_info *,
+ subvol_inum, struct bch_hash_info *,
+ const struct qstr *, subvol_inum *, u64 *,
+ const struct qstr *, subvol_inum *, u64 *,
+ enum bch_rename_mode);
+
+int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
+ subvol_inum, const struct bch_hash_info *,
+ const struct qstr *, subvol_inum *, unsigned);
+u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
+ const struct bch_hash_info *,
+ const struct qstr *, subvol_inum *);
+
+int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32);
+int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
+int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
+
+#endif /* _BCACHEFS_DIRENT_H */
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
new file mode 100644
index 000000000000..4d0cb0ccff32
--- /dev/null
+++ b/fs/bcachefs/disk_groups.c
@@ -0,0 +1,622 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "disk_groups.h"
+#include "sb-members.h"
+#include "super-io.h"
+
+#include <linux/sort.h>
+
+static int group_cmp(const void *_l, const void *_r)
+{
+ const struct bch_disk_group *l = _l;
+ const struct bch_disk_group *r = _r;
+
+ return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) -
+ (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?:
+ ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) -
+ (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?:
+ strncmp(l->label, r->label, sizeof(l->label));
+}
+
+static int bch2_sb_disk_groups_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_disk_groups *groups =
+ field_to_type(f, disk_groups);
+ struct bch_disk_group *g, *sorted = NULL;
+ unsigned nr_groups = disk_groups_nr(groups);
+ unsigned i, len;
+ int ret = 0;
+
+ for (i = 0; i < sb->nr_devices; i++) {
+ struct bch_member m = bch2_sb_member_get(sb, i);
+ unsigned group_id;
+
+ if (!BCH_MEMBER_GROUP(&m))
+ continue;
+
+ group_id = BCH_MEMBER_GROUP(&m) - 1;
+
+ if (group_id >= nr_groups) {
+ prt_printf(err, "disk %u has invalid label %u (have %u)",
+ i, group_id, nr_groups);
+ return -BCH_ERR_invalid_sb_disk_groups;
+ }
+
+ if (BCH_GROUP_DELETED(&groups->entries[group_id])) {
+ prt_printf(err, "disk %u has deleted label %u", i, group_id);
+ return -BCH_ERR_invalid_sb_disk_groups;
+ }
+ }
+
+ if (!nr_groups)
+ return 0;
+
+ for (i = 0; i < nr_groups; i++) {
+ g = groups->entries + i;
+
+ if (BCH_GROUP_DELETED(g))
+ continue;
+
+ len = strnlen(g->label, sizeof(g->label));
+ if (!len) {
+ prt_printf(err, "label %u empty", i);
+ return -BCH_ERR_invalid_sb_disk_groups;
+ }
+ }
+
+ sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
+ if (!sorted)
+ return -BCH_ERR_ENOMEM_disk_groups_validate;
+
+ memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
+ sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
+
+ for (g = sorted; g + 1 < sorted + nr_groups; g++)
+ if (!BCH_GROUP_DELETED(g) &&
+ !group_cmp(&g[0], &g[1])) {
+ prt_printf(err, "duplicate label %llu.%.*s",
+ BCH_GROUP_PARENT(g),
+ (int) sizeof(g->label), g->label);
+ ret = -BCH_ERR_invalid_sb_disk_groups;
+ goto err;
+ }
+err:
+ kfree(sorted);
+ return ret;
+}
+
+void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct bch_disk_groups_cpu *g;
+ struct bch_dev *ca;
+ int i;
+ unsigned iter;
+
+ out->atomic++;
+ rcu_read_lock();
+
+ g = rcu_dereference(c->disk_groups);
+ if (!g)
+ goto out;
+
+ for (i = 0; i < g->nr; i++) {
+ if (i)
+ prt_printf(out, " ");
+
+ if (g->entries[i].deleted) {
+ prt_printf(out, "[deleted]");
+ continue;
+ }
+
+ prt_printf(out, "[parent %d devs", g->entries[i].parent);
+ for_each_member_device_rcu(ca, c, iter, &g->entries[i].devs)
+ prt_printf(out, " %s", ca->name);
+ prt_printf(out, "]");
+ }
+
+out:
+ rcu_read_unlock();
+ out->atomic--;
+}
+
+static void bch2_sb_disk_groups_to_text(struct printbuf *out,
+ struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_disk_groups *groups =
+ field_to_type(f, disk_groups);
+ struct bch_disk_group *g;
+ unsigned nr_groups = disk_groups_nr(groups);
+
+ for (g = groups->entries;
+ g < groups->entries + nr_groups;
+ g++) {
+ if (g != groups->entries)
+ prt_printf(out, " ");
+
+ if (BCH_GROUP_DELETED(g))
+ prt_printf(out, "[deleted]");
+ else
+ prt_printf(out, "[parent %llu name %s]",
+ BCH_GROUP_PARENT(g), g->label);
+ }
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
+ .validate = bch2_sb_disk_groups_validate,
+ .to_text = bch2_sb_disk_groups_to_text
+};
+
+int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_disk_groups *groups;
+ struct bch_disk_groups_cpu *cpu_g, *old_g;
+ unsigned i, g, nr_groups;
+
+ lockdep_assert_held(&c->sb_lock);
+
+ groups = bch2_sb_field_get(c->disk_sb.sb, disk_groups);
+ nr_groups = disk_groups_nr(groups);
+
+ if (!groups)
+ return 0;
+
+ cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL);
+ if (!cpu_g)
+ return -BCH_ERR_ENOMEM_disk_groups_to_cpu;
+
+ cpu_g->nr = nr_groups;
+
+ for (i = 0; i < nr_groups; i++) {
+ struct bch_disk_group *src = &groups->entries[i];
+ struct bch_disk_group_cpu *dst = &cpu_g->entries[i];
+
+ dst->deleted = BCH_GROUP_DELETED(src);
+ dst->parent = BCH_GROUP_PARENT(src);
+ memcpy(dst->label, src->label, sizeof(dst->label));
+ }
+
+ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
+ struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i);
+ struct bch_disk_group_cpu *dst;
+
+ if (!bch2_member_exists(&m))
+ continue;
+
+ g = BCH_MEMBER_GROUP(&m);
+ while (g) {
+ dst = &cpu_g->entries[g - 1];
+ __set_bit(i, dst->devs.d);
+ g = dst->parent;
+ }
+ }
+
+ old_g = rcu_dereference_protected(c->disk_groups,
+ lockdep_is_held(&c->sb_lock));
+ rcu_assign_pointer(c->disk_groups, cpu_g);
+ if (old_g)
+ kfree_rcu(old_g, rcu);
+
+ return 0;
+}
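+
+/*
+ * Note that when building the cpu masks, each member's label is walked up the
+ * parent chain, so a device assigned to a nested label is also set in the
+ * device mask of every ancestor label - targeting a parent label therefore
+ * includes the devices of all its children.
+ */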
+
+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
+{
+ struct target t = target_decode(target);
+ struct bch_devs_mask *devs;
+
+ rcu_read_lock();
+
+ switch (t.type) {
+ case TARGET_NULL:
+ devs = NULL;
+ break;
+ case TARGET_DEV: {
+ struct bch_dev *ca = t.dev < c->sb.nr_devices
+ ? rcu_dereference(c->devs[t.dev])
+ : NULL;
+ devs = ca ? &ca->self : NULL;
+ break;
+ }
+ case TARGET_GROUP: {
+ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
+
+ devs = g && t.group < g->nr && !g->entries[t.group].deleted
+ ? &g->entries[t.group].devs
+ : NULL;
+ break;
+ }
+ default:
+ BUG();
+ }
+
+ rcu_read_unlock();
+
+ return devs;
+}
+
+bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
+{
+ struct target t = target_decode(target);
+
+ switch (t.type) {
+ case TARGET_NULL:
+ return false;
+ case TARGET_DEV:
+ return dev == t.dev;
+ case TARGET_GROUP: {
+ struct bch_disk_groups_cpu *g;
+ const struct bch_devs_mask *m;
+ bool ret;
+
+ rcu_read_lock();
+ g = rcu_dereference(c->disk_groups);
+ m = g && t.group < g->nr && !g->entries[t.group].deleted
+ ? &g->entries[t.group].devs
+ : NULL;
+
+ ret = m ? test_bit(dev, m->d) : false;
+ rcu_read_unlock();
+
+ return ret;
+ }
+ default:
+ BUG();
+ }
+}
+
+static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
+ unsigned parent,
+ const char *name, unsigned namelen)
+{
+ unsigned i, nr_groups = disk_groups_nr(groups);
+
+ if (!namelen || namelen > BCH_SB_LABEL_SIZE)
+ return -EINVAL;
+
+ for (i = 0; i < nr_groups; i++) {
+ struct bch_disk_group *g = groups->entries + i;
+
+ if (BCH_GROUP_DELETED(g))
+ continue;
+
+		if (BCH_GROUP_PARENT(g) == parent &&
+ strnlen(g->label, sizeof(g->label)) == namelen &&
+ !memcmp(name, g->label, namelen))
+ return i;
+ }
+
+ return -1;
+}
+
+static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
+ const char *name, unsigned namelen)
+{
+ struct bch_sb_field_disk_groups *groups =
+ bch2_sb_field_get(sb->sb, disk_groups);
+ unsigned i, nr_groups = disk_groups_nr(groups);
+ struct bch_disk_group *g;
+
+ if (!namelen || namelen > BCH_SB_LABEL_SIZE)
+ return -EINVAL;
+
+ for (i = 0;
+ i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
+ i++)
+ ;
+
+ if (i == nr_groups) {
+ unsigned u64s =
+ (sizeof(struct bch_sb_field_disk_groups) +
+ sizeof(struct bch_disk_group) * (nr_groups + 1)) /
+ sizeof(u64);
+
+ groups = bch2_sb_field_resize(sb, disk_groups, u64s);
+ if (!groups)
+ return -BCH_ERR_ENOSPC_disk_label_add;
+
+ nr_groups = disk_groups_nr(groups);
+ }
+
+ BUG_ON(i >= nr_groups);
+
+ g = &groups->entries[i];
+
+ memcpy(g->label, name, namelen);
+ if (namelen < sizeof(g->label))
+ g->label[namelen] = '\0';
+ SET_BCH_GROUP_DELETED(g, 0);
+ SET_BCH_GROUP_PARENT(g, parent);
+ SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
+
+ return i;
+}
+
+int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
+{
+ struct bch_sb_field_disk_groups *groups =
+ bch2_sb_field_get(sb->sb, disk_groups);
+ int v = -1;
+
+ do {
+ const char *next = strchrnul(name, '.');
+ unsigned len = next - name;
+
+ if (*next == '.')
+ next++;
+
+ v = __bch2_disk_group_find(groups, v + 1, name, len);
+ name = next;
+ } while (*name && v >= 0);
+
+ return v;
+}
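+
+/*
+ * Example (sketch): looking up a nested label such as "ssd.ssd1" (hypothetical
+ * names) resolves one '.'-separated component at a time; each component is
+ * matched with the previous component's index + 1 as the parent (0 meaning
+ * "top level"), mirroring how BCH_GROUP_PARENT() and BCH_MEMBER_GROUP() store
+ * index + 1 with 0 reserved for "none".
+ */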
+
+int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
+{
+ struct bch_sb_field_disk_groups *groups;
+ unsigned parent = 0;
+ int v = -1;
+
+ do {
+ const char *next = strchrnul(name, '.');
+ unsigned len = next - name;
+
+ if (*next == '.')
+ next++;
+
+ groups = bch2_sb_field_get(sb->sb, disk_groups);
+
+ v = __bch2_disk_group_find(groups, parent, name, len);
+ if (v < 0)
+ v = __bch2_disk_group_add(sb, parent, name, len);
+ if (v < 0)
+ return v;
+
+ parent = v + 1;
+ name = next;
+ } while (*name && v >= 0);
+
+ return v;
+}
+
+void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
+{
+ struct bch_disk_groups_cpu *groups;
+ struct bch_disk_group_cpu *g;
+ unsigned nr = 0;
+ u16 path[32];
+
+ out->atomic++;
+ rcu_read_lock();
+ groups = rcu_dereference(c->disk_groups);
+ if (!groups)
+ goto invalid;
+
+ while (1) {
+ if (nr == ARRAY_SIZE(path))
+ goto invalid;
+
+ if (v >= groups->nr)
+ goto invalid;
+
+ g = groups->entries + v;
+
+ if (g->deleted)
+ goto invalid;
+
+ path[nr++] = v;
+
+ if (!g->parent)
+ break;
+
+ v = g->parent - 1;
+ }
+
+ while (nr) {
+ v = path[--nr];
+ g = groups->entries + v;
+
+ prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
+ if (nr)
+ prt_printf(out, ".");
+ }
+out:
+ rcu_read_unlock();
+ out->atomic--;
+ return;
+invalid:
+ prt_printf(out, "invalid label %u", v);
+ goto out;
+}
+
+void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
+{
+ struct bch_sb_field_disk_groups *groups =
+ bch2_sb_field_get(sb, disk_groups);
+ struct bch_disk_group *g;
+ unsigned nr = 0;
+ u16 path[32];
+
+ while (1) {
+ if (nr == ARRAY_SIZE(path))
+ goto inval;
+
+ if (v >= disk_groups_nr(groups))
+ goto inval;
+
+ g = groups->entries + v;
+
+ if (BCH_GROUP_DELETED(g))
+ goto inval;
+
+ path[nr++] = v;
+
+ if (!BCH_GROUP_PARENT(g))
+ break;
+
+ v = BCH_GROUP_PARENT(g) - 1;
+ }
+
+ while (nr) {
+ v = path[--nr];
+ g = groups->entries + v;
+
+ prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
+ if (nr)
+ prt_printf(out, ".");
+ }
+ return;
+inval:
+ prt_printf(out, "invalid label %u", v);
+}
+
+int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+{
+ struct bch_member *mi;
+ int ret, v = -1;
+
+ if (!strlen(name) || !strcmp(name, "none"))
+ return 0;
+
+ v = bch2_disk_path_find_or_create(&c->disk_sb, name);
+ if (v < 0)
+ return v;
+
+ ret = bch2_sb_disk_groups_to_cpu(c);
+ if (ret)
+ return ret;
+
+ mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ SET_BCH_MEMBER_GROUP(mi, v + 1);
+ return 0;
+}
+
+int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+{
+ int ret;
+
+ mutex_lock(&c->sb_lock);
+ ret = __bch2_dev_group_set(c, ca, name) ?:
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return ret;
+}
+
+int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
+ struct printbuf *err)
+{
+ struct bch_dev *ca;
+ int g;
+
+ if (!val)
+ return -EINVAL;
+
+ if (!c)
+ return 0;
+
+ if (!strlen(val) || !strcmp(val, "none")) {
+ *res = 0;
+ return 0;
+ }
+
+ /* Is it a device? */
+ ca = bch2_dev_lookup(c, val);
+ if (!IS_ERR(ca)) {
+ *res = dev_to_target(ca->dev_idx);
+ percpu_ref_put(&ca->ref);
+ return 0;
+ }
+
+ mutex_lock(&c->sb_lock);
+ g = bch2_disk_path_find(&c->disk_sb, val);
+ mutex_unlock(&c->sb_lock);
+
+ if (g >= 0) {
+ *res = group_to_target(g);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
+{
+ struct target t = target_decode(v);
+
+ switch (t.type) {
+ case TARGET_NULL:
+ prt_printf(out, "none");
+ break;
+ case TARGET_DEV: {
+ struct bch_dev *ca;
+
+ out->atomic++;
+ rcu_read_lock();
+ ca = t.dev < c->sb.nr_devices
+ ? rcu_dereference(c->devs[t.dev])
+ : NULL;
+
+ if (ca && percpu_ref_tryget(&ca->io_ref)) {
+ prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
+ percpu_ref_put(&ca->io_ref);
+ } else if (ca) {
+ prt_printf(out, "offline device %u", t.dev);
+ } else {
+ prt_printf(out, "invalid device %u", t.dev);
+ }
+
+ rcu_read_unlock();
+ out->atomic--;
+ break;
+ }
+ case TARGET_GROUP:
+ bch2_disk_path_to_text(out, c, t.group);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
+{
+ struct target t = target_decode(v);
+
+ switch (t.type) {
+ case TARGET_NULL:
+ prt_printf(out, "none");
+ break;
+ case TARGET_DEV: {
+ struct bch_member m = bch2_sb_member_get(sb, t.dev);
+
+ if (bch2_dev_exists(sb, t.dev)) {
+ prt_printf(out, "Device ");
+ pr_uuid(out, m.uuid.b);
+ prt_printf(out, " (%u)", t.dev);
+ } else {
+ prt_printf(out, "Bad device %u", t.dev);
+ }
+ break;
+ }
+ case TARGET_GROUP:
+ bch2_disk_path_to_text_sb(out, sb, t.group);
+ break;
+ default:
+ BUG();
+ }
+}
+
+void bch2_opt_target_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_sb *sb,
+ u64 v)
+{
+ if (c)
+ bch2_target_to_text(out, c, v);
+ else
+ bch2_target_to_text_sb(out, sb, v);
+}
diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h
new file mode 100644
index 000000000000..441826fff224
--- /dev/null
+++ b/fs/bcachefs/disk_groups.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_GROUPS_H
+#define _BCACHEFS_DISK_GROUPS_H
+
+#include "disk_groups_types.h"
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
+
+static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
+{
+ return groups
+ ? (vstruct_end(&groups->field) -
+ (void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
+ : 0;
+}
+
+struct target {
+ enum {
+ TARGET_NULL,
+ TARGET_DEV,
+ TARGET_GROUP,
+ } type;
+ union {
+ unsigned dev;
+ unsigned group;
+ };
+};
+
+#define TARGET_DEV_START 1
+#define TARGET_GROUP_START (256 + TARGET_DEV_START)
+
+static inline u16 dev_to_target(unsigned dev)
+{
+ return TARGET_DEV_START + dev;
+}
+
+static inline u16 group_to_target(unsigned group)
+{
+ return TARGET_GROUP_START + group;
+}
+
+static inline struct target target_decode(unsigned target)
+{
+ if (target >= TARGET_GROUP_START)
+ return (struct target) {
+ .type = TARGET_GROUP,
+ .group = target - TARGET_GROUP_START
+ };
+
+ if (target >= TARGET_DEV_START)
+ return (struct target) {
+ .type = TARGET_DEV,
+			.dev = target - TARGET_DEV_START
+ };
+
+ return (struct target) { .type = TARGET_NULL };
+}
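+
+/*
+ * Worked example of the encoding: target 0 is "none"; targets 1..256 are
+ * devices, so dev_to_target(0) == 1 and target_decode(1) yields
+ * { TARGET_DEV, .dev = 0 }; targets 257 and up are labels, so
+ * group_to_target(3) == 260 and target_decode(260) yields
+ * { TARGET_GROUP, .group = 3 }.
+ */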
+
+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
+
+static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c,
+ enum bch_data_type data_type,
+ u16 target)
+{
+ struct bch_devs_mask devs = c->rw_devs[data_type];
+ const struct bch_devs_mask *t = bch2_target_to_mask(c, target);
+
+ if (t)
+ bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
+ return devs;
+}
+
+static inline bool bch2_target_accepts_data(struct bch_fs *c,
+ enum bch_data_type data_type,
+ u16 target)
+{
+ struct bch_devs_mask rw_devs = target_rw_devs(c, data_type, target);
+ return !bitmap_empty(rw_devs.d, BCH_SB_MEMBERS_MAX);
+}
+
+bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
+
+int bch2_disk_path_find(struct bch_sb_handle *, const char *);
+
+/* Exported for userspace bcachefs-tools: */
+int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
+
+void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned);
+void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned);
+
+void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned);
+
+int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
+void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+
+#define bch2_opt_target (struct bch_opt_fn) { \
+ .parse = bch2_opt_target_parse, \
+ .to_text = bch2_opt_target_to_text, \
+}
+
+int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
+
+int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
+int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
+
+const char *bch2_sb_validate_disk_groups(struct bch_sb *,
+ struct bch_sb_field *);
+
+void bch2_disk_groups_to_text(struct printbuf *, struct bch_fs *);
+
+#endif /* _BCACHEFS_DISK_GROUPS_H */
diff --git a/fs/bcachefs/disk_groups_types.h b/fs/bcachefs/disk_groups_types.h
new file mode 100644
index 000000000000..a54ef085b13d
--- /dev/null
+++ b/fs/bcachefs/disk_groups_types.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H
+#define _BCACHEFS_DISK_GROUPS_TYPES_H
+
+struct bch_disk_group_cpu {
+ bool deleted;
+ u16 parent;
+ u8 label[BCH_SB_LABEL_SIZE];
+ struct bch_devs_mask devs;
+};
+
+struct bch_disk_groups_cpu {
+ struct rcu_head rcu;
+ unsigned nr;
+ struct bch_disk_group_cpu entries[] __counted_by(nr);
+};
+
+#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
new file mode 100644
index 000000000000..2a77de18c004
--- /dev/null
+++ b/fs/bcachefs/ec.c
@@ -0,0 +1,1981 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* erasure coding */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "backpointers.h"
+#include "bkey_buf.h"
+#include "bset.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "io_read.h"
+#include "keylist.h"
+#include "recovery.h"
+#include "replicas.h"
+#include "super-io.h"
+#include "util.h"
+
+#include <linux/sort.h>
+
+#ifdef __KERNEL__
+
+#include <linux/raid/pq.h>
+#include <linux/raid/xor.h>
+
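+/*
+ * Recover a single missing block as the XOR of all the other blocks (standard
+ * RAID5 parity); raid_gen() below also uses this with failed_idx == nd to
+ * compute the P block for single-parity stripes.
+ */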
+static void raid5_recov(unsigned disks, unsigned failed_idx,
+ size_t size, void **data)
+{
+ unsigned i = 2, nr;
+
+ BUG_ON(failed_idx >= disks);
+
+ swap(data[0], data[failed_idx]);
+ memcpy(data[0], data[1], size);
+
+ while (i < disks) {
+ nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
+ xor_blocks(nr, size, data[0], data + i);
+ i += nr;
+ }
+
+ swap(data[0], data[failed_idx]);
+}
+
+static void raid_gen(int nd, int np, size_t size, void **v)
+{
+ if (np >= 1)
+ raid5_recov(nd + np, nd, size, v);
+ if (np >= 2)
+ raid6_call.gen_syndrome(nd + np, size, v);
+ BUG_ON(np > 2);
+}
+
+static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
+{
+ switch (nr) {
+ case 0:
+ break;
+ case 1:
+ if (ir[0] < nd + 1)
+ raid5_recov(nd + 1, ir[0], size, v);
+ else
+ raid6_call.gen_syndrome(nd + np, size, v);
+ break;
+ case 2:
+ if (ir[1] < nd) {
+ /* data+data failure. */
+ raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
+ } else if (ir[0] < nd) {
+ /* data + p/q failure */
+
+ if (ir[1] == nd) /* data + p failure */
+ raid6_datap_recov(nd + np, size, ir[0], v);
+ else { /* data + q failure */
+ raid5_recov(nd + 1, ir[0], size, v);
+ raid6_call.gen_syndrome(nd + np, size, v);
+ }
+ } else {
+ raid_gen(nd, np, size, v);
+ }
+ break;
+ default:
+ BUG();
+ }
+}
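+
+/*
+ * Illustrative sketch (not part of the original patch; ec_raid_example is a
+ * hypothetical helper): generate parity for a 2 data + 2 parity stripe held in
+ * four buffers of @size bytes (data blocks first, then P, then Q, as
+ * raid_gen()/raid_rec() expect - @size would be a multiple of the sector size,
+ * as in ec_generate_ec()), then rebuild a lost data block and P.
+ */
+static void __maybe_unused ec_raid_example(size_t size, void *v[4])
+{
+	int failed[2] = { 0, 2 };	/* data block 0 and the P block */
+
+	raid_gen(2, 2, size, v);	/* compute P and Q from the data */
+
+	memset(v[0], 0, size);		/* simulate losing data block 0... */
+	memset(v[2], 0, size);		/* ...and the P block */
+
+	/* ir[0] < nd, ir[1] == nd: "data + p failure", rebuilt via Q */
+	raid_rec(2, failed, 2, 2, size, v);
+}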
+
+#else
+
+#include <raid/raid.h>
+
+#endif
+
+struct ec_bio {
+ struct bch_dev *ca;
+ struct ec_stripe_buf *buf;
+ size_t idx;
+ struct bio bio;
+};
+
+/* Stripes btree keys: */
+
+int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
+ bpos_gt(k.k->p, POS(0, U32_MAX)), c, err,
+ stripe_pos_bad,
+ "stripe at bad pos");
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err,
+ stripe_val_size_bad,
+ "incorrect value size (%zu < %u)",
+ bkey_val_u64s(k.k), stripe_val_u64s(s));
+
+ ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+fsck_err:
+ return ret;
+}
+
+void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+ unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
+
+ prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
+ s->algorithm,
+ le16_to_cpu(s->sectors),
+ nr_data,
+ s->nr_redundant,
+ s->csum_type,
+ 1U << s->csum_granularity_bits);
+
+ for (i = 0; i < s->nr_blocks; i++) {
+ const struct bch_extent_ptr *ptr = s->ptrs + i;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ u32 offset;
+ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+ prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
+ if (i < nr_data)
+ prt_printf(out, "#%u", stripe_blockcount_get(s, i));
+ prt_printf(out, " gen %u", ptr->gen);
+ if (ptr_stale(ca, ptr))
+ prt_printf(out, " stale");
+ }
+}
+
+/* returns blocknr in stripe that we matched: */
+static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
+ struct bkey_s_c k, unsigned *block)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+ unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ for (i = 0; i < nr_data; i++)
+ if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
+ le16_to_cpu(s->sectors))) {
+ *block = i;
+ return ptr;
+ }
+
+ return NULL;
+}
+
+static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_extent: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const union bch_extent_entry *entry;
+
+ extent_for_each_entry(e, entry)
+ if (extent_entry_type(entry) ==
+ BCH_EXTENT_ENTRY_stripe_ptr &&
+ entry->stripe_ptr.idx == idx)
+ return true;
+
+ break;
+ }
+ }
+
+ return false;
+}
+
+/* Stripe bufs: */
+
+static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
+{
+ if (buf->key.k.type == KEY_TYPE_stripe) {
+ struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key);
+ unsigned i;
+
+ for (i = 0; i < s->v.nr_blocks; i++) {
+ kvpfree(buf->data[i], buf->size << 9);
+ buf->data[i] = NULL;
+ }
+ }
+}
+
+/* XXX: this is a non-mempoolified memory allocation: */
+static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
+ unsigned offset, unsigned size)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned csum_granularity = 1U << v->csum_granularity_bits;
+ unsigned end = offset + size;
+ unsigned i;
+
+ BUG_ON(end > le16_to_cpu(v->sectors));
+
+ offset = round_down(offset, csum_granularity);
+ end = min_t(unsigned, le16_to_cpu(v->sectors),
+ round_up(end, csum_granularity));
+
+ buf->offset = offset;
+ buf->size = end - offset;
+
+ memset(buf->valid, 0xFF, sizeof(buf->valid));
+
+ for (i = 0; i < v->nr_blocks; i++) {
+ buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL);
+ if (!buf->data[i])
+ goto err;
+ }
+
+ return 0;
+err:
+ ec_stripe_buf_exit(buf);
+ return -BCH_ERR_ENOMEM_stripe_buf;
+}
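+
+/*
+ * Example of the rounding above: with csum_granularity_bits == 3 (8 sector
+ * checksum granules) and a 128 sector stripe, a request for offset 5, size 10
+ * is widened to cover sectors [0, 16), so that only whole checksum granules
+ * ever need to be read or checksummed.
+ */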
+
+/* Checksumming: */
+
+static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
+ unsigned block, unsigned offset)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned csum_granularity = 1 << v->csum_granularity_bits;
+ unsigned end = buf->offset + buf->size;
+ unsigned len = min(csum_granularity, end - offset);
+
+ BUG_ON(offset >= end);
+ BUG_ON(offset < buf->offset);
+ BUG_ON(offset & (csum_granularity - 1));
+ BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
+ (len & (csum_granularity - 1)));
+
+ return bch2_checksum(NULL, v->csum_type,
+ null_nonce(),
+ buf->data[block] + ((offset - buf->offset) << 9),
+ len << 9);
+}
+
+static void ec_generate_checksums(struct ec_stripe_buf *buf)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned i, j, csums_per_device = stripe_csums_per_device(v);
+
+ if (!v->csum_type)
+ return;
+
+ BUG_ON(buf->offset);
+ BUG_ON(buf->size != le16_to_cpu(v->sectors));
+
+ for (i = 0; i < v->nr_blocks; i++)
+ for (j = 0; j < csums_per_device; j++)
+ stripe_csum_set(v, i, j,
+ ec_block_checksum(buf, i, j << v->csum_granularity_bits));
+}
+
+static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned csum_granularity = 1 << v->csum_granularity_bits;
+ unsigned i;
+
+ if (!v->csum_type)
+ return;
+
+ for (i = 0; i < v->nr_blocks; i++) {
+ unsigned offset = buf->offset;
+ unsigned end = buf->offset + buf->size;
+
+ if (!test_bit(i, buf->valid))
+ continue;
+
+ while (offset < end) {
+ unsigned j = offset >> v->csum_granularity_bits;
+ unsigned len = min(csum_granularity, end - offset);
+ struct bch_csum want = stripe_csum_get(v, i, j);
+ struct bch_csum got = ec_block_checksum(buf, i, offset);
+
+ if (bch2_crc_cmp(want, got)) {
+ struct printbuf err = PRINTBUF;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);
+
+ prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n",
+ want.hi, want.lo,
+ got.hi, got.lo,
+ bch2_csum_types[v->csum_type]);
+ prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
+ bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
+ bch_err_ratelimited(ca, "%s", err.buf);
+ printbuf_exit(&err);
+
+ clear_bit(i, buf->valid);
+
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ break;
+ }
+
+ offset += len;
+ }
+ }
+}
+
+/* Erasure coding: */
+
+static void ec_generate_ec(struct ec_stripe_buf *buf)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned nr_data = v->nr_blocks - v->nr_redundant;
+ unsigned bytes = le16_to_cpu(v->sectors) << 9;
+
+ raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
+}
+
+static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+
+ return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks);
+}
+
+static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
+ unsigned nr_data = v->nr_blocks - v->nr_redundant;
+ unsigned bytes = buf->size << 9;
+
+ if (ec_nr_failed(buf) > v->nr_redundant) {
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: unable to read enough blocks");
+ return -1;
+ }
+
+ for (i = 0; i < nr_data; i++)
+ if (!test_bit(i, buf->valid))
+ failed[nr_failed++] = i;
+
+ raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
+ return 0;
+}
+
+/* IO: */
+
+static void ec_block_endio(struct bio *bio)
+{
+ struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
+ struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v;
+ struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
+ struct bch_dev *ca = ec_bio->ca;
+ struct closure *cl = bio->bi_private;
+
+ if (bch2_dev_io_err_on(bio->bi_status, ca,
+ bio_data_dir(bio)
+ ? BCH_MEMBER_ERROR_write
+ : BCH_MEMBER_ERROR_read,
+ "erasure coding %s error: %s",
+ bio_data_dir(bio) ? "write" : "read",
+ bch2_blk_status_to_str(bio->bi_status)))
+ clear_bit(ec_bio->idx, ec_bio->buf->valid);
+
+ if (ptr_stale(ca, ptr)) {
+ bch_err_ratelimited(ca->fs,
+ "error %s stripe: stale pointer after io",
+ bio_data_dir(bio) == READ ? "reading from" : "writing to");
+ clear_bit(ec_bio->idx, ec_bio->buf->valid);
+ }
+
+ bio_put(&ec_bio->bio);
+ percpu_ref_put(&ca->io_ref);
+ closure_put(cl);
+}
+
+static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
+ blk_opf_t opf, unsigned idx, struct closure *cl)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
+ unsigned offset = 0, bytes = buf->size << 9;
+ struct bch_extent_ptr *ptr = &v->ptrs[idx];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
+ ? BCH_DATA_user
+ : BCH_DATA_parity;
+ int rw = op_is_write(opf);
+
+ if (ptr_stale(ca, ptr)) {
+ bch_err_ratelimited(c,
+ "error %s stripe: stale pointer",
+ rw == READ ? "reading from" : "writing to");
+ clear_bit(idx, buf->valid);
+ return;
+ }
+
+ if (!bch2_dev_get_ioref(ca, rw)) {
+ clear_bit(idx, buf->valid);
+ return;
+ }
+
+ this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
+
+ while (offset < bytes) {
+ unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
+ DIV_ROUND_UP(bytes, PAGE_SIZE));
+ unsigned b = min_t(size_t, bytes - offset,
+ nr_iovecs << PAGE_SHIFT);
+ struct ec_bio *ec_bio;
+
+ ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
+ nr_iovecs,
+ opf,
+ GFP_KERNEL,
+ &c->ec_bioset),
+ struct ec_bio, bio);
+
+ ec_bio->ca = ca;
+ ec_bio->buf = buf;
+ ec_bio->idx = idx;
+
+ ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
+ ec_bio->bio.bi_end_io = ec_block_endio;
+ ec_bio->bio.bi_private = cl;
+
+ bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
+
+ closure_get(cl);
+ percpu_ref_get(&ca->io_ref);
+
+ submit_bio(&ec_bio->bio);
+
+ offset += b;
+ }
+
+ percpu_ref_put(&ca->io_ref);
+}
+
+static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
+ struct ec_stripe_buf *stripe)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
+ POS(0, idx), BTREE_ITER_SLOTS);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+ if (k.k->type != KEY_TYPE_stripe) {
+ ret = -ENOENT;
+ goto err;
+ }
+ bkey_reassemble(&stripe->key, k);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/* recovery read path: */
+int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
+{
+ struct bch_fs *c = trans->c;
+ struct ec_stripe_buf *buf;
+ struct closure cl;
+ struct bch_stripe *v;
+ unsigned i, offset;
+ int ret = 0;
+
+ closure_init_stack(&cl);
+
+ BUG_ON(!rbio->pick.has_ec);
+
+ buf = kzalloc(sizeof(*buf), GFP_NOFS);
+ if (!buf)
+ return -BCH_ERR_ENOMEM_ec_read_extent;
+
+ ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
+ if (ret) {
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: error %i looking up stripe", ret);
+ kfree(buf);
+ return -EIO;
+ }
+
+ v = &bkey_i_to_stripe(&buf->key)->v;
+
+ if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: pointer doesn't match stripe");
+ ret = -EIO;
+ goto err;
+ }
+
+ offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
+ if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: read is bigger than stripe");
+ ret = -EIO;
+ goto err;
+ }
+
+ ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
+ if (ret)
+ goto err;
+
+ for (i = 0; i < v->nr_blocks; i++)
+ ec_block_io(c, buf, REQ_OP_READ, i, &cl);
+
+ closure_sync(&cl);
+
+ if (ec_nr_failed(buf) > v->nr_redundant) {
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: unable to read enough blocks");
+ ret = -EIO;
+ goto err;
+ }
+
+ ec_validate_checksums(c, buf);
+
+ ret = ec_do_recov(c, buf);
+ if (ret)
+ goto err;
+
+ memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
+ buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
+err:
+ ec_stripe_buf_exit(buf);
+ kfree(buf);
+ return ret;
+}
+
+/* stripe bucket accounting: */
+
+static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
+{
+ ec_stripes_heap n, *h = &c->ec_stripes_heap;
+
+ if (idx >= h->size) {
+ if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
+ return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
+
+ mutex_lock(&c->ec_stripes_heap_lock);
+ if (n.size > h->size) {
+ memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
+ n.used = h->used;
+ swap(*h, n);
+ }
+ mutex_unlock(&c->ec_stripes_heap_lock);
+
+ free_heap(&n);
+ }
+
+ if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
+ return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
+
+ if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
+ !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
+ return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
+
+ return 0;
+}
+
+static int ec_stripe_mem_alloc(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ return allocate_dropping_locks_errcode(trans,
+ __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
+}
+
+/*
+ * Hash table of open stripes:
+ * Stripes that are being created or modified are kept in a hash table, so that
+ * stripe deletion can skip them.
+ */
+
+static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
+{
+ unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
+ struct ec_stripe_new *s;
+
+ hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
+ if (s->idx == idx)
+ return true;
+ return false;
+}
+
+static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
+{
+ bool ret = false;
+
+ spin_lock(&c->ec_stripes_new_lock);
+ ret = __bch2_stripe_is_open(c, idx);
+ spin_unlock(&c->ec_stripes_new_lock);
+
+ return ret;
+}
+
+static bool bch2_try_open_stripe(struct bch_fs *c,
+ struct ec_stripe_new *s,
+ u64 idx)
+{
+ bool ret;
+
+ spin_lock(&c->ec_stripes_new_lock);
+ ret = !__bch2_stripe_is_open(c, idx);
+ if (ret) {
+ unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
+
+ s->idx = idx;
+ hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
+ }
+ spin_unlock(&c->ec_stripes_new_lock);
+
+ return ret;
+}
+
+static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
+{
+ BUG_ON(!s->idx);
+
+ spin_lock(&c->ec_stripes_new_lock);
+ hlist_del_init(&s->hash);
+ spin_unlock(&c->ec_stripes_new_lock);
+
+ s->idx = 0;
+}
+
+/* Heap of all existing stripes, ordered by blocks_nonempty */
+
+static u64 stripe_idx_to_delete(struct bch_fs *c)
+{
+ ec_stripes_heap *h = &c->ec_stripes_heap;
+
+ lockdep_assert_held(&c->ec_stripes_heap_lock);
+
+ if (h->used &&
+ h->data[0].blocks_nonempty == 0 &&
+ !bch2_stripe_is_open(c, h->data[0].idx))
+ return h->data[0].idx;
+
+ return 0;
+}
+
+static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
+ struct ec_stripe_heap_entry l,
+ struct ec_stripe_heap_entry r)
+{
+ return ((l.blocks_nonempty > r.blocks_nonempty) -
+ (l.blocks_nonempty < r.blocks_nonempty));
+}
+
+static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
+ size_t i)
+{
+ struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
+
+ genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
+}
+
+static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
+{
+ ec_stripes_heap *h = &c->ec_stripes_heap;
+ struct stripe *m = genradix_ptr(&c->stripes, idx);
+
+ BUG_ON(m->heap_idx >= h->used);
+ BUG_ON(h->data[m->heap_idx].idx != idx);
+}
+
+void bch2_stripes_heap_del(struct bch_fs *c,
+ struct stripe *m, size_t idx)
+{
+ mutex_lock(&c->ec_stripes_heap_lock);
+ heap_verify_backpointer(c, idx);
+
+ heap_del(&c->ec_stripes_heap, m->heap_idx,
+ ec_stripes_heap_cmp,
+ ec_stripes_heap_set_backpointer);
+ mutex_unlock(&c->ec_stripes_heap_lock);
+}
+
+void bch2_stripes_heap_insert(struct bch_fs *c,
+ struct stripe *m, size_t idx)
+{
+ mutex_lock(&c->ec_stripes_heap_lock);
+ BUG_ON(heap_full(&c->ec_stripes_heap));
+
+ heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
+ .idx = idx,
+ .blocks_nonempty = m->blocks_nonempty,
+ }),
+ ec_stripes_heap_cmp,
+ ec_stripes_heap_set_backpointer);
+
+ heap_verify_backpointer(c, idx);
+ mutex_unlock(&c->ec_stripes_heap_lock);
+}
+
+void bch2_stripes_heap_update(struct bch_fs *c,
+ struct stripe *m, size_t idx)
+{
+ ec_stripes_heap *h = &c->ec_stripes_heap;
+ bool do_deletes;
+ size_t i;
+
+ mutex_lock(&c->ec_stripes_heap_lock);
+ heap_verify_backpointer(c, idx);
+
+ h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
+
+ i = m->heap_idx;
+ heap_sift_up(h, i, ec_stripes_heap_cmp,
+ ec_stripes_heap_set_backpointer);
+ heap_sift_down(h, i, ec_stripes_heap_cmp,
+ ec_stripes_heap_set_backpointer);
+
+ heap_verify_backpointer(c, idx);
+
+ do_deletes = stripe_idx_to_delete(c) != 0;
+ mutex_unlock(&c->ec_stripes_heap_lock);
+
+ if (do_deletes)
+ bch2_do_stripe_deletes(c);
+}
+
+/* stripe deletion */
+
+static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_stripe s;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
+ BTREE_ITER_INTENT);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_stripe) {
+ bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ s = bkey_s_c_to_stripe(k);
+ for (unsigned i = 0; i < s.v->nr_blocks; i++)
+ if (stripe_blockcount_get(s.v, i)) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
+ printbuf_exit(&buf);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static void ec_stripe_delete_work(struct work_struct *work)
+{
+ struct bch_fs *c =
+ container_of(work, struct bch_fs, ec_stripe_delete_work);
+ struct btree_trans *trans = bch2_trans_get(c);
+ int ret;
+ u64 idx;
+
+ while (1) {
+ mutex_lock(&c->ec_stripes_heap_lock);
+ idx = stripe_idx_to_delete(c);
+ mutex_unlock(&c->ec_stripes_heap_lock);
+
+ if (!idx)
+ break;
+
+ ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ec_stripe_delete(trans, idx));
+ if (ret) {
+ bch_err_fn(c, ret);
+ break;
+ }
+ }
+
+ bch2_trans_put(trans);
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
+}
+
+void bch2_do_stripe_deletes(struct bch_fs *c)
+{
+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
+ !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
+}
+
+/* stripe creation: */
+
+static int ec_stripe_key_update(struct btree_trans *trans,
+ struct bkey_i_stripe *new,
+ bool create)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
+ new->k.p, BTREE_ITER_INTENT);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) {
+ bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s",
+ create ? "creating" : "updating",
+ bch2_bkey_types[k.k->type]);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (k.k->type == KEY_TYPE_stripe) {
+ const struct bch_stripe *old = bkey_s_c_to_stripe(k).v;
+ unsigned i;
+
+ if (old->nr_blocks != new->v.nr_blocks) {
+ bch_err(c, "error updating stripe: nr_blocks does not match");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ for (i = 0; i < new->v.nr_blocks; i++) {
+ unsigned v = stripe_blockcount_get(old, i);
+
+ BUG_ON(v &&
+ (old->ptrs[i].dev != new->v.ptrs[i].dev ||
+ old->ptrs[i].gen != new->v.ptrs[i].gen ||
+ old->ptrs[i].offset != new->v.ptrs[i].offset));
+
+ stripe_blockcount_set(&new->v, i, v);
+ }
+ }
+
+ ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int ec_stripe_update_extent(struct btree_trans *trans,
+ struct bpos bucket, u8 gen,
+ struct ec_stripe_buf *s,
+ struct bpos *bp_pos)
+{
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
+ struct bch_fs *c = trans->c;
+ struct bch_backpointer bp;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ const struct bch_extent_ptr *ptr_c;
+ struct bch_extent_ptr *ptr, *ec_ptr = NULL;
+ struct bch_extent_stripe_ptr stripe_ptr;
+ struct bkey_i *n;
+ int ret, dev, block;
+
+ ret = bch2_get_next_backpointer(trans, bucket, gen,
+ bp_pos, &bp, BTREE_ITER_CACHED);
+ if (ret)
+ return ret;
+ if (bpos_eq(*bp_pos, SPOS_MAX))
+ return 0;
+
+ if (bp.level) {
+ struct printbuf buf = PRINTBUF;
+ struct btree_iter node_iter;
+ struct btree *b;
+
+ b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp);
+ bch2_trans_iter_exit(trans, &node_iter);
+
+ if (!b)
+ return 0;
+
+ prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
+ bch2_backpointer_to_text(&buf, &bp);
+
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return -EIO;
+ }
+
+ k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+ if (!k.k) {
+ /*
+ * extent no longer exists - we could flush the btree
+ * write buffer and retry to verify, but no need:
+ */
+ return 0;
+ }
+
+ if (extent_has_stripe_ptr(k, s->key.k.p.offset))
+ goto out;
+
+ ptr_c = bkey_matches_stripe(v, k, &block);
+ /*
+ * It doesn't generally make sense to erasure code cached ptrs:
+ * XXX: should we be incrementing a counter?
+ */
+ if (!ptr_c || ptr_c->cached)
+ goto out;
+
+ dev = v->ptrs[block].dev;
+
+ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto out;
+
+ bkey_reassemble(n, k);
+
+ bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
+ ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
+ BUG_ON(!ec_ptr);
+
+ stripe_ptr = (struct bch_extent_stripe_ptr) {
+ .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
+ .block = block,
+ .redundancy = v->nr_redundant,
+ .idx = s->key.k.p.offset,
+ };
+
+ __extent_entry_insert(n,
+ (union bch_extent_entry *) ec_ptr,
+ (union bch_extent_entry *) &stripe_ptr);
+
+ ret = bch2_trans_update(trans, &iter, n, 0);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
+ unsigned block)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
+ struct bch_extent_ptr bucket = v->ptrs[block];
+ struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket);
+ struct bpos bp_pos = POS_MIN;
+ int ret = 0;
+
+ while (1) {
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL,
+ ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
+ s, &bp_pos));
+ if (ret)
+ break;
+ if (bkey_eq(bp_pos, POS_MAX))
+ break;
+
+ bp_pos = bpos_nosnap_successor(bp_pos);
+ }
+
+ return ret;
+}
+
+static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
+ unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
+ int ret = 0;
+
+ ret = bch2_btree_write_buffer_flush(trans);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < nr_data; i++) {
+ ret = ec_stripe_update_bucket(trans, s, i);
+ if (ret)
+ break;
+ }
+err:
+ bch2_trans_put(trans);
+
+ return ret;
+}
+
+static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
+ struct ec_stripe_new *s,
+ unsigned block,
+ struct open_bucket *ob)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ unsigned offset = ca->mi.bucket_size - ob->sectors_free;
+ int ret;
+
+ if (!bch2_dev_get_ioref(ca, WRITE)) {
+ s->err = -BCH_ERR_erofs_no_writes;
+ return;
+ }
+
+ memset(s->new_stripe.data[block] + (offset << 9),
+ 0,
+ ob->sectors_free << 9);
+
+ ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
+ ob->bucket * ca->mi.bucket_size + offset,
+ ob->sectors_free,
+ GFP_KERNEL, 0);
+
+ percpu_ref_put(&ca->io_ref);
+
+ if (ret)
+ s->err = ret;
+}
+
+void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
+{
+ if (s->idx)
+ bch2_stripe_close(c, s);
+ kfree(s);
+}
+
+/*
+ * data buckets of new stripe all written: create the stripe
+ */
+static void ec_stripe_create(struct ec_stripe_new *s)
+{
+ struct bch_fs *c = s->c;
+ struct open_bucket *ob;
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
+ unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
+ int ret;
+
+ BUG_ON(s->h->s == s);
+
+ closure_sync(&s->iodone);
+
+ if (!s->err) {
+ for (i = 0; i < nr_data; i++)
+ if (s->blocks[i]) {
+ ob = c->open_buckets + s->blocks[i];
+
+ if (ob->sectors_free)
+ zero_out_rest_of_ec_bucket(c, s, i, ob);
+ }
+ }
+
+ if (s->err) {
+ if (!bch2_err_matches(s->err, EROFS))
+ bch_err(c, "error creating stripe: error writing data buckets");
+ goto err;
+ }
+
+ if (s->have_existing_stripe) {
+ ec_validate_checksums(c, &s->existing_stripe);
+
+ if (ec_do_recov(c, &s->existing_stripe)) {
+ bch_err(c, "error creating stripe: error reading existing stripe");
+ goto err;
+ }
+
+ for (i = 0; i < nr_data; i++)
+ if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i))
+ swap(s->new_stripe.data[i],
+ s->existing_stripe.data[i]);
+
+ ec_stripe_buf_exit(&s->existing_stripe);
+ }
+
+ BUG_ON(!s->allocated);
+ BUG_ON(!s->idx);
+
+ ec_generate_ec(&s->new_stripe);
+
+ ec_generate_checksums(&s->new_stripe);
+
+ /* write p/q: */
+ for (i = nr_data; i < v->nr_blocks; i++)
+ ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
+ closure_sync(&s->iodone);
+
+ if (ec_nr_failed(&s->new_stripe)) {
+ bch_err(c, "error creating stripe: error writing redundancy buckets");
+ goto err;
+ }
+
+ ret = bch2_trans_do(c, &s->res, NULL,
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL,
+ ec_stripe_key_update(trans,
+ bkey_i_to_stripe(&s->new_stripe.key),
+ !s->have_existing_stripe));
+ if (ret) {
+ bch_err(c, "error creating stripe: error creating stripe key");
+ goto err;
+ }
+
+ ret = ec_stripe_update_extents(c, &s->new_stripe);
+ if (ret) {
+ bch_err_msg(c, ret, "creating stripe: error updating pointers");
+ goto err;
+ }
+err:
+ bch2_disk_reservation_put(c, &s->res);
+
+ for (i = 0; i < v->nr_blocks; i++)
+ if (s->blocks[i]) {
+ ob = c->open_buckets + s->blocks[i];
+
+ if (i < nr_data) {
+ ob->ec = NULL;
+ __bch2_open_bucket_put(c, ob);
+ } else {
+ bch2_open_bucket_put(c, ob);
+ }
+ }
+
+ mutex_lock(&c->ec_stripe_new_lock);
+ list_del(&s->list);
+ mutex_unlock(&c->ec_stripe_new_lock);
+ wake_up(&c->ec_stripe_new_wait);
+
+ ec_stripe_buf_exit(&s->existing_stripe);
+ ec_stripe_buf_exit(&s->new_stripe);
+ closure_debug_destroy(&s->iodone);
+
+ ec_stripe_new_put(c, s, STRIPE_REF_stripe);
+}
+
+static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
+{
+ struct ec_stripe_new *s;
+
+ mutex_lock(&c->ec_stripe_new_lock);
+ list_for_each_entry(s, &c->ec_stripe_new_list, list)
+ if (!atomic_read(&s->ref[STRIPE_REF_io]))
+ goto out;
+ s = NULL;
+out:
+ mutex_unlock(&c->ec_stripe_new_lock);
+
+ return s;
+}
+
+static void ec_stripe_create_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work,
+ struct bch_fs, ec_stripe_create_work);
+ struct ec_stripe_new *s;
+
+ while ((s = get_pending_stripe(c)))
+ ec_stripe_create(s);
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
+}
+
+void bch2_ec_do_stripe_creates(struct bch_fs *c)
+{
+ bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);
+
+ if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
+}
+
+static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
+{
+ struct ec_stripe_new *s = h->s;
+
+ BUG_ON(!s->allocated && !s->err);
+
+ h->s = NULL;
+ s->pending = true;
+
+ mutex_lock(&c->ec_stripe_new_lock);
+ list_add(&s->list, &c->ec_stripe_new_list);
+ mutex_unlock(&c->ec_stripe_new_lock);
+
+ ec_stripe_new_put(c, s, STRIPE_REF_io);
+}
+
+void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
+{
+ struct ec_stripe_new *s = ob->ec;
+
+ s->err = -EIO;
+}
+
+void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
+{
+ struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
+ struct bch_dev *ca;
+ unsigned offset;
+
+ if (!ob)
+ return NULL;
+
+ BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);
+
+ ca = bch_dev_bkey_exists(c, ob->dev);
+ offset = ca->mi.bucket_size - ob->sectors_free;
+
+ return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
+}
+
+static int unsigned_cmp(const void *_l, const void *_r)
+{
+ unsigned l = *((const unsigned *) _l);
+ unsigned r = *((const unsigned *) _r);
+
+ return cmp_int(l, r);
+}
+
+/* pick most common bucket size: */
+static unsigned pick_blocksize(struct bch_fs *c,
+ struct bch_devs_mask *devs)
+{
+ struct bch_dev *ca;
+ unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX];
+ struct {
+ unsigned nr, size;
+ } cur = { 0, 0 }, best = { 0, 0 };
+
+ for_each_member_device_rcu(ca, c, i, devs)
+ sizes[nr++] = ca->mi.bucket_size;
+
+ sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
+
+ for (i = 0; i < nr; i++) {
+ if (sizes[i] != cur.size) {
+ if (cur.nr > best.nr)
+ best = cur;
+
+ cur.nr = 0;
+ cur.size = sizes[i];
+ }
+
+ cur.nr++;
+ }
+
+ if (cur.nr > best.nr)
+ best = cur;
+
+ return best.size;
+}
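+
+/*
+ * e.g. with member bucket sizes { 512, 1024, 1024, 2048 }, pick_blocksize()
+ * returns 1024, the size shared by the most devices.
+ */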
+
+static bool may_create_new_stripe(struct bch_fs *c)
+{
+ return false;
+}
+
+static void ec_stripe_key_init(struct bch_fs *c,
+ struct bkey_i *k,
+ unsigned nr_data,
+ unsigned nr_parity,
+ unsigned stripe_size)
+{
+ struct bkey_i_stripe *s = bkey_stripe_init(k);
+ unsigned u64s;
+
+ s->v.sectors = cpu_to_le16(stripe_size);
+ s->v.algorithm = 0;
+ s->v.nr_blocks = nr_data + nr_parity;
+ s->v.nr_redundant = nr_parity;
+ s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9);
+ s->v.csum_type = BCH_CSUM_crc32c;
+ s->v.pad = 0;
+
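+	/*
+	 * If the encoded stripe value would be too big for a bkey, coarsen the
+	 * checksum granularity - each increment roughly halves the number of
+	 * per-device checksums - until it fits:
+	 */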
+ while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
+ BUG_ON(1 << s->v.csum_granularity_bits >=
+ le16_to_cpu(s->v.sectors) ||
+ s->v.csum_granularity_bits == U8_MAX);
+ s->v.csum_granularity_bits++;
+ }
+
+ set_bkey_val_u64s(&s->k, u64s);
+}
+
+static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
+{
+ struct ec_stripe_new *s;
+
+ lockdep_assert_held(&h->lock);
+
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
+
+ mutex_init(&s->lock);
+ closure_init(&s->iodone, NULL);
+ atomic_set(&s->ref[STRIPE_REF_stripe], 1);
+ atomic_set(&s->ref[STRIPE_REF_io], 1);
+ s->c = c;
+ s->h = h;
+ s->nr_data = min_t(unsigned, h->nr_active_devs,
+ BCH_BKEY_PTRS_MAX) - h->redundancy;
+ s->nr_parity = h->redundancy;
+
+ ec_stripe_key_init(c, &s->new_stripe.key,
+ s->nr_data, s->nr_parity, h->blocksize);
+
+ h->s = s;
+ return 0;
+}
+
+static struct ec_stripe_head *
+ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
+ unsigned algo, unsigned redundancy,
+ enum bch_watermark watermark)
+{
+ struct ec_stripe_head *h;
+ struct bch_dev *ca;
+ unsigned i;
+
+ h = kzalloc(sizeof(*h), GFP_KERNEL);
+ if (!h)
+ return NULL;
+
+ mutex_init(&h->lock);
+ BUG_ON(!mutex_trylock(&h->lock));
+
+ h->target = target;
+ h->algo = algo;
+ h->redundancy = redundancy;
+ h->watermark = watermark;
+
+ rcu_read_lock();
+ h->devs = target_rw_devs(c, BCH_DATA_user, target);
+
+ for_each_member_device_rcu(ca, c, i, &h->devs)
+ if (!ca->mi.durability)
+ __clear_bit(i, h->devs.d);
+
+ h->blocksize = pick_blocksize(c, &h->devs);
+
+ for_each_member_device_rcu(ca, c, i, &h->devs)
+ if (ca->mi.bucket_size == h->blocksize)
+ h->nr_active_devs++;
+
+ rcu_read_unlock();
+
+ /*
+ * If we only have redundancy + 1 devices, we're better off with just
+ * replication:
+ */
+ if (h->nr_active_devs < h->redundancy + 2)
+ bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
+ h->nr_active_devs, h->redundancy + 2);
+
+ list_add(&h->list, &c->ec_stripe_head_list);
+ return h;
+}
+
+void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
+{
+ if (h->s &&
+ h->s->allocated &&
+ bitmap_weight(h->s->blocks_allocated,
+ h->s->nr_data) == h->s->nr_data)
+ ec_stripe_set_pending(c, h);
+
+ mutex_unlock(&h->lock);
+}
+
+static struct ec_stripe_head *
+__bch2_ec_stripe_head_get(struct btree_trans *trans,
+ unsigned target,
+ unsigned algo,
+ unsigned redundancy,
+ enum bch_watermark watermark)
+{
+ struct bch_fs *c = trans->c;
+ struct ec_stripe_head *h;
+ int ret;
+
+ if (!redundancy)
+ return NULL;
+
+ ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (test_bit(BCH_FS_GOING_RO, &c->flags)) {
+ h = ERR_PTR(-BCH_ERR_erofs_no_writes);
+ goto found;
+ }
+
+ list_for_each_entry(h, &c->ec_stripe_head_list, list)
+ if (h->target == target &&
+ h->algo == algo &&
+ h->redundancy == redundancy &&
+ h->watermark == watermark) {
+ ret = bch2_trans_mutex_lock(trans, &h->lock);
+ if (ret)
+ h = ERR_PTR(ret);
+ goto found;
+ }
+
+ h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
+found:
+ if (!IS_ERR_OR_NULL(h) &&
+ h->nr_active_devs < h->redundancy + 2) {
+ mutex_unlock(&h->lock);
+ h = NULL;
+ }
+ mutex_unlock(&c->ec_stripe_head_lock);
+ return h;
+}
+
+static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
+ enum bch_watermark watermark, struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_devs_mask devs = h->devs;
+ struct open_bucket *ob;
+ struct open_buckets buckets;
+ struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
+ unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
+ bool have_cache = true;
+ int ret = 0;
+
+ BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity);
+ BUG_ON(v->nr_redundant != h->s->nr_parity);
+
+ for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
+ __clear_bit(v->ptrs[i].dev, devs.d);
+ if (i < h->s->nr_data)
+ nr_have_data++;
+ else
+ nr_have_parity++;
+ }
+
+ BUG_ON(nr_have_data > h->s->nr_data);
+ BUG_ON(nr_have_parity > h->s->nr_parity);
+
+ buckets.nr = 0;
+ if (nr_have_parity < h->s->nr_parity) {
+ ret = bch2_bucket_alloc_set_trans(trans, &buckets,
+ &h->parity_stripe,
+ &devs,
+ h->s->nr_parity,
+ &nr_have_parity,
+ &have_cache, 0,
+ BCH_DATA_parity,
+ watermark,
+ cl);
+
+ open_bucket_for_each(c, &buckets, ob, i) {
+ j = find_next_zero_bit(h->s->blocks_gotten,
+ h->s->nr_data + h->s->nr_parity,
+ h->s->nr_data);
+ BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
+
+ h->s->blocks[j] = buckets.v[i];
+ v->ptrs[j] = bch2_ob_ptr(c, ob);
+ __set_bit(j, h->s->blocks_gotten);
+ }
+
+ if (ret)
+ return ret;
+ }
+
+ buckets.nr = 0;
+ if (nr_have_data < h->s->nr_data) {
+ ret = bch2_bucket_alloc_set_trans(trans, &buckets,
+ &h->block_stripe,
+ &devs,
+ h->s->nr_data,
+ &nr_have_data,
+ &have_cache, 0,
+ BCH_DATA_user,
+ watermark,
+ cl);
+
+ open_bucket_for_each(c, &buckets, ob, i) {
+ j = find_next_zero_bit(h->s->blocks_gotten,
+ h->s->nr_data, 0);
+ BUG_ON(j >= h->s->nr_data);
+
+ h->s->blocks[j] = buckets.v[i];
+ v->ptrs[j] = bch2_ob_ptr(c, ob);
+ __set_bit(j, h->s->blocks_gotten);
+ }
+
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* XXX: doesn't obey target: */
+static s64 get_existing_stripe(struct bch_fs *c,
+ struct ec_stripe_head *head)
+{
+ ec_stripes_heap *h = &c->ec_stripes_heap;
+ struct stripe *m;
+ size_t heap_idx;
+ u64 stripe_idx;
+ s64 ret = -1;
+
+ if (may_create_new_stripe(c))
+ return -1;
+
+ mutex_lock(&c->ec_stripes_heap_lock);
+ for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
+ /* No blocks worth reusing, stripe will just be deleted: */
+ if (!h->data[heap_idx].blocks_nonempty)
+ continue;
+
+ stripe_idx = h->data[heap_idx].idx;
+
+ m = genradix_ptr(&c->stripes, stripe_idx);
+
+ if (m->algorithm == head->algo &&
+ m->nr_redundant == head->redundancy &&
+ m->sectors == head->blocksize &&
+ m->blocks_nonempty < m->nr_blocks - m->nr_redundant &&
+ bch2_try_open_stripe(c, head->s, stripe_idx)) {
+ ret = stripe_idx;
+ break;
+ }
+ }
+ mutex_unlock(&c->ec_stripes_heap_lock);
+ return ret;
+}
+
+static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
+ struct bch_stripe *existing_v;
+ unsigned i;
+ s64 idx;
+ int ret;
+
+ /*
+ * If we can't allocate a new stripe, and there's no stripes with empty
+ * blocks for us to reuse, that means we have to wait on copygc:
+ */
+ idx = get_existing_stripe(c, h);
+ if (idx < 0)
+ return -BCH_ERR_stripe_alloc_blocked;
+
+ ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
+ if (ret) {
+ bch2_stripe_close(c, h->s);
+ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch2_fs_fatal_error(c, "error reading stripe key: %s", bch2_err_str(ret));
+ return ret;
+ }
+
+ existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v;
+
+ BUG_ON(existing_v->nr_redundant != h->s->nr_parity);
+ h->s->nr_data = existing_v->nr_blocks -
+ existing_v->nr_redundant;
+
+ ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize);
+ if (ret) {
+ bch2_stripe_close(c, h->s);
+ return ret;
+ }
+
+ BUG_ON(h->s->existing_stripe.size != h->blocksize);
+ BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
+
+ /*
+ * Free buckets we initially allocated - they might conflict with
+ * blocks from the stripe we're reusing:
+ */
+ for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) {
+ bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]);
+ h->s->blocks[i] = 0;
+ }
+ memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
+ memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));
+
+ for (i = 0; i < existing_v->nr_blocks; i++) {
+ if (stripe_blockcount_get(existing_v, i)) {
+ __set_bit(i, h->s->blocks_gotten);
+ __set_bit(i, h->s->blocks_allocated);
+ }
+
+ ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
+ }
+
+ bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key);
+ h->s->have_existing_stripe = true;
+
+ return 0;
+}
+
+static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bpos min_pos = POS(0, 1);
+ struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
+ int ret;
+
+ if (!h->s->res.sectors) {
+ ret = bch2_disk_reservation_get(c, &h->s->res,
+ h->blocksize,
+ h->s->nr_parity,
+ BCH_DISK_RESERVATION_NOFAIL);
+ if (ret)
+ return ret;
+ }
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+ if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
+ if (start_pos.offset) {
+ start_pos = min_pos;
+ bch2_btree_iter_set_pos(&iter, start_pos);
+ continue;
+ }
+
+ ret = -BCH_ERR_ENOSPC_stripe_create;
+ break;
+ }
+
+ if (bkey_deleted(k.k) &&
+ bch2_try_open_stripe(c, h->s, k.k->p.offset))
+ break;
+ }
+
+ c->ec_stripe_hint = iter.pos.offset;
+
+ if (ret)
+ goto err;
+
+ ret = ec_stripe_mem_alloc(trans, &iter);
+ if (ret) {
+ bch2_stripe_close(c, h->s);
+ goto err;
+ }
+
+ h->s->new_stripe.key.k.p = iter.pos;
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+err:
+ bch2_disk_reservation_put(c, &h->s->res);
+ goto out;
+}
+
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
+ unsigned target,
+ unsigned algo,
+ unsigned redundancy,
+ enum bch_watermark watermark,
+ struct closure *cl)
+{
+ struct bch_fs *c = trans->c;
+ struct ec_stripe_head *h;
+ bool waiting = false;
+ int ret;
+
+ h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
+ if (IS_ERR_OR_NULL(h))
+ return h;
+
+ if (!h->s) {
+ ret = ec_new_stripe_alloc(c, h);
+ if (ret) {
+ bch_err(c, "failed to allocate new stripe");
+ goto err;
+ }
+ }
+
+ if (h->s->allocated)
+ goto allocated;
+
+ if (h->s->have_existing_stripe)
+ goto alloc_existing;
+
+ /* First, try to allocate a full stripe: */
+ ret = new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?:
+ __bch2_ec_stripe_head_reserve(trans, h);
+ if (!ret)
+ goto allocate_buf;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret, ENOMEM))
+ goto err;
+
+ /*
+ * Not enough buckets available for a full stripe: we must reuse an
+ * existing stripe:
+ */
+ while (1) {
+ ret = __bch2_ec_stripe_head_reuse(trans, h);
+ if (!ret)
+ break;
+ if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
+ goto err;
+
+ if (watermark == BCH_WATERMARK_copygc) {
+ ret = new_stripe_alloc_buckets(trans, h, watermark, NULL) ?:
+ __bch2_ec_stripe_head_reserve(trans, h);
+ if (ret)
+ goto err;
+ goto allocate_buf;
+ }
+
+ /* XXX freelist_wait? */
+ closure_wait(&c->freelist_wait, cl);
+ waiting = true;
+ }
+
+ if (waiting)
+ closure_wake_up(&c->freelist_wait);
+alloc_existing:
+ /*
+ * Retry allocating buckets, with the watermark for this
+ * particular write:
+ */
+ ret = new_stripe_alloc_buckets(trans, h, watermark, cl);
+ if (ret)
+ goto err;
+
+allocate_buf:
+ ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize);
+ if (ret)
+ goto err;
+
+ h->s->allocated = true;
+allocated:
+ BUG_ON(!h->s->idx);
+ BUG_ON(!h->s->new_stripe.data[0]);
+ BUG_ON(trans->restarted);
+ return h;
+err:
+ bch2_ec_stripe_head_put(c, h);
+ return ERR_PTR(ret);
+}
+
+static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct ec_stripe_head *h;
+ struct open_bucket *ob;
+ unsigned i;
+
+ mutex_lock(&c->ec_stripe_head_lock);
+ list_for_each_entry(h, &c->ec_stripe_head_list, list) {
+ mutex_lock(&h->lock);
+ if (!h->s)
+ goto unlock;
+
+ if (!ca)
+ goto found;
+
+ for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
+ if (!h->s->blocks[i])
+ continue;
+
+ ob = c->open_buckets + h->s->blocks[i];
+ if (ob->dev == ca->dev_idx)
+ goto found;
+ }
+ goto unlock;
+found:
+ h->s->err = -BCH_ERR_erofs_no_writes;
+ ec_stripe_set_pending(c, h);
+unlock:
+ mutex_unlock(&h->lock);
+ }
+ mutex_unlock(&c->ec_stripe_head_lock);
+}
+
+void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
+{
+ __bch2_ec_stop(c, ca);
+}
+
+void bch2_fs_ec_stop(struct bch_fs *c)
+{
+ __bch2_ec_stop(c, NULL);
+}
+
+static bool bch2_fs_ec_flush_done(struct bch_fs *c)
+{
+ bool ret;
+
+ mutex_lock(&c->ec_stripe_new_lock);
+ ret = list_empty(&c->ec_stripe_new_list);
+ mutex_unlock(&c->ec_stripe_new_lock);
+
+ return ret;
+}
+
+void bch2_fs_ec_flush(struct bch_fs *c)
+{
+ wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
+}
+
+int bch2_stripes_read(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ const struct bch_stripe *s;
+ struct stripe *m;
+ unsigned i;
+ int ret;
+
+ for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ if (k.k->type != KEY_TYPE_stripe)
+ continue;
+
+ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
+ if (ret)
+ break;
+
+ s = bkey_s_c_to_stripe(k).v;
+
+ m = genradix_ptr(&c->stripes, k.k->p.offset);
+ m->sectors = le16_to_cpu(s->sectors);
+ m->algorithm = s->algorithm;
+ m->nr_blocks = s->nr_blocks;
+ m->nr_redundant = s->nr_redundant;
+ m->blocks_nonempty = 0;
+
+ for (i = 0; i < s->nr_blocks; i++)
+ m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+
+ bch2_stripes_heap_insert(c, m, k.k->p.offset);
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ bch2_trans_put(trans);
+
+ if (ret)
+ bch_err_fn(c, ret);
+
+ return ret;
+}
+
+void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ ec_stripes_heap *h = &c->ec_stripes_heap;
+ struct stripe *m;
+ size_t i;
+
+ mutex_lock(&c->ec_stripes_heap_lock);
+ for (i = 0; i < min_t(size_t, h->used, 50); i++) {
+ m = genradix_ptr(&c->stripes, h->data[i].idx);
+
+ prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
+ h->data[i].blocks_nonempty,
+ m->nr_blocks - m->nr_redundant,
+ m->nr_redundant);
+ if (bch2_stripe_is_open(c, h->data[i].idx))
+ prt_str(out, " open");
+ prt_newline(out);
+ }
+ mutex_unlock(&c->ec_stripes_heap_lock);
+}
+
+void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct ec_stripe_head *h;
+ struct ec_stripe_new *s;
+
+ mutex_lock(&c->ec_stripe_head_lock);
+ list_for_each_entry(h, &c->ec_stripe_head_list, list) {
+ prt_printf(out, "target %u algo %u redundancy %u %s:\n",
+ h->target, h->algo, h->redundancy,
+ bch2_watermarks[h->watermark]);
+
+ if (h->s)
+ prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n",
+ h->s->idx, h->s->nr_data, h->s->nr_parity,
+ bitmap_weight(h->s->blocks_allocated,
+ h->s->nr_data));
+ }
+ mutex_unlock(&c->ec_stripe_head_lock);
+
+ prt_printf(out, "in flight:\n");
+
+ mutex_lock(&c->ec_stripe_new_lock);
+ list_for_each_entry(s, &c->ec_stripe_new_list, list) {
+ prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n",
+ s->idx, s->nr_data, s->nr_parity,
+ atomic_read(&s->ref[STRIPE_REF_io]),
+ atomic_read(&s->ref[STRIPE_REF_stripe]),
+ bch2_watermarks[s->h->watermark]);
+ }
+ mutex_unlock(&c->ec_stripe_new_lock);
+}
+
+void bch2_fs_ec_exit(struct bch_fs *c)
+{
+ struct ec_stripe_head *h;
+ unsigned i;
+
+ while (1) {
+ mutex_lock(&c->ec_stripe_head_lock);
+ h = list_first_entry_or_null(&c->ec_stripe_head_list,
+ struct ec_stripe_head, list);
+ if (h)
+ list_del(&h->list);
+ mutex_unlock(&c->ec_stripe_head_lock);
+ if (!h)
+ break;
+
+ if (h->s) {
+ for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
+ BUG_ON(h->s->blocks[i]);
+
+ kfree(h->s);
+ }
+ kfree(h);
+ }
+
+ BUG_ON(!list_empty(&c->ec_stripe_new_list));
+
+ free_heap(&c->ec_stripes_heap);
+ genradix_free(&c->stripes);
+ bioset_exit(&c->ec_bioset);
+}
+
+void bch2_fs_ec_init_early(struct bch_fs *c)
+{
+ spin_lock_init(&c->ec_stripes_new_lock);
+ mutex_init(&c->ec_stripes_heap_lock);
+
+ INIT_LIST_HEAD(&c->ec_stripe_head_list);
+ mutex_init(&c->ec_stripe_head_lock);
+
+ INIT_LIST_HEAD(&c->ec_stripe_new_list);
+ mutex_init(&c->ec_stripe_new_lock);
+ init_waitqueue_head(&c->ec_stripe_new_wait);
+
+ INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
+ INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
+}
+
+int bch2_fs_ec_init(struct bch_fs *c)
+{
+ return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
+ BIOSET_NEED_BVECS);
+}
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
new file mode 100644
index 000000000000..7d0237c9819f
--- /dev/null
+++ b/fs/bcachefs/ec.h
@@ -0,0 +1,260 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EC_H
+#define _BCACHEFS_EC_H
+
+#include "ec_types.h"
+#include "buckets_types.h"
+#include "extents_types.h"
+
+enum bkey_invalid_flags;
+
+int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+
+#define bch2_bkey_ops_stripe ((struct bkey_ops) { \
+ .key_invalid = bch2_stripe_invalid, \
+ .val_to_text = bch2_stripe_to_text, \
+ .swab = bch2_ptr_swab, \
+ .trans_trigger = bch2_trans_mark_stripe, \
+ .atomic_trigger = bch2_mark_stripe, \
+ .min_val_size = 8, \
+})
+
+static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
+{
+ return DIV_ROUND_UP(le16_to_cpu(s->sectors),
+ 1 << s->csum_granularity_bits);
+}
+
+static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
+ unsigned dev, unsigned csum_idx)
+{
+ unsigned csum_bytes = bch_crc_bytes[s->csum_type];
+
+ return sizeof(struct bch_stripe) +
+ sizeof(struct bch_extent_ptr) * s->nr_blocks +
+ (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
+}
+
+static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s,
+ unsigned idx)
+{
+ return stripe_csum_offset(s, s->nr_blocks, 0) +
+ sizeof(u16) * idx;
+}
+
+static inline unsigned stripe_blockcount_get(const struct bch_stripe *s,
+ unsigned idx)
+{
+ return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx));
+}
+
+static inline void stripe_blockcount_set(struct bch_stripe *s,
+ unsigned idx, unsigned v)
+{
+ __le16 *p = (void *) s + stripe_blockcount_offset(s, idx);
+
+ *p = cpu_to_le16(v);
+}
+
+static inline unsigned stripe_val_u64s(const struct bch_stripe *s)
+{
+ return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
+ sizeof(u64));
+}
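+
+/*
+ * Rough sketch of the variable length stripe value layout the helpers above
+ * index into (assuming crc32c, i.e. 4 byte checksums, as set by
+ * ec_stripe_key_init()):
+ *
+ *	struct bch_stripe			fixed header
+ *	struct bch_extent_ptr ptrs[nr_blocks]	one bucket per block
+ *	csums[nr_blocks][csums_per_device]	4 bytes each
+ *	__le16 block_sectors[nr_blocks]		per block sector counts
+ *
+ * where csums_per_device = DIV_ROUND_UP(sectors, 1 << csum_granularity_bits).
+ */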
+
+static inline void *stripe_csum(struct bch_stripe *s,
+ unsigned block, unsigned csum_idx)
+{
+ EBUG_ON(block >= s->nr_blocks);
+ EBUG_ON(csum_idx >= stripe_csums_per_device(s));
+
+ return (void *) s + stripe_csum_offset(s, block, csum_idx);
+}
+
+static inline struct bch_csum stripe_csum_get(struct bch_stripe *s,
+ unsigned block, unsigned csum_idx)
+{
+ struct bch_csum csum = { 0 };
+
+ memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]);
+ return csum;
+}
+
+static inline void stripe_csum_set(struct bch_stripe *s,
+ unsigned block, unsigned csum_idx,
+ struct bch_csum csum)
+{
+ memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]);
+}
+
+static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr,
+ const struct bch_extent_ptr *data_ptr,
+ unsigned sectors)
+{
+ return data_ptr->dev == stripe_ptr->dev &&
+ data_ptr->gen == stripe_ptr->gen &&
+ data_ptr->offset >= stripe_ptr->offset &&
+ data_ptr->offset < stripe_ptr->offset + sectors;
+}
+
+static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s,
+ struct extent_ptr_decoded p)
+{
+ unsigned nr_data = s->nr_blocks - s->nr_redundant;
+
+ BUG_ON(!p.has_ec);
+
+ if (p.ec.block >= nr_data)
+ return false;
+
+ return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr,
+ le16_to_cpu(s->sectors));
+}
+
+static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m,
+ struct extent_ptr_decoded p)
+{
+ unsigned nr_data = m->nr_blocks - m->nr_redundant;
+
+ BUG_ON(!p.has_ec);
+
+ if (p.ec.block >= nr_data)
+ return false;
+
+ return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr,
+ m->sectors);
+}
+
+struct bch_read_bio;
+
+struct ec_stripe_buf {
+ /* might not be buffering the entire stripe: */
+ unsigned offset;
+ unsigned size;
+ unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
+
+ void *data[BCH_BKEY_PTRS_MAX];
+
+ __BKEY_PADDED(key, 255);
+};
+
+struct ec_stripe_head;
+
+enum ec_stripe_ref {
+ STRIPE_REF_io,
+ STRIPE_REF_stripe,
+ STRIPE_REF_NR
+};
+
+struct ec_stripe_new {
+ struct bch_fs *c;
+ struct ec_stripe_head *h;
+ struct mutex lock;
+ struct list_head list;
+
+ struct hlist_node hash;
+ u64 idx;
+
+ struct closure iodone;
+
+ atomic_t ref[STRIPE_REF_NR];
+
+ int err;
+
+ u8 nr_data;
+ u8 nr_parity;
+ bool allocated;
+ bool pending;
+ bool have_existing_stripe;
+
+ unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
+ unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
+ open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX];
+ struct disk_reservation res;
+
+ struct ec_stripe_buf new_stripe;
+ struct ec_stripe_buf existing_stripe;
+};
+
+struct ec_stripe_head {
+ struct list_head list;
+ struct mutex lock;
+
+ unsigned target;
+ unsigned algo;
+ unsigned redundancy;
+ enum bch_watermark watermark;
+
+ struct bch_devs_mask devs;
+ unsigned nr_active_devs;
+
+ unsigned blocksize;
+
+ struct dev_stripe_state block_stripe;
+ struct dev_stripe_state parity_stripe;
+
+ struct ec_stripe_new *s;
+};
+
+int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *);
+
+void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
+
+void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
+
+int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
+
+void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
+ unsigned, unsigned, unsigned,
+ enum bch_watermark, struct closure *);
+
+void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
+void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
+void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t);
+
+void bch2_do_stripe_deletes(struct bch_fs *);
+void bch2_ec_do_stripe_creates(struct bch_fs *);
+void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *);
+
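+/*
+ * Two reference counts guard an ec_stripe_new (see the helpers below): when
+ * the last STRIPE_REF_io ref is dropped, the pending stripe is queued for
+ * creation via bch2_ec_do_stripe_creates(); when the last STRIPE_REF_stripe
+ * ref is dropped, the ec_stripe_new itself is freed.
+ */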
+static inline void ec_stripe_new_get(struct ec_stripe_new *s,
+ enum ec_stripe_ref ref)
+{
+ atomic_inc(&s->ref[ref]);
+}
+
+static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
+ enum ec_stripe_ref ref)
+{
+ BUG_ON(atomic_read(&s->ref[ref]) <= 0);
+
+ if (atomic_dec_and_test(&s->ref[ref]))
+ switch (ref) {
+ case STRIPE_REF_stripe:
+ bch2_ec_stripe_new_free(c, s);
+ break;
+ case STRIPE_REF_io:
+ bch2_ec_do_stripe_creates(c);
+ break;
+ default:
+ BUG();
+ }
+}
+
+void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
+void bch2_fs_ec_stop(struct bch_fs *);
+void bch2_fs_ec_flush(struct bch_fs *);
+
+int bch2_stripes_read(struct bch_fs *);
+
+void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *);
+void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_fs_ec_exit(struct bch_fs *);
+void bch2_fs_ec_init_early(struct bch_fs *);
+int bch2_fs_ec_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_EC_H */
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
new file mode 100644
index 000000000000..e2b02a82de32
--- /dev/null
+++ b/fs/bcachefs/ec_types.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EC_TYPES_H
+#define _BCACHEFS_EC_TYPES_H
+
+#include "bcachefs_format.h"
+
+struct bch_replicas_padded {
+ struct bch_replicas_entry e;
+ u8 pad[BCH_BKEY_PTRS_MAX];
+};
+
+struct stripe {
+ size_t heap_idx;
+ u16 sectors;
+ u8 algorithm;
+ u8 nr_blocks;
+ u8 nr_redundant;
+ u8 blocks_nonempty;
+};
+
+struct gc_stripe {
+ u16 sectors;
+
+ u8 nr_blocks;
+ u8 nr_redundant;
+
+ unsigned alive:1; /* does a corresponding key exist in stripes btree? */
+ u16 block_sectors[BCH_BKEY_PTRS_MAX];
+ struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX];
+
+ struct bch_replicas_padded r;
+};
+
+struct ec_stripe_heap_entry {
+ size_t idx;
+ unsigned blocks_nonempty;
+};
+
+typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap;
+
+#endif /* _BCACHEFS_EC_TYPES_H */
diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
new file mode 100644
index 000000000000..d260ff9bbfeb
--- /dev/null
+++ b/fs/bcachefs/errcode.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "errcode.h"
+
+#include <linux/errname.h>
+
+static const char * const bch2_errcode_strs[] = {
+#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err,
+ BCH_ERRCODES()
+#undef x
+ NULL
+};
+
+static unsigned bch2_errcode_parents[] = {
+#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
+ BCH_ERRCODES()
+#undef x
+};
+
+const char *bch2_err_str(int err)
+{
+ const char *errstr;
+
+ err = abs(err);
+
+ BUG_ON(err >= BCH_ERR_MAX);
+
+ if (err >= BCH_ERR_START)
+ errstr = bch2_errcode_strs[err - BCH_ERR_START];
+ else if (err)
+ errstr = errname(err);
+ else
+ errstr = "(No error)";
+ return errstr ?: "(Invalid error)";
+}
+
+bool __bch2_err_matches(int err, int class)
+{
+ err = abs(err);
+ class = abs(class);
+
+ BUG_ON(err >= BCH_ERR_MAX);
+ BUG_ON(class >= BCH_ERR_MAX);
+
+ while (err >= BCH_ERR_START && err != class)
+ err = bch2_errcode_parents[err - BCH_ERR_START];
+
+ return err == class;
+}
+
+int __bch2_err_class(int err)
+{
+ err = -err;
+ BUG_ON((unsigned) err >= BCH_ERR_MAX);
+
+ while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START])
+ err = bch2_errcode_parents[err - BCH_ERR_START];
+
+ return -err;
+}
+
+const char *bch2_blk_status_to_str(blk_status_t status)
+{
+ if (status == BLK_STS_REMOVED)
+ return "device removed";
+ return blk_status_to_str(status);
+}
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
new file mode 100644
index 000000000000..9ce29681eec9
--- /dev/null
+++ b/fs/bcachefs/errcode.h
@@ -0,0 +1,273 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ERRCODE_H
+#define _BCACHEFS_ERRCODE_H
+
+#define BCH_ERRCODES() \
+ x(ERANGE, ERANGE_option_too_small) \
+ x(ERANGE, ERANGE_option_too_big) \
+ x(ENOMEM, ENOMEM_stripe_buf) \
+ x(ENOMEM, ENOMEM_replicas_table) \
+ x(ENOMEM, ENOMEM_cpu_replicas) \
+ x(ENOMEM, ENOMEM_replicas_gc) \
+ x(ENOMEM, ENOMEM_disk_groups_validate) \
+ x(ENOMEM, ENOMEM_disk_groups_to_cpu) \
+ x(ENOMEM, ENOMEM_mark_snapshot) \
+ x(ENOMEM, ENOMEM_mark_stripe) \
+ x(ENOMEM, ENOMEM_mark_stripe_ptr) \
+ x(ENOMEM, ENOMEM_btree_key_cache_create) \
+ x(ENOMEM, ENOMEM_btree_key_cache_fill) \
+ x(ENOMEM, ENOMEM_btree_key_cache_insert) \
+ x(ENOMEM, ENOMEM_trans_kmalloc) \
+ x(ENOMEM, ENOMEM_trans_log_msg) \
+ x(ENOMEM, ENOMEM_do_encrypt) \
+ x(ENOMEM, ENOMEM_ec_read_extent) \
+ x(ENOMEM, ENOMEM_ec_stripe_mem_alloc) \
+ x(ENOMEM, ENOMEM_ec_new_stripe_alloc) \
+ x(ENOMEM, ENOMEM_fs_btree_cache_init) \
+ x(ENOMEM, ENOMEM_fs_btree_key_cache_init) \
+ x(ENOMEM, ENOMEM_fs_counters_init) \
+ x(ENOMEM, ENOMEM_fs_btree_write_buffer_init) \
+ x(ENOMEM, ENOMEM_io_clock_init) \
+ x(ENOMEM, ENOMEM_blacklist_table_init) \
+ x(ENOMEM, ENOMEM_sb_realloc_injected) \
+ x(ENOMEM, ENOMEM_sb_bio_realloc) \
+ x(ENOMEM, ENOMEM_sb_buf_realloc) \
+ x(ENOMEM, ENOMEM_sb_journal_validate) \
+ x(ENOMEM, ENOMEM_sb_journal_v2_validate) \
+ x(ENOMEM, ENOMEM_journal_entry_add) \
+ x(ENOMEM, ENOMEM_journal_read_buf_realloc) \
+ x(ENOMEM, ENOMEM_btree_interior_update_worker_init)\
+ x(ENOMEM, ENOMEM_btree_interior_update_pool_init) \
+ x(ENOMEM, ENOMEM_bio_read_init) \
+ x(ENOMEM, ENOMEM_bio_read_split_init) \
+ x(ENOMEM, ENOMEM_bio_write_init) \
+ x(ENOMEM, ENOMEM_bio_bounce_pages_init) \
+ x(ENOMEM, ENOMEM_writepage_bioset_init) \
+ x(ENOMEM, ENOMEM_dio_read_bioset_init) \
+ x(ENOMEM, ENOMEM_dio_write_bioset_init) \
+ x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \
+ x(ENOMEM, ENOMEM_promote_table_init) \
+ x(ENOMEM, ENOMEM_compression_bounce_read_init) \
+ x(ENOMEM, ENOMEM_compression_bounce_write_init) \
+ x(ENOMEM, ENOMEM_compression_workspace_init) \
+ x(ENOMEM, ENOMEM_decompression_workspace_init) \
+ x(ENOMEM, ENOMEM_bucket_gens) \
+ x(ENOMEM, ENOMEM_buckets_nouse) \
+ x(ENOMEM, ENOMEM_usage_init) \
+ x(ENOMEM, ENOMEM_btree_node_read_all_replicas) \
+ x(ENOMEM, ENOMEM_btree_node_reclaim) \
+ x(ENOMEM, ENOMEM_btree_node_mem_alloc) \
+ x(ENOMEM, ENOMEM_btree_cache_cannibalize_lock) \
+ x(ENOMEM, ENOMEM_buckets_waiting_for_journal_init)\
+ x(ENOMEM, ENOMEM_buckets_waiting_for_journal_set) \
+ x(ENOMEM, ENOMEM_set_nr_journal_buckets) \
+ x(ENOMEM, ENOMEM_dev_journal_init) \
+ x(ENOMEM, ENOMEM_journal_pin_fifo) \
+ x(ENOMEM, ENOMEM_journal_buf) \
+ x(ENOMEM, ENOMEM_gc_start) \
+ x(ENOMEM, ENOMEM_gc_alloc_start) \
+ x(ENOMEM, ENOMEM_gc_reflink_start) \
+ x(ENOMEM, ENOMEM_gc_gens) \
+ x(ENOMEM, ENOMEM_gc_repair_key) \
+ x(ENOMEM, ENOMEM_fsck_extent_ends_at) \
+ x(ENOMEM, ENOMEM_fsck_add_nlink) \
+ x(ENOMEM, ENOMEM_journal_key_insert) \
+ x(ENOMEM, ENOMEM_journal_keys_sort) \
+ x(ENOMEM, ENOMEM_journal_replay) \
+ x(ENOMEM, ENOMEM_read_superblock_clean) \
+ x(ENOMEM, ENOMEM_fs_alloc) \
+ x(ENOMEM, ENOMEM_fs_name_alloc) \
+ x(ENOMEM, ENOMEM_fs_other_alloc) \
+ x(ENOMEM, ENOMEM_dev_alloc) \
+ x(ENOSPC, ENOSPC_disk_reservation) \
+ x(ENOSPC, ENOSPC_bucket_alloc) \
+ x(ENOSPC, ENOSPC_disk_label_add) \
+ x(ENOSPC, ENOSPC_stripe_create) \
+ x(ENOSPC, ENOSPC_inode_create) \
+ x(ENOSPC, ENOSPC_str_hash_create) \
+ x(ENOSPC, ENOSPC_snapshot_create) \
+ x(ENOSPC, ENOSPC_subvolume_create) \
+ x(ENOSPC, ENOSPC_sb) \
+ x(ENOSPC, ENOSPC_sb_journal) \
+ x(ENOSPC, ENOSPC_sb_journal_seq_blacklist) \
+ x(ENOSPC, ENOSPC_sb_quota) \
+ x(ENOSPC, ENOSPC_sb_replicas) \
+ x(ENOSPC, ENOSPC_sb_members) \
+ x(ENOSPC, ENOSPC_sb_members_v2) \
+ x(ENOSPC, ENOSPC_sb_crypt) \
+ x(ENOSPC, ENOSPC_sb_downgrade) \
+ x(ENOSPC, ENOSPC_btree_slot) \
+ x(ENOSPC, ENOSPC_snapshot_tree) \
+ x(ENOENT, ENOENT_bkey_type_mismatch) \
+ x(ENOENT, ENOENT_str_hash_lookup) \
+ x(ENOENT, ENOENT_str_hash_set_must_replace) \
+ x(ENOENT, ENOENT_inode) \
+ x(ENOENT, ENOENT_not_subvol) \
+ x(ENOENT, ENOENT_not_directory) \
+ x(ENOENT, ENOENT_directory_dead) \
+ x(ENOENT, ENOENT_subvolume) \
+ x(ENOENT, ENOENT_snapshot_tree) \
+ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \
+ x(ENOENT, ENOENT_dev_not_found) \
+ x(ENOENT, ENOENT_dev_idx_not_found) \
+ x(0, open_buckets_empty) \
+ x(0, freelist_empty) \
+ x(BCH_ERR_freelist_empty, no_buckets_found) \
+ x(0, transaction_restart) \
+ x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock_after_fill) \
+ x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \
+ x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \
+ x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \
+ x(BCH_ERR_transaction_restart, transaction_restart_fill_mem_alloc_fail)\
+ x(BCH_ERR_transaction_restart, transaction_restart_mem_realloced) \
+ x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \
+ x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \
+ x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\
+ x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\
+ x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \
+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \
+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \
+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \
+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\
+ x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \
+ x(BCH_ERR_transaction_restart, transaction_restart_split_race) \
+ x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \
+ x(BCH_ERR_transaction_restart, transaction_restart_nested) \
+ x(0, no_btree_node) \
+ x(BCH_ERR_no_btree_node, no_btree_node_relock) \
+ x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \
+ x(BCH_ERR_no_btree_node, no_btree_node_drop) \
+ x(BCH_ERR_no_btree_node, no_btree_node_lock_root) \
+ x(BCH_ERR_no_btree_node, no_btree_node_up) \
+ x(BCH_ERR_no_btree_node, no_btree_node_down) \
+ x(BCH_ERR_no_btree_node, no_btree_node_init) \
+ x(BCH_ERR_no_btree_node, no_btree_node_cached) \
+ x(BCH_ERR_no_btree_node, no_btree_node_srcu_reset) \
+ x(0, btree_insert_fail) \
+ x(BCH_ERR_btree_insert_fail, btree_insert_btree_node_full) \
+ x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \
+ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \
+ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \
+ x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \
+ x(0, backpointer_to_overwritten_btree_node) \
+ x(0, lock_fail_root_changed) \
+ x(0, journal_reclaim_would_deadlock) \
+ x(EINVAL, fsck) \
+ x(BCH_ERR_fsck, fsck_fix) \
+ x(BCH_ERR_fsck, fsck_ignore) \
+ x(BCH_ERR_fsck, fsck_errors_not_fixed) \
+ x(BCH_ERR_fsck, fsck_repair_unimplemented) \
+ x(BCH_ERR_fsck, fsck_repair_impossible) \
+ x(0, restart_recovery) \
+ x(0, data_update_done) \
+ x(EINVAL, device_state_not_allowed) \
+ x(EINVAL, member_info_missing) \
+ x(EINVAL, mismatched_block_size) \
+ x(EINVAL, block_size_too_small) \
+ x(EINVAL, bucket_size_too_small) \
+ x(EINVAL, device_size_too_small) \
+ x(EINVAL, device_not_a_member_of_filesystem) \
+ x(EINVAL, device_has_been_removed) \
+ x(EINVAL, device_already_online) \
+ x(EINVAL, insufficient_devices_to_start) \
+ x(EINVAL, invalid) \
+ x(EINVAL, internal_fsck_err) \
+ x(EROFS, erofs_trans_commit) \
+ x(EROFS, erofs_no_writes) \
+ x(EROFS, erofs_journal_err) \
+ x(EROFS, erofs_sb_err) \
+ x(EROFS, erofs_unfixed_errors) \
+ x(EROFS, erofs_norecovery) \
+ x(EROFS, erofs_nochanges) \
+ x(EROFS, insufficient_devices) \
+ x(0, operation_blocked) \
+ x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \
+ x(BCH_ERR_operation_blocked, journal_res_get_blocked) \
+ x(BCH_ERR_operation_blocked, journal_preres_get_blocked) \
+ x(BCH_ERR_operation_blocked, bucket_alloc_blocked) \
+ x(BCH_ERR_operation_blocked, stripe_alloc_blocked) \
+ x(BCH_ERR_invalid, invalid_sb) \
+ x(BCH_ERR_invalid_sb, invalid_sb_magic) \
+ x(BCH_ERR_invalid_sb, invalid_sb_version) \
+ x(BCH_ERR_invalid_sb, invalid_sb_features) \
+ x(BCH_ERR_invalid_sb, invalid_sb_too_big) \
+ x(BCH_ERR_invalid_sb, invalid_sb_csum_type) \
+ x(BCH_ERR_invalid_sb, invalid_sb_csum) \
+ x(BCH_ERR_invalid_sb, invalid_sb_block_size) \
+ x(BCH_ERR_invalid_sb, invalid_sb_uuid) \
+ x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \
+ x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \
+ x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \
+ x(BCH_ERR_invalid_sb, invalid_sb_field_size) \
+ x(BCH_ERR_invalid_sb, invalid_sb_layout) \
+ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \
+ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \
+ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \
+ x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \
+ x(BCH_ERR_invalid_sb, invalid_sb_members) \
+ x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \
+ x(BCH_ERR_invalid_sb, invalid_sb_replicas) \
+ x(BCH_ERR_invalid_sb, invalid_replicas_entry) \
+ x(BCH_ERR_invalid_sb, invalid_sb_journal) \
+ x(BCH_ERR_invalid_sb, invalid_sb_journal_seq_blacklist) \
+ x(BCH_ERR_invalid_sb, invalid_sb_crypt) \
+ x(BCH_ERR_invalid_sb, invalid_sb_clean) \
+ x(BCH_ERR_invalid_sb, invalid_sb_quota) \
+ x(BCH_ERR_invalid_sb, invalid_sb_errors) \
+ x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \
+ x(BCH_ERR_invalid_sb, invalid_sb_ext) \
+ x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \
+ x(BCH_ERR_invalid, invalid_bkey) \
+ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
+ x(EIO, btree_node_read_err) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) \
+ x(0, nopromote) \
+ x(BCH_ERR_nopromote, nopromote_may_not) \
+ x(BCH_ERR_nopromote, nopromote_already_promoted) \
+ x(BCH_ERR_nopromote, nopromote_unwritten) \
+ x(BCH_ERR_nopromote, nopromote_congested) \
+ x(BCH_ERR_nopromote, nopromote_in_flight) \
+ x(BCH_ERR_nopromote, nopromote_enomem)
+
+enum bch_errcode {
+ BCH_ERR_START = 2048,
+#define x(class, err) BCH_ERR_##err,
+ BCH_ERRCODES()
+#undef x
+ BCH_ERR_MAX
+};
+
+const char *bch2_err_str(int);
+bool __bch2_err_matches(int, int);
+
+static inline bool _bch2_err_matches(int err, int class)
+{
+ return err < 0 && __bch2_err_matches(err, class);
+}
+
+#define bch2_err_matches(_err, _class) \
+({ \
+ BUILD_BUG_ON(!__builtin_constant_p(_class)); \
+ unlikely(_bch2_err_matches(_err, _class)); \
+})
+
+int __bch2_err_class(int);
+
+static inline long bch2_err_class(long err)
+{
+ return err < 0 ? __bch2_err_class(err) : err;
+}
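+
+/*
+ * Illustrative usage (not a real call site): private error codes form a tree
+ * rooted at standard errnos, so callers can match on any ancestor and convert
+ * back to a plain errno at the boundary to generic code:
+ *
+ *	int ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
+ *
+ *	bch2_err_matches(ret, ENOMEM);				// true
+ *	bch2_err_matches(ret, BCH_ERR_transaction_restart);	// false
+ *	bch2_err_class(ret);					// -ENOMEM
+ */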
+
+#define BLK_STS_REMOVED ((__force blk_status_t)128)
+
+const char *bch2_blk_status_to_str(blk_status_t);
+
+#endif /* _BCACHEFS_ERRCODE_H */
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
new file mode 100644
index 000000000000..25cf78a7b946
--- /dev/null
+++ b/fs/bcachefs/error.c
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "error.h"
+#include "super.h"
+
+#define FSCK_ERR_RATELIMIT_NR 10
+
+bool bch2_inconsistent_error(struct bch_fs *c)
+{
+ set_bit(BCH_FS_ERROR, &c->flags);
+
+ switch (c->opts.errors) {
+ case BCH_ON_ERROR_continue:
+ return false;
+ case BCH_ON_ERROR_ro:
+ if (bch2_fs_emergency_read_only(c))
+ bch_err(c, "inconsistency detected - emergency read only");
+ return true;
+ case BCH_ON_ERROR_panic:
+ panic(bch2_fmt(c, "panic after error"));
+ return true;
+ default:
+ BUG();
+ }
+}
+
+void bch2_topology_error(struct bch_fs *c)
+{
+ set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags);
+ if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
+ bch2_inconsistent_error(c);
+}
+
+void bch2_fatal_error(struct bch_fs *c)
+{
+ if (bch2_fs_emergency_read_only(c))
+ bch_err(c, "fatal error - emergency read only");
+}
+
+void bch2_io_error_work(struct work_struct *work)
+{
+ struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
+ struct bch_fs *c = ca->fs;
+ bool dev;
+
+ down_write(&c->state_lock);
+ dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro,
+ BCH_FORCE_IF_DEGRADED);
+ if (dev
+ ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
+ BCH_FORCE_IF_DEGRADED)
+ : bch2_fs_emergency_read_only(c))
+ bch_err(ca,
+ "too many IO errors, setting %s RO",
+ dev ? "device" : "filesystem");
+ up_write(&c->state_lock);
+}
+
+void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
+{
+ atomic64_inc(&ca->errors[type]);
+ //queue_work(system_long_wq, &ca->io_error_work);
+}
+
+enum ask_yn {
+ YN_NO,
+ YN_YES,
+ YN_ALLNO,
+ YN_ALLYES,
+};
+
+#ifdef __KERNEL__
+#define bch2_fsck_ask_yn() YN_NO
+#else
+
+#include "tools-util.h"
+
+enum ask_yn bch2_fsck_ask_yn(void)
+{
+ char *buf = NULL;
+ size_t buflen = 0;
+	enum ask_yn ret;
+
+	while (true) {
+		fputs(" (y,n, or Y,N for all errors of this type) ", stdout);
+		fflush(stdout);
+
+		if (getline(&buf, &buflen, stdin) < 0)
+			die("error reading from standard input");
+
+		strim(buf);
+		if (strlen(buf) != 1)
+			continue;
+
+		switch (buf[0]) {
+		case 'n':
+			ret = YN_NO;
+			goto found;
+		case 'y':
+			ret = YN_YES;
+			goto found;
+		case 'N':
+			ret = YN_ALLNO;
+			goto found;
+		case 'Y':
+			ret = YN_ALLYES;
+			goto found;
+		}
+	}
+found:
+	/* free the buffer getline() allocated before returning the answer: */
+	free(buf);
+	return ret;
+}
+
+#endif
+
+static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
+{
+ struct fsck_err_state *s;
+
+ if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
+ return NULL;
+
+ list_for_each_entry(s, &c->fsck_error_msgs, list)
+ if (s->fmt == fmt) {
+ /*
+ * move it to the head of the list: repeated fsck errors
+ * are common
+ */
+ list_move(&s->list, &c->fsck_error_msgs);
+ return s;
+ }
+
+ s = kzalloc(sizeof(*s), GFP_NOFS);
+ if (!s) {
+ if (!c->fsck_alloc_msgs_err)
+ bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
+ c->fsck_alloc_msgs_err = true;
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&s->list);
+ s->fmt = fmt;
+ list_add(&s->list, &c->fsck_error_msgs);
+ return s;
+}
+
+int bch2_fsck_err(struct bch_fs *c,
+ enum bch_fsck_flags flags,
+ enum bch_sb_error_id err,
+ const char *fmt, ...)
+{
+ struct fsck_err_state *s = NULL;
+ va_list args;
+ bool print = true, suppressing = false, inconsistent = false;
+ struct printbuf buf = PRINTBUF, *out = &buf;
+ int ret = -BCH_ERR_fsck_ignore;
+
+ if (test_bit(err, c->sb.errors_silent))
+ return -BCH_ERR_fsck_fix;
+
+ bch2_sb_error_count(c, err);
+
+ va_start(args, fmt);
+ prt_vprintf(out, fmt, args);
+ va_end(args);
+
+ mutex_lock(&c->fsck_error_msgs_lock);
+ s = fsck_err_get(c, fmt);
+ if (s) {
+ /*
+ * We may be called multiple times for the same error on
+ * transaction restart - this memoizes instead of asking the user
+ * multiple times for the same error:
+ */
+ if (s->last_msg && !strcmp(buf.buf, s->last_msg)) {
+ ret = s->ret;
+ mutex_unlock(&c->fsck_error_msgs_lock);
+ printbuf_exit(&buf);
+ return ret;
+ }
+
+ kfree(s->last_msg);
+ s->last_msg = kstrdup(buf.buf, GFP_KERNEL);
+
+ if (c->opts.ratelimit_errors &&
+ !(flags & FSCK_NO_RATELIMIT) &&
+ s->nr >= FSCK_ERR_RATELIMIT_NR) {
+ if (s->nr == FSCK_ERR_RATELIMIT_NR)
+ suppressing = true;
+ else
+ print = false;
+ }
+
+ s->nr++;
+ }
+
+#ifdef BCACHEFS_LOG_PREFIX
+ if (!strncmp(fmt, "bcachefs:", 9))
+ prt_printf(out, bch2_log_msg(c, ""));
+#endif
+
+ if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) {
+ if (c->opts.errors != BCH_ON_ERROR_continue ||
+ !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
+ prt_str(out, ", shutting down");
+ inconsistent = true;
+ ret = -BCH_ERR_fsck_errors_not_fixed;
+ } else if (flags & FSCK_CAN_FIX) {
+ prt_str(out, ", fixing");
+ ret = -BCH_ERR_fsck_fix;
+ } else {
+ prt_str(out, ", continuing");
+ ret = -BCH_ERR_fsck_ignore;
+ }
+ } else if (c->opts.fix_errors == FSCK_FIX_exit) {
+ prt_str(out, ", exiting");
+ ret = -BCH_ERR_fsck_errors_not_fixed;
+ } else if (flags & FSCK_CAN_FIX) {
+ int fix = s && s->fix
+ ? s->fix
+ : c->opts.fix_errors;
+
+ if (fix == FSCK_FIX_ask) {
+ int ask;
+
+ prt_str(out, ": fix?");
+ bch2_print_string_as_lines(KERN_ERR, out->buf);
+ print = false;
+
+ ask = bch2_fsck_ask_yn();
+
+ if (ask >= YN_ALLNO && s)
+ s->fix = ask == YN_ALLNO
+ ? FSCK_FIX_no
+ : FSCK_FIX_yes;
+
+ ret = ask & 1
+ ? -BCH_ERR_fsck_fix
+ : -BCH_ERR_fsck_ignore;
+ } else if (fix == FSCK_FIX_yes ||
+ (c->opts.nochanges &&
+ !(flags & FSCK_CAN_IGNORE))) {
+ prt_str(out, ", fixing");
+ ret = -BCH_ERR_fsck_fix;
+ } else {
+ prt_str(out, ", not fixing");
+ }
+ } else if (flags & FSCK_NEED_FSCK) {
+ prt_str(out, " (run fsck to correct)");
+ } else {
+ prt_str(out, " (repair unimplemented)");
+ }
+
+ if (ret == -BCH_ERR_fsck_ignore &&
+ (c->opts.fix_errors == FSCK_FIX_exit ||
+ !(flags & FSCK_CAN_IGNORE)))
+ ret = -BCH_ERR_fsck_errors_not_fixed;
+
+ if (print)
+ bch2_print_string_as_lines(KERN_ERR, out->buf);
+
+ if (!test_bit(BCH_FS_FSCK_DONE, &c->flags) &&
+ (ret != -BCH_ERR_fsck_fix &&
+ ret != -BCH_ERR_fsck_ignore))
+ bch_err(c, "Unable to continue, halting");
+ else if (suppressing)
+ bch_err(c, "Ratelimiting new instances of previous error");
+
+ if (s)
+ s->ret = ret;
+
+ mutex_unlock(&c->fsck_error_msgs_lock);
+
+ printbuf_exit(&buf);
+
+ if (inconsistent)
+ bch2_inconsistent_error(c);
+
+ if (ret == -BCH_ERR_fsck_fix) {
+ set_bit(BCH_FS_ERRORS_FIXED, &c->flags);
+ } else {
+ set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags);
+ set_bit(BCH_FS_ERROR, &c->flags);
+ }
+
+ return ret;
+}
+
+void bch2_flush_fsck_errs(struct bch_fs *c)
+{
+ struct fsck_err_state *s, *n;
+
+ mutex_lock(&c->fsck_error_msgs_lock);
+
+ list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) {
+ if (s->ratelimited && s->last_msg)
+ bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg);
+
+ list_del(&s->list);
+ kfree(s->last_msg);
+ kfree(s);
+ }
+
+ mutex_unlock(&c->fsck_error_msgs_lock);
+}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
new file mode 100644
index 000000000000..fec17d1353d1
--- /dev/null
+++ b/fs/bcachefs/error.h
@@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ERROR_H
+#define _BCACHEFS_ERROR_H
+
+#include <linux/list.h>
+#include <linux/printk.h>
+#include "sb-errors.h"
+
+struct bch_dev;
+struct bch_fs;
+struct work_struct;
+
+/*
+ * XXX: separate out errors that indicate on disk data is inconsistent, and flag
+ * superblock as such
+ */
+
+/* Error messages: */
+
+/*
+ * Inconsistency errors: The on disk data is inconsistent. If these occur during
+ * initial recovery, they don't indicate a bug in the running code - we walk all
+ * the metadata before modifying anything. If they occur at runtime, they
+ * indicate either a bug in the running code or (less likely) data is being
+ * silently corrupted under us.
+ *
+ * XXX: audit all inconsistent errors and make sure they're all recoverable, in
+ * BCH_ON_ERROR_CONTINUE mode
+ */
+
+bool bch2_inconsistent_error(struct bch_fs *);
+
+void bch2_topology_error(struct bch_fs *);
+
+#define bch2_fs_inconsistent(c, ...) \
+({ \
+ bch_err(c, __VA_ARGS__); \
+ bch2_inconsistent_error(c); \
+})
+
+#define bch2_fs_inconsistent_on(cond, c, ...) \
+({ \
+ bool _ret = unlikely(!!(cond)); \
+ \
+ if (_ret) \
+ bch2_fs_inconsistent(c, __VA_ARGS__); \
+ _ret; \
+})
+
+/*
+ * Later we might want to mark only the particular device inconsistent, not the
+ * entire filesystem:
+ */
+
+#define bch2_dev_inconsistent(ca, ...) \
+do { \
+ bch_err(ca, __VA_ARGS__); \
+ bch2_inconsistent_error((ca)->fs); \
+} while (0)
+
+#define bch2_dev_inconsistent_on(cond, ca, ...) \
+({ \
+ bool _ret = unlikely(!!(cond)); \
+ \
+ if (_ret) \
+ bch2_dev_inconsistent(ca, __VA_ARGS__); \
+ _ret; \
+})
+
+/*
+ * When a transaction update discovers or is causing a fs inconsistency, it's
+ * helpful to also dump the pending updates:
+ */
+#define bch2_trans_inconsistent(trans, ...) \
+({ \
+ bch_err(trans->c, __VA_ARGS__); \
+ bch2_dump_trans_updates(trans); \
+ bch2_inconsistent_error(trans->c); \
+})
+
+#define bch2_trans_inconsistent_on(cond, trans, ...) \
+({ \
+ bool _ret = unlikely(!!(cond)); \
+ \
+ if (_ret) \
+ bch2_trans_inconsistent(trans, __VA_ARGS__); \
+ _ret; \
+})
+
+/*
+ * Fsck errors: inconsistency errors we detect at mount time, and that we
+ * should ideally be able to repair:
+ */
+
+struct fsck_err_state {
+ struct list_head list;
+ const char *fmt;
+ u64 nr;
+ bool ratelimited;
+ int ret;
+ int fix;
+ char *last_msg;
+};
+
+enum bch_fsck_flags {
+ FSCK_CAN_FIX = 1 << 0,
+ FSCK_CAN_IGNORE = 1 << 1,
+ FSCK_NEED_FSCK = 1 << 2,
+ FSCK_NO_RATELIMIT = 1 << 3,
+};
+
+#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
+
+__printf(4, 5) __cold
+int bch2_fsck_err(struct bch_fs *,
+ enum bch_fsck_flags,
+ enum bch_sb_error_id,
+ const char *, ...);
+void bch2_flush_fsck_errs(struct bch_fs *);
+
+#define __fsck_err(c, _flags, _err_type, ...) \
+({ \
+ int _ret = bch2_fsck_err(c, _flags, BCH_FSCK_ERR_##_err_type, \
+ __VA_ARGS__); \
+ \
+ if (_ret != -BCH_ERR_fsck_fix && \
+ _ret != -BCH_ERR_fsck_ignore) { \
+ ret = _ret; \
+ goto fsck_err; \
+ } \
+ \
+ _ret == -BCH_ERR_fsck_fix; \
+})
+
+/* These macros return true if error should be fixed: */
+
+/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
+
+#define __fsck_err_on(cond, c, _flags, _err_type, ...) \
+ (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false)
+
+#define need_fsck_err_on(cond, c, _err_type, ...) \
+ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)
+
+#define need_fsck_err(c, _err_type, ...) \
+ __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)
+
+#define mustfix_fsck_err(c, _err_type, ...) \
+ __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
+
+#define mustfix_fsck_err_on(cond, c, _err_type, ...) \
+ __fsck_err_on(cond, c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
+
+#define fsck_err(c, _err_type, ...) \
+ __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
+
+#define fsck_err_on(cond, c, _err_type, ...) \
+ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
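+
+/*
+ * Usage sketch (`some_err_type` is a placeholder, not a real BCH_FSCK_ERR_*
+ * id): these macros assume an `int ret` and an `fsck_err:` label in the
+ * enclosing function, and evaluate to true when the error should be fixed:
+ *
+ *	if (fsck_err_on(bad_condition, c, some_err_type,
+ *			"describe what's wrong: %u", val))
+ *		repair_it();
+ *	...
+ * fsck_err:
+ *	return ret;
+ */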
+
+__printf(4, 0)
+static inline void bch2_bkey_fsck_err(struct bch_fs *c,
+ struct printbuf *err_msg,
+ enum bch_sb_error_id err_type,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ prt_vprintf(err_msg, fmt, args);
+ va_end(args);
+}
+
+#define bkey_fsck_err(c, _err_msg, _err_type, ...) \
+do { \
+ prt_printf(_err_msg, __VA_ARGS__); \
+ bch2_sb_error_count(c, BCH_FSCK_ERR_##_err_type); \
+ ret = -BCH_ERR_invalid_bkey; \
+ goto fsck_err; \
+} while (0)
+
+#define bkey_fsck_err_on(cond, ...) \
+do { \
+ if (unlikely(cond)) \
+ bkey_fsck_err(__VA_ARGS__); \
+} while (0)
+
+/*
+ * Fatal errors: these don't indicate a bug, but we can't continue running in RW
+ * mode - pretty much just due to metadata IO errors:
+ */
+
+void bch2_fatal_error(struct bch_fs *);
+
+#define bch2_fs_fatal_error(c, ...) \
+do { \
+ bch_err(c, __VA_ARGS__); \
+ bch2_fatal_error(c); \
+} while (0)
+
+#define bch2_fs_fatal_err_on(cond, c, ...) \
+({ \
+ bool _ret = unlikely(!!(cond)); \
+ \
+ if (_ret) \
+ bch2_fs_fatal_error(c, __VA_ARGS__); \
+ _ret; \
+})
+
+/*
+ * IO errors: either recoverable metadata IO (because we have replicas), or data
+ * IO - we need to log it and print out a message, but we don't (necessarily)
+ * want to shut down the fs:
+ */
+
+void bch2_io_error_work(struct work_struct *);
+
+/* Does the error handling without logging a message */
+void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
+
+#define bch2_dev_io_err_on(cond, ca, _type, ...) \
+({ \
+ bool _ret = (cond); \
+ \
+ if (_ret) { \
+ bch_err_dev_ratelimited(ca, __VA_ARGS__); \
+ bch2_io_error(ca, _type); \
+ } \
+ _ret; \
+})
+
+#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) \
+({ \
+ bool _ret = (cond); \
+ \
+ if (_ret) { \
+ bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \
+ bch2_io_error(ca, _type); \
+ } \
+ _ret; \
+})
+
+#endif /* _BCACHEFS_ERROR_H */
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
new file mode 100644
index 000000000000..21af6fb8cecf
--- /dev/null
+++ b/fs/bcachefs/extent_update.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "debug.h"
+#include "extents.h"
+#include "extent_update.h"
+
+/*
+ * This counts the number of iterators to the alloc & ec btrees we'll need for
+ * inserting/removing this extent:
+ */
+static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ unsigned ret = 0, lru = 0;
+
+ bkey_extent_entry_for_each(ptrs, entry) {
+ switch (__extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr:
+ /* Might also be updating LRU btree */
+ if (entry->ptr.cached)
+ lru++;
+
+ fallthrough;
+ case BCH_EXTENT_ENTRY_stripe_ptr:
+ ret++;
+ }
+ }
+
+ /*
+ * Updating keys in the alloc btree may also update keys in the
+ * freespace or discard btrees:
+ */
+ return lru + ret * 2;
+}
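+/*
+ * Example (hypothetical extent): two cached pointers plus one stripe pointer
+ * gives lru = 2 and ret = 3, so we budget 2 + 3 * 2 = 8 iterators.
+ */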
+
+static int count_iters_for_insert(struct btree_trans *trans,
+ struct bkey_s_c k,
+ unsigned offset,
+ struct bpos *end,
+ unsigned *nr_iters,
+ unsigned max_iters)
+{
+ int ret = 0, ret2 = 0;
+
+ if (*nr_iters >= max_iters) {
+ *end = bpos_min(*end, k.k->p);
+ ret = 1;
+ }
+
+ switch (k.k->type) {
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
+ *nr_iters += bch2_bkey_nr_alloc_ptrs(k);
+
+ if (*nr_iters >= max_iters) {
+ *end = bpos_min(*end, k.k->p);
+ ret = 1;
+ }
+
+ break;
+ case KEY_TYPE_reflink_p: {
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ u64 idx = le64_to_cpu(p.v->idx);
+ unsigned sectors = bpos_min(*end, p.k->p).offset -
+ bkey_start_offset(p.k);
+ struct btree_iter iter;
+ struct bkey_s_c r_k;
+
+ for_each_btree_key_norestart(trans, iter,
+ BTREE_ID_reflink, POS(0, idx + offset),
+ BTREE_ITER_SLOTS, r_k, ret2) {
+ if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors)))
+ break;
+
+ /* extent_update_to_keys(), for the reflink_v update */
+ *nr_iters += 1;
+
+ *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);
+
+ if (*nr_iters >= max_iters) {
+ struct bpos pos = bkey_start_pos(k.k);
+ pos.offset += min_t(u64, k.k->size,
+ r_k.k->p.offset - idx);
+
+ *end = bpos_min(*end, pos);
+ ret = 1;
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ break;
+ }
+ }
+
+ return ret2 ?: ret;
+}
+
+#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3)
+
+int bch2_extent_atomic_end(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *insert,
+ struct bpos *end)
+{
+ struct btree_iter copy;
+ struct bkey_s_c k;
+ unsigned nr_iters = 0;
+ int ret;
+
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ return ret;
+
+ *end = insert->k.p;
+
+ /* extent_update_to_keys(): */
+ nr_iters += 1;
+
+ ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end,
+ &nr_iters, EXTENT_ITERS_MAX / 2);
+ if (ret < 0)
+ return ret;
+
+ bch2_trans_copy_iter(&copy, iter);
+
+ for_each_btree_key_upto_continue_norestart(copy, insert->k.p, 0, k, ret) {
+ unsigned offset = 0;
+
+ if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k)))
+ offset = bkey_start_offset(&insert->k) -
+ bkey_start_offset(k.k);
+
+ /* extent_handle_overwrites(): */
+ switch (bch2_extent_overlap(&insert->k, k.k)) {
+ case BCH_EXTENT_OVERLAP_ALL:
+ case BCH_EXTENT_OVERLAP_FRONT:
+ nr_iters += 1;
+ break;
+ case BCH_EXTENT_OVERLAP_BACK:
+ case BCH_EXTENT_OVERLAP_MIDDLE:
+ nr_iters += 2;
+ break;
+ }
+
+ ret = count_iters_for_insert(trans, k, offset, end,
+ &nr_iters, EXTENT_ITERS_MAX);
+ if (ret)
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &copy);
+ return ret < 0 ? ret : 0;
+}
+
+int bch2_extent_trim_atomic(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *k)
+{
+ struct bpos end;
+ int ret;
+
+ ret = bch2_extent_atomic_end(trans, iter, k, &end);
+ if (ret)
+ return ret;
+
+ bch2_cut_back(end, k);
+ return 0;
+}
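+/*
+ * Intended call pattern in an update path (sketch only; the surrounding
+ * variables are illustrative):
+ *
+ *	ret = bch2_extent_trim_atomic(trans, &iter, new_extent);
+ *	if (ret)
+ *		goto err;
+ *
+ * Afterwards new_extent may be shorter than originally requested; callers
+ * loop, committing one atomically-sized chunk at a time.
+ */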
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
new file mode 100644
index 000000000000..6f5cf449361a
--- /dev/null
+++ b/fs/bcachefs/extent_update.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENT_UPDATE_H
+#define _BCACHEFS_EXTENT_UPDATE_H
+
+#include "bcachefs.h"
+
+int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, struct bpos *);
+int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *);
+
+#endif /* _BCACHEFS_EXTENT_UPDATE_H */
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
new file mode 100644
index 000000000000..9d8afcb5979a
--- /dev/null
+++ b/fs/bcachefs/extents.c
@@ -0,0 +1,1511 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ *
+ * Code for managing the extent btree and dynamically updating the writeback
+ * dirty sector count.
+ */
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_gc.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "compress.h"
+#include "debug.h"
+#include "disk_groups.h"
+#include "error.h"
+#include "extents.h"
+#include "inode.h"
+#include "journal.h"
+#include "replicas.h"
+#include "super.h"
+#include "super-io.h"
+#include "trace.h"
+#include "util.h"
+
+static unsigned bch2_crc_field_size_max[] = {
+ [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
+ [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
+ [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
+};
+
+static void bch2_extent_crc_pack(union bch_extent_crc *,
+ struct bch_extent_crc_unpacked,
+ enum bch_extent_entry_type);
+
+static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f,
+ unsigned dev)
+{
+ struct bch_dev_io_failures *i;
+
+ for (i = f->devs; i < f->devs + f->nr; i++)
+ if (i->dev == dev)
+ return i;
+
+ return NULL;
+}
+
+void bch2_mark_io_failure(struct bch_io_failures *failed,
+ struct extent_ptr_decoded *p)
+{
+ struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev);
+
+ if (!f) {
+ BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
+
+ f = &failed->devs[failed->nr++];
+ f->dev = p->ptr.dev;
+ f->idx = p->idx;
+ f->nr_failed = 1;
+ f->nr_retries = 0;
+ } else if (p->idx != f->idx) {
+ f->idx = p->idx;
+ f->nr_failed = 1;
+ f->nr_retries = 0;
+ } else {
+ f->nr_failed++;
+ }
+}
+
+/*
+ * returns true if p1 is better than p2:
+ */
+static inline bool ptr_better(struct bch_fs *c,
+ const struct extent_ptr_decoded p1,
+ const struct extent_ptr_decoded p2)
+{
+ if (likely(!p1.idx && !p2.idx)) {
+ struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
+ struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
+
+ u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
+ u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
+
+ /* Pick at random, biased in favor of the faster device: */
+
+ return bch2_rand_range(l1 + l2) > l1;
+ }
+
+ if (bch2_force_reconstruct_read)
+ return p1.idx > p2.idx;
+
+ return p1.idx < p2.idx;
+}
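+/*
+ * Worked example: with cur_latency l1 = 1ms and l2 = 3ms,
+ * bch2_rand_range(l1 + l2) exceeds l1 about three times out of four, so the
+ * faster device (p1 here) serves ~75% of reads.
+ */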
+
+/*
+ * This picks a non-stale pointer to read from, taking recorded IO failures
+ * (@failed, which may be NULL) into account: pointers whose device has
+ * exhausted its retries are skipped, or read via erasure coding
+ * reconstruction when that's possible.
+ */
+int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_io_failures *failed,
+ struct extent_ptr_decoded *pick)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ struct bch_dev_io_failures *f;
+ struct bch_dev *ca;
+ int ret = 0;
+
+ if (k.k->type == KEY_TYPE_error)
+ return -EIO;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ /*
+ * Unwritten extent: no need to actually read, treat it as a
+ * hole and return 0s:
+ */
+ if (p.ptr.unwritten)
+ return 0;
+
+ ca = bch_dev_bkey_exists(c, p.ptr.dev);
+
+ /*
+ * If there are any dirty pointers it's an error if we can't
+ * read:
+ */
+ if (!ret && !p.ptr.cached)
+ ret = -EIO;
+
+ if (p.ptr.cached && ptr_stale(ca, &p.ptr))
+ continue;
+
+ f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
+ if (f)
+ p.idx = f->nr_failed < f->nr_retries
+ ? f->idx
+ : f->idx + 1;
+
+ if (!p.idx &&
+ !bch2_dev_is_readable(ca))
+ p.idx++;
+
+ if (bch2_force_reconstruct_read &&
+ !p.idx && p.has_ec)
+ p.idx++;
+
+ if (p.idx >= (unsigned) p.has_ec + 1)
+ continue;
+
+ if (ret > 0 && !ptr_better(c, p, *pick))
+ continue;
+
+ *pick = p;
+ ret = 1;
+ }
+
+ return ret;
+}
+
+/* KEY_TYPE_btree_ptr: */
+
+int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err,
+ btree_ptr_val_too_big,
+ "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
+
+ ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+fsck_err:
+ return ret;
+}
+
+void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ bch2_bkey_ptrs_to_text(out, c, k);
+}
+
+int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err,
+ btree_ptr_v2_val_too_big,
+ "value too big (%zu > %zu)",
+ bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
+
+ ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+fsck_err:
+ return ret;
+}
+
+void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
+
+ prt_printf(out, "seq %llx written %u min_key %s",
+ le64_to_cpu(bp.v->seq),
+ le16_to_cpu(bp.v->sectors_written),
+ BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");
+
+ bch2_bpos_to_text(out, bp.v->min_key);
+ prt_printf(out, " ");
+ bch2_bkey_ptrs_to_text(out, c, k);
+}
+
+void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
+ unsigned big_endian, int write,
+ struct bkey_s k)
+{
+ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k);
+
+ compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);
+
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id_is_extents(btree_id) &&
+ !bkey_eq(bp.v->min_key, POS_MIN))
+ bp.v->min_key = write
+ ? bpos_nosnap_predecessor(bp.v->min_key)
+ : bpos_nosnap_successor(bp.v->min_key);
+}
+
+/* KEY_TYPE_extent: */
+
+bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
+{
+ struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l);
+ struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r);
+ union bch_extent_entry *en_l;
+ const union bch_extent_entry *en_r;
+ struct extent_ptr_decoded lp, rp;
+ bool use_right_ptr;
+ struct bch_dev *ca;
+
+ en_l = l_ptrs.start;
+ en_r = r_ptrs.start;
+ while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
+ if (extent_entry_type(en_l) != extent_entry_type(en_r))
+ return false;
+
+ en_l = extent_entry_next(en_l);
+ en_r = extent_entry_next(en_r);
+ }
+
+ if (en_l < l_ptrs.end || en_r < r_ptrs.end)
+ return false;
+
+ en_l = l_ptrs.start;
+ en_r = r_ptrs.start;
+ lp.crc = bch2_extent_crc_unpack(l.k, NULL);
+ rp.crc = bch2_extent_crc_unpack(r.k, NULL);
+
+ while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
+ __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
+ if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
+ rp.ptr.offset + rp.crc.offset ||
+ lp.ptr.dev != rp.ptr.dev ||
+ lp.ptr.gen != rp.ptr.gen ||
+ lp.ptr.unwritten != rp.ptr.unwritten ||
+ lp.has_ec != rp.has_ec)
+ return false;
+
+ /* Extents may not straddle buckets: */
+ ca = bch_dev_bkey_exists(c, lp.ptr.dev);
+ if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr))
+ return false;
+
+ if (lp.has_ec != rp.has_ec ||
+ (lp.has_ec &&
+ (lp.ec.block != rp.ec.block ||
+ lp.ec.redundancy != rp.ec.redundancy ||
+ lp.ec.idx != rp.ec.idx)))
+ return false;
+
+ if (lp.crc.compression_type != rp.crc.compression_type ||
+ lp.crc.nonce != rp.crc.nonce)
+ return false;
+
+ if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
+ lp.crc.uncompressed_size) {
+ /* can use left extent's crc entry */
+ } else if (lp.crc.live_size <= rp.crc.offset) {
+ /* can use right extent's crc entry */
+ } else {
+ /* check if checksums can be merged: */
+ if (lp.crc.csum_type != rp.crc.csum_type ||
+ lp.crc.nonce != rp.crc.nonce ||
+ crc_is_compressed(lp.crc) ||
+ !bch2_checksum_mergeable(lp.crc.csum_type))
+ return false;
+
+ if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size ||
+ rp.crc.offset)
+ return false;
+
+ if (lp.crc.csum_type &&
+ lp.crc.uncompressed_size +
+ rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
+ return false;
+ }
+
+ en_l = extent_entry_next(en_l);
+ en_r = extent_entry_next(en_r);
+ }
+
+ en_l = l_ptrs.start;
+ en_r = r_ptrs.start;
+ while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
+ if (extent_entry_is_crc(en_l)) {
+ struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+ struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
+
+ if (crc_l.uncompressed_size + crc_r.uncompressed_size >
+ bch2_crc_field_size_max[extent_entry_type(en_l)])
+ return false;
+ }
+
+ en_l = extent_entry_next(en_l);
+ en_r = extent_entry_next(en_r);
+ }
+
+ use_right_ptr = false;
+ en_l = l_ptrs.start;
+ en_r = r_ptrs.start;
+ while (en_l < l_ptrs.end) {
+ if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr &&
+ use_right_ptr)
+ en_l->ptr = en_r->ptr;
+
+ if (extent_entry_is_crc(en_l)) {
+ struct bch_extent_crc_unpacked crc_l =
+ bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+ struct bch_extent_crc_unpacked crc_r =
+ bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
+
+ use_right_ptr = false;
+
+ if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
+ crc_l.uncompressed_size) {
+ /* can use left extent's crc entry */
+ } else if (crc_l.live_size <= crc_r.offset) {
+ /* can use right extent's crc entry */
+ crc_r.offset -= crc_l.live_size;
+ bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
+ extent_entry_type(en_l));
+ use_right_ptr = true;
+ } else {
+ crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
+ crc_l.csum,
+ crc_r.csum,
+ crc_r.uncompressed_size << 9);
+
+ crc_l.uncompressed_size += crc_r.uncompressed_size;
+ crc_l.compressed_size += crc_r.compressed_size;
+ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
+ extent_entry_type(en_l));
+ }
+ }
+
+ en_l = extent_entry_next(en_l);
+ en_r = extent_entry_next(en_r);
+ }
+
+ bch2_key_resize(l.k, l.k->size + r.k->size);
+ return true;
+}
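+/*
+ * In other words: merging only succeeds when the two extents are contiguous
+ * on every device (same dev/gen, adjacent offsets, not straddling a bucket
+ * boundary) and their checksum entries can be reused or merged; e.g. two
+ * such 8-sector extents become a single 16-sector key via bch2_key_resize().
+ */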
+
+/* KEY_TYPE_reservation: */
+
+int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+ int ret = 0;
+
+ bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, err,
+ reservation_key_nr_replicas_invalid,
+ "invalid nr_replicas (%u)", r.v->nr_replicas);
+fsck_err:
+ return ret;
+}
+
+void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+
+ prt_printf(out, "generation %u replicas %u",
+ le32_to_cpu(r.v->generation),
+ r.v->nr_replicas);
+}
+
+bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
+{
+ struct bkey_s_reservation l = bkey_s_to_reservation(_l);
+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r);
+
+ if (l.v->generation != r.v->generation ||
+ l.v->nr_replicas != r.v->nr_replicas)
+ return false;
+
+ bch2_key_resize(l.k, l.k->size + r.k->size);
+ return true;
+}
+
+/* Extent checksum entries: */
+
+/* returns true if not equal */
+static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
+ struct bch_extent_crc_unpacked r)
+{
+ return (l.csum_type != r.csum_type ||
+ l.compression_type != r.compression_type ||
+ l.compressed_size != r.compressed_size ||
+ l.uncompressed_size != r.uncompressed_size ||
+ l.offset != r.offset ||
+ l.live_size != r.live_size ||
+ l.nonce != r.nonce ||
+ bch2_crc_cmp(l.csum, r.csum));
+}
+
+static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
+ struct bch_extent_crc_unpacked n)
+{
+ return !crc_is_compressed(u) &&
+ u.csum_type &&
+ u.uncompressed_size > u.live_size &&
+ bch2_csum_type_is_encryption(u.csum_type) ==
+ bch2_csum_type_is_encryption(n.csum_type);
+}
+
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
+ struct bch_extent_crc_unpacked n)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_extent_crc_unpacked crc;
+ const union bch_extent_entry *i;
+
+ if (!n.csum_type)
+ return false;
+
+ bkey_for_each_crc(k.k, ptrs, crc, i)
+ if (can_narrow_crc(crc, n))
+ return true;
+
+ return false;
+}
+
+/*
+ * We're writing another replica for this extent, so while we've got the data in
+ * memory we'll be computing a new checksum for the currently live data.
+ *
+ * If there are other replicas we aren't moving, and they are checksummed but
+ * not compressed, we can modify them to point to only the data that is
+ * currently live (so that readers won't have to bounce) while we've got the
+ * checksum we need:
+ */
+bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+ struct bch_extent_crc_unpacked u;
+ struct extent_ptr_decoded p;
+ union bch_extent_entry *i;
+ bool ret = false;
+
+ /* Find a checksum entry that covers only live data: */
+ if (!n.csum_type) {
+ bkey_for_each_crc(&k->k, ptrs, u, i)
+ if (!crc_is_compressed(u) &&
+ u.csum_type &&
+ u.live_size == u.uncompressed_size) {
+ n = u;
+ goto found;
+ }
+ return false;
+ }
+found:
+ BUG_ON(crc_is_compressed(n));
+ BUG_ON(n.offset);
+ BUG_ON(n.live_size != k->k.size);
+
+restart_narrow_pointers:
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+
+ bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
+ if (can_narrow_crc(p.crc, n)) {
+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
+ p.ptr.offset += p.crc.offset;
+ p.crc = n;
+ bch2_extent_ptr_decoded_append(k, &p);
+ ret = true;
+ goto restart_narrow_pointers;
+ }
+
+ return ret;
+}
+
+static void bch2_extent_crc_pack(union bch_extent_crc *dst,
+ struct bch_extent_crc_unpacked src,
+ enum bch_extent_entry_type type)
+{
+#define set_common_fields(_dst, _src) \
+ _dst.type = 1 << type; \
+ _dst.csum_type = _src.csum_type, \
+ _dst.compression_type = _src.compression_type, \
+ _dst._compressed_size = _src.compressed_size - 1, \
+ _dst._uncompressed_size = _src.uncompressed_size - 1, \
+ _dst.offset = _src.offset
+
+ switch (type) {
+ case BCH_EXTENT_ENTRY_crc32:
+ set_common_fields(dst->crc32, src);
+ dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo);
+ break;
+ case BCH_EXTENT_ENTRY_crc64:
+ set_common_fields(dst->crc64, src);
+ dst->crc64.nonce = src.nonce;
+ dst->crc64.csum_lo = (u64 __force) src.csum.lo;
+ dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi);
+ break;
+ case BCH_EXTENT_ENTRY_crc128:
+ set_common_fields(dst->crc128, src);
+ dst->crc128.nonce = src.nonce;
+ dst->crc128.csum = src.csum;
+ break;
+ default:
+ BUG();
+ }
+#undef set_common_fields
+}
+
+void bch2_extent_crc_append(struct bkey_i *k,
+ struct bch_extent_crc_unpacked new)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+ union bch_extent_crc *crc = (void *) ptrs.end;
+ enum bch_extent_entry_type type;
+
+ if (bch_crc_bytes[new.csum_type] <= 4 &&
+ new.uncompressed_size <= CRC32_SIZE_MAX &&
+ new.nonce <= CRC32_NONCE_MAX)
+ type = BCH_EXTENT_ENTRY_crc32;
+ else if (bch_crc_bytes[new.csum_type] <= 10 &&
+ new.uncompressed_size <= CRC64_SIZE_MAX &&
+ new.nonce <= CRC64_NONCE_MAX)
+ type = BCH_EXTENT_ENTRY_crc64;
+ else if (bch_crc_bytes[new.csum_type] <= 16 &&
+ new.uncompressed_size <= CRC128_SIZE_MAX &&
+ new.nonce <= CRC128_NONCE_MAX)
+ type = BCH_EXTENT_ENTRY_crc128;
+ else
+ BUG();
+
+ bch2_extent_crc_pack(crc, new, type);
+
+ k->k.u64s += extent_entry_u64s(ptrs.end);
+
+ EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
+}
+
+/* Generic code for keys with pointers: */
+
+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
+{
+ return bch2_bkey_devs(k).nr;
+}
+
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
+{
+ return k.k->type == KEY_TYPE_reservation
+ ? bkey_s_c_to_reservation(k).v->nr_replicas
+ : bch2_bkey_dirty_devs(k).nr;
+}
+
+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
+{
+ unsigned ret = 0;
+
+ if (k.k->type == KEY_TYPE_reservation) {
+ ret = bkey_s_c_to_reservation(k).v->nr_replicas;
+ } else {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ ret += !p.ptr.cached && !crc_is_compressed(p.crc);
+ }
+
+ return ret;
+}
+
+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned ret = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (!p.ptr.cached && crc_is_compressed(p.crc))
+ ret += p.crc.compressed_size;
+
+ return ret;
+}
+
+bool bch2_bkey_is_incompressible(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct bch_extent_crc_unpacked crc;
+
+ bkey_for_each_crc(k.k, ptrs, crc, entry)
+ if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
+ return true;
+ return false;
+}
+
+unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p = { 0 };
+ unsigned replicas = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.ptr.cached)
+ continue;
+
+ if (p.has_ec)
+ replicas += p.ec.redundancy;
+
+ replicas++;
+
+ }
+
+ return replicas;
+}
+
+static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p)
+{
+ if (p->ptr.cached)
+ return 0;
+
+ return p->has_ec
+ ? p->ec.redundancy + 1
+ : ca->mi.durability;
+}
+
+unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev);
+
+ return __extent_ptr_durability(ca, p);
+}
+
+unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev);
+
+ if (ca->mi.state == BCH_MEMBER_STATE_failed)
+ return 0;
+
+ return __extent_ptr_durability(ca, p);
+}
+
+unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned durability = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ durability += bch2_extent_ptr_durability(c, &p);
+
+ return durability;
+}
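+/*
+ * Example (hypothetical key): one non-cached pointer on a durability-1
+ * device plus one erasure coded pointer with redundancy 1 gives
+ * 1 + (1 + 1) = 3; cached pointers contribute nothing.
+ */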
+
+static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned durability = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
+ durability += bch2_extent_ptr_durability(c, &p);
+
+ return durability;
+}
+
+void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
+{
+ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
+ union bch_extent_entry *next = extent_entry_next(entry);
+
+ memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
+ k->k.u64s -= extent_entry_u64s(entry);
+}
+
+void bch2_extent_ptr_decoded_append(struct bkey_i *k,
+ struct extent_ptr_decoded *p)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+ struct bch_extent_crc_unpacked crc =
+ bch2_extent_crc_unpack(&k->k, NULL);
+ union bch_extent_entry *pos;
+
+ if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
+ pos = ptrs.start;
+ goto found;
+ }
+
+ bkey_for_each_crc(&k->k, ptrs, crc, pos)
+ if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
+ pos = extent_entry_next(pos);
+ goto found;
+ }
+
+ bch2_extent_crc_append(k, p->crc);
+ pos = bkey_val_end(bkey_i_to_s(k));
+found:
+ p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+ __extent_entry_insert(k, pos, to_entry(&p->ptr));
+
+ if (p->has_ec) {
+ p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
+ __extent_entry_insert(k, pos, to_entry(&p->ec));
+ }
+}
+
+static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
+ union bch_extent_entry *entry)
+{
+ union bch_extent_entry *i = ptrs.start;
+
+ if (i == entry)
+ return NULL;
+
+ while (extent_entry_next(i) != entry)
+ i = extent_entry_next(i);
+ return i;
+}
+
+/*
+ * Returns pointer to the next entry after the one being dropped:
+ */
+union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k,
+ struct bch_extent_ptr *ptr)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+ union bch_extent_entry *entry = to_entry(ptr), *next;
+ union bch_extent_entry *ret = entry;
+ bool drop_crc = true;
+
+ EBUG_ON(ptr < &ptrs.start->ptr ||
+ ptr >= &ptrs.end->ptr);
+ EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
+
+ for (next = extent_entry_next(entry);
+ next != ptrs.end;
+ next = extent_entry_next(next)) {
+ if (extent_entry_is_crc(next)) {
+ break;
+ } else if (extent_entry_is_ptr(next)) {
+ drop_crc = false;
+ break;
+ }
+ }
+
+ extent_entry_drop(k, entry);
+
+ while ((entry = extent_entry_prev(ptrs, entry))) {
+ if (extent_entry_is_ptr(entry))
+ break;
+
+ if ((extent_entry_is_crc(entry) && drop_crc) ||
+ extent_entry_is_stripe_ptr(entry)) {
+ ret = (void *) ret - extent_entry_bytes(entry);
+ extent_entry_drop(k, entry);
+ }
+ }
+
+ return ret;
+}
+
+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
+ struct bch_extent_ptr *ptr)
+{
+ bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
+ union bch_extent_entry *ret =
+ bch2_bkey_drop_ptr_noerror(k, ptr);
+
+ /*
+ * If we deleted all the dirty pointers and there's still cached
+ * pointers, we could set the cached pointers to dirty if they're not
+ * stale - but to do that correctly we'd need to grab an open_bucket
+ * reference so that we don't race with bucket reuse:
+ */
+ if (have_dirty &&
+ !bch2_bkey_dirty_devs(k.s_c).nr) {
+ k.k->type = KEY_TYPE_error;
+ set_bkey_val_u64s(k.k, 0);
+ ret = NULL;
+ } else if (!bch2_bkey_nr_ptrs(k.s_c)) {
+ k.k->type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(k.k, 0);
+ ret = NULL;
+ }
+
+ return ret;
+}
+
+void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
+{
+ struct bch_extent_ptr *ptr;
+
+ bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
+}
+
+void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
+{
+ struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev);
+
+ if (ptr)
+ bch2_bkey_drop_ptr_noerror(k, ptr);
+}
+
+const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->dev == dev)
+ return ptr;
+
+ return NULL;
+}
+
+bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (bch2_dev_in_target(c, ptr->dev, target) &&
+ (!ptr->cached ||
+ !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
+ return true;
+
+ return false;
+}
+
+bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_extent_ptr m, u64 offset)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (p.ptr.dev == m.dev &&
+ p.ptr.gen == m.gen &&
+ (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
+ (s64) m.offset - offset)
+ return true;
+
+ return false;
+}
+
+/*
+ * Returns true if two extents refer to the same data:
+ */
+bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
+{
+ if (k1.k->type != k2.k->type)
+ return false;
+
+ if (bkey_extent_is_direct_data(k1.k)) {
+ struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
+ struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
+ const union bch_extent_entry *entry1, *entry2;
+ struct extent_ptr_decoded p1, p2;
+
+ if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2))
+ return false;
+
+ bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
+ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+ if (p1.ptr.dev == p2.ptr.dev &&
+ p1.ptr.gen == p2.ptr.gen &&
+ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+ return true;
+
+ return false;
+ } else {
+ /* KEY_TYPE_deleted, etc. */
+ return true;
+ }
+}
+
+struct bch_extent_ptr *
+bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
+{
+ struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
+ union bch_extent_entry *entry2;
+ struct extent_ptr_decoded p2;
+
+ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+ if (p1.ptr.dev == p2.ptr.dev &&
+ p1.ptr.gen == p2.ptr.gen &&
+ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+ return &entry2->ptr;
+
+ return NULL;
+}
+
+void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+ union bch_extent_entry *entry;
+ union bch_extent_entry *ec = NULL;
+
+ bkey_extent_entry_for_each(ptrs, entry) {
+ if (&entry->ptr == ptr) {
+ ptr->cached = true;
+ if (ec)
+ extent_entry_drop(k, ec);
+ return;
+ }
+
+ if (extent_entry_is_stripe_ptr(entry))
+ ec = entry;
+ else if (extent_entry_is_ptr(entry))
+ ec = NULL;
+ }
+
+ BUG();
+}
+
+/*
+ * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
+ *
+ * Returns true if @k should be dropped entirely
+ *
+ * For existing keys, only called when btree nodes are being rewritten, not when
+ * they're merely being compacted/resorted in memory.
+ */
+bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
+{
+ struct bch_extent_ptr *ptr;
+
+ bch2_bkey_drop_ptrs(k, ptr,
+ ptr->cached &&
+ ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
+
+ return bkey_deleted(k.k);
+}
+
+void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ bool first = true;
+
+ if (c)
+ prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));
+
+ bkey_extent_entry_for_each(ptrs, entry) {
+ if (!first)
+ prt_printf(out, " ");
+
+ switch (__extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr: {
+ const struct bch_extent_ptr *ptr = entry_to_ptr(entry);
+ struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+ ? bch_dev_bkey_exists(c, ptr->dev)
+ : NULL;
+
+ if (!ca) {
+ prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
+ (u64) ptr->offset, ptr->gen,
+ ptr->cached ? " cached" : "");
+ } else {
+ u32 offset;
+ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+ prt_printf(out, "ptr: %u:%llu:%u gen %u",
+ ptr->dev, b, offset, ptr->gen);
+ if (ptr->cached)
+ prt_str(out, " cached");
+ if (ptr->unwritten)
+ prt_str(out, " unwritten");
+ if (ca && ptr_stale(ca, ptr))
+ prt_printf(out, " stale");
+ }
+ break;
+ }
+ case BCH_EXTENT_ENTRY_crc32:
+ case BCH_EXTENT_ENTRY_crc64:
+ case BCH_EXTENT_ENTRY_crc128: {
+ struct bch_extent_crc_unpacked crc =
+ bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
+
+ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
+ crc.compressed_size,
+ crc.uncompressed_size,
+ crc.offset, crc.nonce,
+ bch2_csum_types[crc.csum_type],
+ bch2_compression_types[crc.compression_type]);
+ break;
+ }
+ case BCH_EXTENT_ENTRY_stripe_ptr: {
+ const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;
+
+ prt_printf(out, "ec: idx %llu block %u",
+ (u64) ec->idx, ec->block);
+ break;
+ }
+ case BCH_EXTENT_ENTRY_rebalance: {
+ const struct bch_extent_rebalance *r = &entry->rebalance;
+
+ prt_str(out, "rebalance: target ");
+ if (c)
+ bch2_target_to_text(out, c, r->target);
+ else
+ prt_printf(out, "%u", r->target);
+ prt_str(out, " compression ");
+ bch2_compression_opt_to_text(out, r->compression);
+ break;
+ }
+ default:
+ prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
+ return;
+ }
+
+ first = false;
+ }
+}
+
+static int extent_ptr_invalid(struct bch_fs *c,
+ struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ const struct bch_extent_ptr *ptr,
+ unsigned size_ondisk,
+ bool metadata,
+ struct printbuf *err)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr2;
+ u64 bucket;
+ u32 bucket_offset;
+ struct bch_dev *ca;
+ int ret = 0;
+
+ if (!bch2_dev_exists2(c, ptr->dev)) {
+ /*
+ * If we're in the write path this key might have already been
+ * overwritten, and we could be seeing a device that doesn't
+ * exist anymore due to racing with device removal:
+ */
+ if (flags & BKEY_INVALID_WRITE)
+ return 0;
+
+ bkey_fsck_err(c, err, ptr_to_invalid_device,
+ "pointer to invalid device (%u)", ptr->dev);
+ }
+
+ ca = bch_dev_bkey_exists(c, ptr->dev);
+ bkey_for_each_ptr(ptrs, ptr2)
+ bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err,
+ ptr_to_duplicate_device,
+ "multiple pointers to same device (%u)", ptr->dev);
+
+ bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
+
+ bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err,
+ ptr_after_last_bucket,
+ "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets);
+ bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err,
+ ptr_before_first_bucket,
+ "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket);
+ bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err,
+ ptr_spans_multiple_buckets,
+ "pointer spans multiple buckets (%u + %u > %u)",
+ bucket_offset, size_ondisk, ca->mi.bucket_size);
+fsck_err:
+ return ret;
+}
+
+int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct bch_extent_crc_unpacked crc;
+ unsigned size_ondisk = k.k->size;
+ unsigned nonce = UINT_MAX;
+ unsigned nr_ptrs = 0;
+ bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
+ int ret = 0;
+
+ if (bkey_is_btree_ptr(k.k))
+ size_ondisk = btree_sectors(c);
+
+ bkey_extent_entry_for_each(ptrs, entry) {
+ bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err,
+ extent_ptrs_invalid_entry,
+ "invalid extent entry type (got %u, max %u)",
+ __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
+
+ bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
+ !extent_entry_is_ptr(entry), c, err,
+ btree_ptr_has_non_ptr,
+ "has non ptr field");
+
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr:
+ ret = extent_ptr_invalid(c, k, flags, &entry->ptr,
+ size_ondisk, false, err);
+ if (ret)
+ return ret;
+
+ bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err,
+ ptr_cached_and_erasure_coded,
+ "cached, erasure coded ptr");
+
+ if (!entry->ptr.unwritten)
+ have_written = true;
+ else
+ have_unwritten = true;
+
+ have_ec = false;
+ crc_since_last_ptr = false;
+ nr_ptrs++;
+ break;
+ case BCH_EXTENT_ENTRY_crc32:
+ case BCH_EXTENT_ENTRY_crc64:
+ case BCH_EXTENT_ENTRY_crc128:
+ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
+
+ bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err,
+ ptr_crc_uncompressed_size_too_small,
+ "checksum offset + key size > uncompressed size");
+ bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err,
+ ptr_crc_csum_type_unknown,
+ "invalid checksum type");
+ bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err,
+ ptr_crc_compression_type_unknown,
+ "invalid compression type");
+
+ if (bch2_csum_type_is_encryption(crc.csum_type)) {
+ if (nonce == UINT_MAX)
+ nonce = crc.offset + crc.nonce;
+ else if (nonce != crc.offset + crc.nonce)
+ bkey_fsck_err(c, err, ptr_crc_nonce_mismatch,
+ "incorrect nonce");
+ }
+
+ bkey_fsck_err_on(crc_since_last_ptr, c, err,
+ ptr_crc_redundant,
+ "redundant crc entry");
+ crc_since_last_ptr = true;
+
+ bkey_fsck_err_on(crc_is_encoded(crc) &&
+ (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
+ (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT)), c, err,
+ ptr_crc_uncompressed_size_too_big,
+ "too large encoded extent");
+
+ size_ondisk = crc.compressed_size;
+ break;
+ case BCH_EXTENT_ENTRY_stripe_ptr:
+ bkey_fsck_err_on(have_ec, c, err,
+ ptr_stripe_redundant,
+ "redundant stripe entry");
+ have_ec = true;
+ break;
+ case BCH_EXTENT_ENTRY_rebalance: {
+ const struct bch_extent_rebalance *r = &entry->rebalance;
+
+ if (!bch2_compression_opt_valid(r->compression)) {
+ struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
+ prt_printf(err, "invalid compression opt %u:%u",
+ opt.type, opt.level);
+ return -BCH_ERR_invalid_bkey;
+ }
+ break;
+ }
+ }
+ }
+
+ bkey_fsck_err_on(!nr_ptrs, c, err,
+ extent_ptrs_no_ptrs,
+ "no ptrs");
+ bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err,
+ extent_ptrs_too_many_ptrs,
+ "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
+ bkey_fsck_err_on(have_written && have_unwritten, c, err,
+ extent_ptrs_written_and_unwritten,
+ "extent with unwritten and written ptrs");
+ bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err,
+ extent_ptrs_unwritten,
+ "has unwritten ptrs");
+ bkey_fsck_err_on(crc_since_last_ptr, c, err,
+ extent_ptrs_redundant_crc,
+ "redundant crc entry");
+ bkey_fsck_err_on(have_ec, c, err,
+ extent_ptrs_redundant_stripe,
+ "redundant stripe entry");
+fsck_err:
+ return ret;
+}
+
+void bch2_ptr_swab(struct bkey_s k)
+{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+ union bch_extent_entry *entry;
+ u64 *d;
+
+ for (d = (u64 *) ptrs.start;
+ d != (u64 *) ptrs.end;
+ d++)
+ *d = swab64(*d);
+
+ for (entry = ptrs.start;
+ entry < ptrs.end;
+ entry = extent_entry_next(entry)) {
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr:
+ break;
+ case BCH_EXTENT_ENTRY_crc32:
+ entry->crc32.csum = swab32(entry->crc32.csum);
+ break;
+ case BCH_EXTENT_ENTRY_crc64:
+ entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
+ entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
+ break;
+ case BCH_EXTENT_ENTRY_crc128:
+ entry->crc128.csum.hi = (__force __le64)
+ swab64((__force u64) entry->crc128.csum.hi);
+ entry->crc128.csum.lo = (__force __le64)
+ swab64((__force u64) entry->crc128.csum.lo);
+ break;
+ case BCH_EXTENT_ENTRY_stripe_ptr:
+ break;
+ case BCH_EXTENT_ENTRY_rebalance:
+ break;
+ }
+ }
+}
+
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+
+ bkey_extent_entry_for_each(ptrs, entry)
+ if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
+ return &entry->rebalance;
+
+ return NULL;
+}
+
+unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
+ unsigned target, unsigned compression)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned rewrite_ptrs = 0;
+
+ if (compression) {
+ unsigned compression_type = bch2_compression_opt_to_type(compression);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned i = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
+ p.ptr.unwritten) {
+ rewrite_ptrs = 0;
+ goto incompressible;
+ }
+
+ if (!p.ptr.cached && p.crc.compression_type != compression_type)
+ rewrite_ptrs |= 1U << i;
+ i++;
+ }
+ }
+incompressible:
+ if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
+ const struct bch_extent_ptr *ptr;
+ unsigned i = 0;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target))
+ rewrite_ptrs |= 1U << i;
+ i++;
+ }
+ }
+
+ return rewrite_ptrs;
+}
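+/*
+ * The return value is a bitmask indexed by pointer position: e.g. a result
+ * of 0x1 means only the first pointer is on the wrong target or compressed
+ * with the wrong type, and should be rewritten.
+ */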
+
+bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
+{
+ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+
+ /*
+ * If it's an indirect extent, we don't delete the rebalance entry when
+ * done so that we know what options were applied - check if it still
+ * needs work done:
+ */
+ if (r &&
+ k.k->type == KEY_TYPE_reflink_v &&
+ !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression))
+ r = NULL;
+
+ return r != NULL;
+}
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
+ unsigned target, unsigned compression)
+{
+ struct bkey_s k = bkey_i_to_s(_k);
+ struct bch_extent_rebalance *r;
+ bool needs_rebalance;
+
+ if (!bkey_extent_is_direct_data(k.k))
+ return 0;
+
+ /* get existing rebalance entry: */
+ r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
+ if (r) {
+ if (k.k->type == KEY_TYPE_reflink_v) {
+ /*
+ * indirect extents: existing options take precedence,
+ * so that we don't move extents back and forth if
+ * they're referenced by different inodes with different
+ * options:
+ */
+ if (r->target)
+ target = r->target;
+ if (r->compression)
+ compression = r->compression;
+ }
+
+ r->target = target;
+ r->compression = compression;
+ }
+
+ needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression);
+
+ if (needs_rebalance && !r) {
+ union bch_extent_entry *new = bkey_val_end(k);
+
+ new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance;
+ new->rebalance.compression = compression;
+ new->rebalance.target = target;
+ new->rebalance.unused = 0;
+ k.k->u64s += extent_entry_u64s(new);
+ } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) {
+		/*
+		 * For indirect extents, we don't delete the rebalance entry
+		 * when we're finished, so that we know we specifically moved
+		 * or compressed the data to its current location/compression
+		 * type.
+		 */
+ extent_entry_drop(k, (union bch_extent_entry *) r);
+ }
+
+ return 0;
+}
+
+/* Generic extent code: */
+
+int bch2_cut_front_s(struct bpos where, struct bkey_s k)
+{
+ unsigned new_val_u64s = bkey_val_u64s(k.k);
+ int val_u64s_delta;
+ u64 sub;
+
+ if (bkey_le(where, bkey_start_pos(k.k)))
+ return 0;
+
+ EBUG_ON(bkey_gt(where, k.k->p));
+
+ sub = where.offset - bkey_start_offset(k.k);
+
+ k.k->size -= sub;
+
+ if (!k.k->size) {
+ k.k->type = KEY_TYPE_deleted;
+ new_val_u64s = 0;
+ }
+
+ switch (k.k->type) {
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v: {
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+ union bch_extent_entry *entry;
+ bool seen_crc = false;
+
+ bkey_extent_entry_for_each(ptrs, entry) {
+ switch (extent_entry_type(entry)) {
+ case BCH_EXTENT_ENTRY_ptr:
+ if (!seen_crc)
+ entry->ptr.offset += sub;
+ break;
+ case BCH_EXTENT_ENTRY_crc32:
+ entry->crc32.offset += sub;
+ break;
+ case BCH_EXTENT_ENTRY_crc64:
+ entry->crc64.offset += sub;
+ break;
+ case BCH_EXTENT_ENTRY_crc128:
+ entry->crc128.offset += sub;
+ break;
+ case BCH_EXTENT_ENTRY_stripe_ptr:
+ break;
+ case BCH_EXTENT_ENTRY_rebalance:
+ break;
+ }
+
+ if (extent_entry_is_crc(entry))
+ seen_crc = true;
+ }
+
+ break;
+ }
+ case KEY_TYPE_reflink_p: {
+ struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
+
+ le64_add_cpu(&p.v->idx, sub);
+ break;
+ }
+ case KEY_TYPE_inline_data:
+ case KEY_TYPE_indirect_inline_data: {
+ void *p = bkey_inline_data_p(k);
+ unsigned bytes = bkey_inline_data_bytes(k.k);
+
+ sub = min_t(u64, sub << 9, bytes);
+
+ memmove(p, p + sub, bytes - sub);
+
+ new_val_u64s -= sub >> 3;
+ break;
+ }
+ }
+
+ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
+ BUG_ON(val_u64s_delta < 0);
+
+ set_bkey_val_u64s(k.k, new_val_u64s);
+ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
+ return -val_u64s_delta;
+}
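+/*
+ * Example: cutting 8 sectors off the front of a 32-sector extent leaves
+ * k->size == 24; raw pointers have ptr.offset advanced by 8, while pointers
+ * covered by a checksum entry get crc.offset bumped by 8 instead, so the
+ * checksum still describes the data as originally written.  The return
+ * value is minus the number of u64s the value shrank by.
+ */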
+
+int bch2_cut_back_s(struct bpos where, struct bkey_s k)
+{
+ unsigned new_val_u64s = bkey_val_u64s(k.k);
+ int val_u64s_delta;
+ u64 len = 0;
+
+ if (bkey_ge(where, k.k->p))
+ return 0;
+
+ EBUG_ON(bkey_lt(where, bkey_start_pos(k.k)));
+
+ len = where.offset - bkey_start_offset(k.k);
+
+ k.k->p.offset = where.offset;
+ k.k->size = len;
+
+ if (!len) {
+ k.k->type = KEY_TYPE_deleted;
+ new_val_u64s = 0;
+ }
+
+ switch (k.k->type) {
+ case KEY_TYPE_inline_data:
+ case KEY_TYPE_indirect_inline_data:
+ new_val_u64s = (bkey_inline_data_offset(k.k) +
+ min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3;
+ break;
+ }
+
+ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
+ BUG_ON(val_u64s_delta < 0);
+
+ set_bkey_val_u64s(k.k, new_val_u64s);
+ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
+ return -val_u64s_delta;
+}
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
new file mode 100644
index 000000000000..a2ce8a3be13c
--- /dev/null
+++ b/fs/bcachefs/extents.h
@@ -0,0 +1,765 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENTS_H
+#define _BCACHEFS_EXTENTS_H
+
+#include "bcachefs.h"
+#include "bkey.h"
+#include "extents_types.h"
+
+struct bch_fs;
+struct btree_trans;
+enum bkey_invalid_flags;
+
+/* extent entries: */
+
+#define extent_entry_last(_e) \
+ ((typeof(&(_e).v->start[0])) bkey_val_end(_e))
+
+#define entry_to_ptr(_entry) \
+({ \
+ EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \
+ \
+ __builtin_choose_expr( \
+ type_is_exact(_entry, const union bch_extent_entry *), \
+ (const struct bch_extent_ptr *) (_entry), \
+ (struct bch_extent_ptr *) (_entry)); \
+})
+
+/* downcast, preserves const */
+#define to_entry(_entry) \
+({ \
+ BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \
+ !type_is(_entry, struct bch_extent_ptr *) && \
+ !type_is(_entry, struct bch_extent_stripe_ptr *)); \
+ \
+ __builtin_choose_expr( \
+ (type_is_exact(_entry, const union bch_extent_crc *) || \
+ type_is_exact(_entry, const struct bch_extent_ptr *) ||\
+ type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\
+ (const union bch_extent_entry *) (_entry), \
+ (union bch_extent_entry *) (_entry)); \
+})
+
+#define extent_entry_next(_entry) \
+ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
+
+static inline unsigned
+__extent_entry_type(const union bch_extent_entry *e)
+{
+ return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX;
+}
+
+static inline enum bch_extent_entry_type
+extent_entry_type(const union bch_extent_entry *e)
+{
+ int ret = __ffs(e->type);
+
+ EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX);
+
+ return ret;
+}
+
+static inline size_t extent_entry_bytes(const union bch_extent_entry *entry)
+{
+ switch (extent_entry_type(entry)) {
+#define x(f, n) \
+ case BCH_EXTENT_ENTRY_##f: \
+ return sizeof(struct bch_extent_##f);
+ BCH_EXTENT_ENTRY_TYPES()
+#undef x
+ default:
+ BUG();
+ }
+}
+
+static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
+{
+ return extent_entry_bytes(entry) / sizeof(u64);
+}
+
+static inline void __extent_entry_insert(struct bkey_i *k,
+ union bch_extent_entry *dst,
+ union bch_extent_entry *new)
+{
+ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
+
+ memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new),
+ dst, (u64 *) end - (u64 *) dst);
+ k->k.u64s += extent_entry_u64s(new);
+ memcpy_u64s_small(dst, new, extent_entry_u64s(new));
+}
+
+static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
+{
+ union bch_extent_entry *next = extent_entry_next(entry);
+
+ /* stripes have ptrs, but their layout doesn't work with this code */
+ BUG_ON(k.k->type == KEY_TYPE_stripe);
+
+ memmove_u64s_down(entry, next,
+ (u64 *) bkey_val_end(k) - (u64 *) next);
+ k.k->u64s -= (u64 *) next - (u64 *) entry;
+}
+
+static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
+{
+ return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+}
+
+static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e)
+{
+ return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
+}
+
+static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
+{
+ switch (extent_entry_type(e)) {
+ case BCH_EXTENT_ENTRY_crc32:
+ case BCH_EXTENT_ENTRY_crc64:
+ case BCH_EXTENT_ENTRY_crc128:
+ return true;
+ default:
+ return false;
+ }
+}
+
+union bch_extent_crc {
+ u8 type;
+ struct bch_extent_crc32 crc32;
+ struct bch_extent_crc64 crc64;
+ struct bch_extent_crc128 crc128;
+};
+
+#define __entry_to_crc(_entry) \
+ __builtin_choose_expr( \
+ type_is_exact(_entry, const union bch_extent_entry *), \
+ (const union bch_extent_crc *) (_entry), \
+ (union bch_extent_crc *) (_entry))
+
+#define entry_to_crc(_entry) \
+({ \
+ EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \
+ \
+ __entry_to_crc(_entry); \
+})
+
+static inline struct bch_extent_crc_unpacked
+bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
+{
+#define common_fields(_crc) \
+ .csum_type = _crc.csum_type, \
+ .compression_type = _crc.compression_type, \
+ .compressed_size = _crc._compressed_size + 1, \
+ .uncompressed_size = _crc._uncompressed_size + 1, \
+ .offset = _crc.offset, \
+ .live_size = k->size
+
+ if (!crc)
+ return (struct bch_extent_crc_unpacked) {
+ .compressed_size = k->size,
+ .uncompressed_size = k->size,
+ .live_size = k->size,
+ };
+
+ switch (extent_entry_type(to_entry(crc))) {
+ case BCH_EXTENT_ENTRY_crc32: {
+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
+ common_fields(crc->crc32),
+ };
+
+ *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum;
+ return ret;
+ }
+ case BCH_EXTENT_ENTRY_crc64: {
+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
+ common_fields(crc->crc64),
+ .nonce = crc->crc64.nonce,
+ .csum.lo = (__force __le64) crc->crc64.csum_lo,
+ };
+
+ *((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi;
+
+ return ret;
+ }
+ case BCH_EXTENT_ENTRY_crc128: {
+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
+ common_fields(crc->crc128),
+ .nonce = crc->crc128.nonce,
+ .csum = crc->crc128.csum,
+ };
+
+ return ret;
+ }
+ default:
+ BUG();
+ }
+#undef common_fields
+}
+
+static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
+{
+ return (crc.compression_type != BCH_COMPRESSION_TYPE_none &&
+ crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
+}
+
+static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc)
+{
+ return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc);
+}
+
+/* bkey_ptrs: generically over any key type that has ptrs */
+
+struct bkey_ptrs_c {
+ const union bch_extent_entry *start;
+ const union bch_extent_entry *end;
+};
+
+struct bkey_ptrs {
+ union bch_extent_entry *start;
+ union bch_extent_entry *end;
+};
+
+static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_btree_ptr: {
+ struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k);
+
+ return (struct bkey_ptrs_c) {
+ to_entry(&e.v->start[0]),
+ to_entry(extent_entry_last(e))
+ };
+ }
+ case KEY_TYPE_extent: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+
+ return (struct bkey_ptrs_c) {
+ e.v->start,
+ extent_entry_last(e)
+ };
+ }
+ case KEY_TYPE_stripe: {
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+ return (struct bkey_ptrs_c) {
+ to_entry(&s.v->ptrs[0]),
+ to_entry(&s.v->ptrs[s.v->nr_blocks]),
+ };
+ }
+ case KEY_TYPE_reflink_v: {
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+
+ return (struct bkey_ptrs_c) {
+ r.v->start,
+ bkey_val_end(r),
+ };
+ }
+ case KEY_TYPE_btree_ptr_v2: {
+ struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k);
+
+ return (struct bkey_ptrs_c) {
+ to_entry(&e.v->start[0]),
+ to_entry(extent_entry_last(e))
+ };
+ }
+ default:
+ return (struct bkey_ptrs_c) { NULL, NULL };
+ }
+}
+
+static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
+{
+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c);
+
+ return (struct bkey_ptrs) {
+ (void *) p.start,
+ (void *) p.end
+ };
+}
+
+#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \
+ for ((_entry) = (_start); \
+ (_entry) < (_end); \
+ (_entry) = extent_entry_next(_entry))
+
+#define __bkey_ptr_next(_ptr, _end) \
+({ \
+ typeof(_end) _entry; \
+ \
+ __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \
+ if (extent_entry_is_ptr(_entry)) \
+ break; \
+ \
+ _entry < (_end) ? entry_to_ptr(_entry) : NULL; \
+})
+
+#define bkey_extent_entry_for_each_from(_p, _entry, _start) \
+ __bkey_extent_entry_for_each_from(_start, (_p).end, _entry)
+
+#define bkey_extent_entry_for_each(_p, _entry) \
+ bkey_extent_entry_for_each_from(_p, _entry, _p.start)
+
+#define __bkey_for_each_ptr(_start, _end, _ptr) \
+ for ((_ptr) = (_start); \
+ ((_ptr) = __bkey_ptr_next(_ptr, _end)); \
+ (_ptr)++)
+
+#define bkey_ptr_next(_p, _ptr) \
+ __bkey_ptr_next(_ptr, (_p).end)
+
+#define bkey_for_each_ptr(_p, _ptr) \
+ __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr)
+
+#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \
+({ \
+ __label__ out; \
+ \
+ (_ptr).idx = 0; \
+ (_ptr).has_ec = false; \
+ \
+ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \
+ switch (extent_entry_type(_entry)) { \
+ case BCH_EXTENT_ENTRY_ptr: \
+ (_ptr).ptr = _entry->ptr; \
+ goto out; \
+ case BCH_EXTENT_ENTRY_crc32: \
+ case BCH_EXTENT_ENTRY_crc64: \
+ case BCH_EXTENT_ENTRY_crc128: \
+ (_ptr).crc = bch2_extent_crc_unpack(_k, \
+ entry_to_crc(_entry)); \
+ break; \
+ case BCH_EXTENT_ENTRY_stripe_ptr: \
+ (_ptr).ec = _entry->stripe_ptr; \
+ (_ptr).has_ec = true; \
+ break; \
+ default: \
+ /* nothing */ \
+ break; \
+ } \
+out: \
+ _entry < (_end); \
+})
+
+#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \
+ for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \
+ (_entry) = _start; \
+ __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \
+ (_entry) = extent_entry_next(_entry))
+
+#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \
+ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \
+ _ptr, _entry)
+
+#define bkey_crc_next(_k, _start, _end, _crc, _iter) \
+({ \
+ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \
+ if (extent_entry_is_crc(_iter)) { \
+ (_crc) = bch2_extent_crc_unpack(_k, \
+ entry_to_crc(_iter)); \
+ break; \
+ } \
+ \
+ (_iter) < (_end); \
+})
+
+#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \
+ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \
+ (_iter) = (_start); \
+ bkey_crc_next(_k, _start, _end, _crc, _iter); \
+ (_iter) = extent_entry_next(_iter))
+
+#define bkey_for_each_crc(_k, _p, _crc, _iter) \
+ __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter)
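+/*
+ * Typical iteration over decoded pointers (the same pattern the helpers in
+ * extents.c use; use_pointer() is a placeholder):
+ *
+ *	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ *	const union bch_extent_entry *entry;
+ *	struct extent_ptr_decoded p;
+ *
+ *	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ *		if (!p.ptr.cached)
+ *			use_pointer(&p);
+ */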
+
+/* Iterate over pointers in KEY_TYPE_extent: */
+
+#define extent_for_each_entry_from(_e, _entry, _start) \
+ __bkey_extent_entry_for_each_from(_start, \
+ extent_entry_last(_e), _entry)
+
+#define extent_for_each_entry(_e, _entry) \
+ extent_for_each_entry_from(_e, _entry, (_e).v->start)
+
+#define extent_ptr_next(_e, _ptr) \
+ __bkey_ptr_next(_ptr, extent_entry_last(_e))
+
+#define extent_for_each_ptr(_e, _ptr) \
+ __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr)
+
+#define extent_for_each_ptr_decode(_e, _ptr, _entry) \
+ __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \
+ extent_entry_last(_e), _ptr, _entry)
+
+/* utility code common to all keys with pointers: */
+
+void bch2_mark_io_failure(struct bch_io_failures *,
+ struct extent_ptr_decoded *);
+int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
+ struct bch_io_failures *,
+ struct extent_ptr_decoded *);
+
+/* KEY_TYPE_btree_ptr: */
+
+int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+
+int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
+ int, struct bkey_s);
+
+#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \
+ .key_invalid = bch2_btree_ptr_invalid, \
+ .val_to_text = bch2_btree_ptr_to_text, \
+ .swab = bch2_ptr_swab, \
+ .trans_trigger = bch2_trans_mark_extent, \
+ .atomic_trigger = bch2_mark_extent, \
+})
+
+#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \
+ .key_invalid = bch2_btree_ptr_v2_invalid, \
+ .val_to_text = bch2_btree_ptr_v2_to_text, \
+ .swab = bch2_ptr_swab, \
+ .compat = bch2_btree_ptr_v2_compat, \
+ .trans_trigger = bch2_trans_mark_extent, \
+ .atomic_trigger = bch2_mark_extent, \
+ .min_val_size = 40, \
+})
+
+/* KEY_TYPE_extent: */
+
+bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+
+#define bch2_bkey_ops_extent ((struct bkey_ops) { \
+ .key_invalid = bch2_bkey_ptrs_invalid, \
+ .val_to_text = bch2_bkey_ptrs_to_text, \
+ .swab = bch2_ptr_swab, \
+ .key_normalize = bch2_extent_normalize, \
+ .key_merge = bch2_extent_merge, \
+ .trans_trigger = bch2_trans_mark_extent, \
+ .atomic_trigger = bch2_mark_extent, \
+})
+
+/* KEY_TYPE_reservation: */
+
+int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+
+#define bch2_bkey_ops_reservation ((struct bkey_ops) { \
+ .key_invalid = bch2_reservation_invalid, \
+ .val_to_text = bch2_reservation_to_text, \
+ .key_merge = bch2_reservation_merge, \
+ .trans_trigger = bch2_trans_mark_reservation, \
+ .atomic_trigger = bch2_mark_reservation, \
+ .min_val_size = 8, \
+})
+
+/* Extent checksum entries: */
+
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c,
+ struct bch_extent_crc_unpacked);
+bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked);
+void bch2_extent_crc_append(struct bkey_i *,
+ struct bch_extent_crc_unpacked);
+
+/* Generic code for keys with pointers: */
+
+static inline bool bkey_is_btree_ptr(const struct bkey *k)
+{
+ switch (k->type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool bkey_extent_is_direct_data(const struct bkey *k)
+{
+ switch (k->type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool bkey_extent_is_inline_data(const struct bkey *k)
+{
+ return k->type == KEY_TYPE_inline_data ||
+ k->type == KEY_TYPE_indirect_inline_data;
+}
+
+static inline unsigned bkey_inline_data_offset(const struct bkey *k)
+{
+ switch (k->type) {
+ case KEY_TYPE_inline_data:
+ return sizeof(struct bch_inline_data);
+ case KEY_TYPE_indirect_inline_data:
+ return sizeof(struct bch_indirect_inline_data);
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned bkey_inline_data_bytes(const struct bkey *k)
+{
+ return bkey_val_bytes(k) - bkey_inline_data_offset(k);
+}
+
+#define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k))
+
+static inline bool bkey_extent_is_data(const struct bkey *k)
+{
+ return bkey_extent_is_direct_data(k) ||
+ bkey_extent_is_inline_data(k) ||
+ k->type == KEY_TYPE_reflink_p;
+}
+
+/*
+ * Should extent be counted under inode->i_sectors?
+ */
+static inline bool bkey_extent_is_allocation(const struct bkey *k)
+{
+ switch (k->type) {
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reservation:
+ case KEY_TYPE_reflink_p:
+ case KEY_TYPE_reflink_v:
+ case KEY_TYPE_inline_data:
+ case KEY_TYPE_indirect_inline_data:
+ case KEY_TYPE_error:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool bkey_extent_is_unwritten(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->unwritten)
+ return true;
+ return false;
+}
+
+static inline bool bkey_extent_is_reservation(struct bkey_s_c k)
+{
+ return k.k->type == KEY_TYPE_reservation ||
+ bkey_extent_is_unwritten(k);
+}
+
+static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
+{
+ struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(p, ptr)
+ ret.devs[ret.nr++] = ptr->dev;
+
+ return ret;
+}
+
+static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
+{
+ struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(p, ptr)
+ if (!ptr->cached)
+ ret.devs[ret.nr++] = ptr->dev;
+
+ return ret;
+}
+
+static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
+{
+ struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(p, ptr)
+ if (ptr->cached)
+ ret.devs[ret.nr++] = ptr->dev;
+
+ return ret;
+}
+
+static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ return BCH_DATA_btree;
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
+ return BCH_DATA_user;
+ case KEY_TYPE_stripe: {
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+ BUG_ON(ptr < s.v->ptrs ||
+ ptr >= s.v->ptrs + s.v->nr_blocks);
+
+ return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
+ ? BCH_DATA_parity
+ : BCH_DATA_user;
+ }
+ default:
+ BUG();
+ }
+}
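
The stripe case above classifies a pointer purely by position: the last nr_redundant blocks of a stripe hold parity, everything before them holds user data. A standalone sketch of that arithmetic (userspace, illustrative values only, not part of this patch):

#include <stdio.h>

int main(void)
{
	unsigned nr_blocks = 6, nr_redundant = 2, i;

	for (i = 0; i < nr_blocks; i++)
		printf("block %u: %s\n", i,
		       i >= nr_blocks - nr_redundant ? "parity" : "user");
	return 0;
}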
+
+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
+bool bch2_bkey_is_incompressible(struct bkey_s_c);
+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
+
+unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
+unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_decoded *);
+unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *);
+unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
+
+void bch2_bkey_drop_device(struct bkey_s, unsigned);
+void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
+
+const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned);
+
+static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev)
+{
+ return (void *) bch2_bkey_has_device_c(k.s_c, dev);
+}
+
+bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
+
+void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
+
+static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr)
+{
+ struct bch_extent_ptr *dest;
+
+ EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev));
+
+ switch (k->k.type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ case KEY_TYPE_extent:
+ EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
+
+ ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+ dest = (struct bch_extent_ptr *)((void *) &k->v + bkey_val_bytes(&k->k));
+ *dest = ptr;
+ k->k.u64s++;
+ break;
+ default:
+ BUG();
+ }
+}
+
+void bch2_extent_ptr_decoded_append(struct bkey_i *,
+ struct extent_ptr_decoded *);
+union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s,
+ struct bch_extent_ptr *);
+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
+ struct bch_extent_ptr *);
+
+#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \
+do { \
+ struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \
+ \
+ _ptr = &_ptrs.start->ptr; \
+ \
+ while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \
+ if (_cond) { \
+ _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \
+ _ptrs = bch2_bkey_ptrs(_k); \
+ continue; \
+ } \
+ \
+ (_ptr)++; \
+ } \
+} while (0)
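
A usage sketch for the macro above (illustrative only; assumes a struct bkey_i *k in scope): drop every cached pointer from a key, similar in spirit to how stale cached pointers get dropped during normalization.

	struct bch_extent_ptr *ptr;

	bch2_bkey_drop_ptrs(bkey_i_to_s(k), ptr, ptr->cached);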
+
+bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
+ struct bch_extent_ptr, u64);
+bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
+struct bch_extent_ptr *
+bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
+
+void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
+
+bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+
+void bch2_ptr_swab(struct bkey_s);
+
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
+unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
+ unsigned, unsigned);
+bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
+ unsigned, unsigned);
+
+/* Generic extent code: */
+
+enum bch_extent_overlap {
+ BCH_EXTENT_OVERLAP_ALL = 0,
+ BCH_EXTENT_OVERLAP_BACK = 1,
+ BCH_EXTENT_OVERLAP_FRONT = 2,
+ BCH_EXTENT_OVERLAP_MIDDLE = 3,
+};
+
+/* Returns how k overlaps with m */
+static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
+ const struct bkey *m)
+{
+ int cmp1 = bkey_lt(k->p, m->p);
+ int cmp2 = bkey_gt(bkey_start_pos(k), bkey_start_pos(m));
+
+ return (cmp1 << 1) + cmp2;
+}
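
A standalone sketch of this encoding (userspace, not part of this patch; positions simplified to one-dimensional half-open ranges instead of struct bpos): the two comparisons map directly onto the four enum values above.

#include <stdio.h>

enum overlap { ALL, BACK, FRONT, MIDDLE };

/* k and m are half-open [start, end) extents on a single axis */
static enum overlap classify(unsigned k_start, unsigned k_end,
			     unsigned m_start, unsigned m_end)
{
	int cmp1 = k_end < m_end;	/* k ends before m ends    */
	int cmp2 = k_start > m_start;	/* k starts after m starts */

	return (cmp1 << 1) + cmp2;
}

int main(void)
{
	printf("%d\n", classify(0, 10, 2, 8));	/* 0: k covers all of m    */
	printf("%d\n", classify(5, 10, 0, 8));	/* 1: k overlaps the back  */
	printf("%d\n", classify(0, 5, 2, 10));	/* 2: k overlaps the front */
	printf("%d\n", classify(3, 5, 0, 10));	/* 3: k sits in the middle */
	return 0;
}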
+
+int bch2_cut_front_s(struct bpos, struct bkey_s);
+int bch2_cut_back_s(struct bpos, struct bkey_s);
+
+static inline void bch2_cut_front(struct bpos where, struct bkey_i *k)
+{
+ bch2_cut_front_s(where, bkey_i_to_s(k));
+}
+
+static inline void bch2_cut_back(struct bpos where, struct bkey_i *k)
+{
+ bch2_cut_back_s(where, bkey_i_to_s(k));
+}
+
+/**
+ * bch2_key_resize - adjust size of @k
+ *
+ * bkey_start_offset(k) is preserved; only the end of the extent moves.
+ */
+static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
+{
+ k->p.offset -= k->size;
+ k->p.offset += new_size;
+ k->size = new_size;
+}
+
+#endif /* _BCACHEFS_EXTENTS_H */
diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h
new file mode 100644
index 000000000000..43d6c341ecca
--- /dev/null
+++ b/fs/bcachefs/extents_types.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENTS_TYPES_H
+#define _BCACHEFS_EXTENTS_TYPES_H
+
+#include "bcachefs_format.h"
+
+struct bch_extent_crc_unpacked {
+ u32 compressed_size;
+ u32 uncompressed_size;
+ u32 live_size;
+
+ u8 csum_type;
+ u8 compression_type;
+
+ u16 offset;
+
+ u16 nonce;
+
+ struct bch_csum csum;
+};
+
+struct extent_ptr_decoded {
+ unsigned idx;
+ bool has_ec;
+ struct bch_extent_crc_unpacked crc;
+ struct bch_extent_ptr ptr;
+ struct bch_extent_stripe_ptr ec;
+};
+
+struct bch_io_failures {
+ u8 nr;
+ struct bch_dev_io_failures {
+ u8 dev;
+ u8 idx;
+ u8 nr_failed;
+ u8 nr_retries;
+ } devs[BCH_REPLICAS_MAX];
+};
+
+#endif /* _BCACHEFS_EXTENTS_TYPES_H */
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
new file mode 100644
index 000000000000..05429c9631cd
--- /dev/null
+++ b/fs/bcachefs/eytzinger.h
@@ -0,0 +1,281 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _EYTZINGER_H
+#define _EYTZINGER_H
+
+#include <linux/bitops.h>
+#include <linux/log2.h>
+
+#include "util.h"
+
+/*
+ * Traversal for trees in eytzinger layout - a full binary tree laid out in an
+ * array
+ */
+
+/*
+ * One-based indexing version:
+ *
+ * With one-based indexing, each level of the tree starts at a power of two -
+ * good for cacheline alignment:
+ */
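
To make the layout concrete, here is a standalone userspace sketch (not part of the patch) that builds a 1-based eytzinger array from a sorted array by filling the implicit tree in-order - the median lands at index 1 and each tree level starts at a power-of-two index:

#include <stdio.h>

static void fill(const int *sorted, int *eytz, unsigned i, unsigned n,
		 unsigned *pos)
{
	if (i > n)
		return;
	fill(sorted, eytz, 2 * i, n, pos);	/* left subtree */
	eytz[i] = sorted[(*pos)++];		/* this node */
	fill(sorted, eytz, 2 * i + 1, n, pos);	/* right subtree */
}

int main(void)
{
	int sorted[] = { 10, 20, 30, 40, 50, 60, 70 };
	int eytz[8];
	unsigned i, pos = 0;

	fill(sorted, eytz, 1, 7, &pos);

	for (i = 1; i <= 7; i++)
		printf("%d ", eytz[i]);		/* 40 20 60 10 30 50 70 */
	printf("\n");
	return 0;
}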
+
+static inline unsigned eytzinger1_child(unsigned i, unsigned child)
+{
+ EBUG_ON(child > 1);
+
+ return (i << 1) + child;
+}
+
+static inline unsigned eytzinger1_left_child(unsigned i)
+{
+ return eytzinger1_child(i, 0);
+}
+
+static inline unsigned eytzinger1_right_child(unsigned i)
+{
+ return eytzinger1_child(i, 1);
+}
+
+static inline unsigned eytzinger1_first(unsigned size)
+{
+ return rounddown_pow_of_two(size);
+}
+
+static inline unsigned eytzinger1_last(unsigned size)
+{
+ return rounddown_pow_of_two(size + 1) - 1;
+}
+
+/*
+ * eytzinger1_next() and eytzinger1_prev() have the nice properties that
+ *
+ * eytzinger1_next(0) == eytzinger1_first()
+ * eytzinger1_prev(0) == eytzinger1_last()
+ *
+ * eytzinger1_prev(eytzinger1_first()) == 0
+ * eytzinger1_next(eytzinger1_last()) == 0
+ */
+
+static inline unsigned eytzinger1_next(unsigned i, unsigned size)
+{
+ EBUG_ON(i > size);
+
+ if (eytzinger1_right_child(i) <= size) {
+ i = eytzinger1_right_child(i);
+
+ i <<= __fls(size + 1) - __fls(i);
+ i >>= i > size;
+ } else {
+ i >>= ffz(i) + 1;
+ }
+
+ return i;
+}
+
+static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
+{
+ EBUG_ON(i > size);
+
+ if (eytzinger1_left_child(i) <= size) {
+ i = eytzinger1_left_child(i) + 1;
+
+ i <<= __fls(size + 1) - __fls(i);
+ i -= 1;
+ i >>= i > size;
+ } else {
+ i >>= __ffs(i) + 1;
+ }
+
+ return i;
+}
+
+static inline unsigned eytzinger1_extra(unsigned size)
+{
+ return (size + 1 - rounddown_pow_of_two(size)) << 1;
+}
+
+static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
+ unsigned extra)
+{
+ unsigned b = __fls(i);
+ unsigned shift = __fls(size) - b;
+ int s;
+
+ EBUG_ON(!i || i > size);
+
+ i ^= 1U << b;
+ i <<= 1;
+ i |= 1;
+ i <<= shift;
+
+ /*
+ * sign bit trick:
+ *
+ * if (i > extra)
+ * i -= (i - extra) >> 1;
+ */
+ s = extra - i;
+ i += (s >> 1) & (s >> 31);
+
+ return i;
+}
+
+static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
+ unsigned extra)
+{
+ unsigned shift;
+ int s;
+
+ EBUG_ON(!i || i > size);
+
+ /*
+ * sign bit trick:
+ *
+ * if (i > extra)
+ * i += i - extra;
+ */
+ s = extra - i;
+ i -= s & (s >> 31);
+
+ shift = __ffs(i);
+
+ i >>= shift + 1;
+ i |= 1U << (__fls(size) - shift);
+
+ return i;
+}
+
+static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size)
+{
+ return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size));
+}
+
+static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
+{
+ return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size));
+}
+
+#define eytzinger1_for_each(_i, _size) \
+ for ((_i) = eytzinger1_first((_size)); \
+ (_i) != 0; \
+ (_i) = eytzinger1_next((_i), (_size)))
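
A usage sketch for the iterator above (illustrative; assumes an eytzinger-ordered array eytz[] with valid entries at indices 1..nr): the walk starts at the leftmost node and visits entries in sorted (in-order) order.

	unsigned i;

	eytzinger1_for_each(i, nr)
		pr_info("eytz[%u] = %d\n", i, eytz[i]);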
+
+/* Zero-based indexing version: */
+
+static inline unsigned eytzinger0_child(unsigned i, unsigned child)
+{
+ EBUG_ON(child > 1);
+
+ return (i << 1) + 1 + child;
+}
+
+static inline unsigned eytzinger0_left_child(unsigned i)
+{
+ return eytzinger0_child(i, 0);
+}
+
+static inline unsigned eytzinger0_right_child(unsigned i)
+{
+ return eytzinger0_child(i, 1);
+}
+
+static inline unsigned eytzinger0_first(unsigned size)
+{
+ return eytzinger1_first(size) - 1;
+}
+
+static inline unsigned eytzinger0_last(unsigned size)
+{
+ return eytzinger1_last(size) - 1;
+}
+
+static inline unsigned eytzinger0_next(unsigned i, unsigned size)
+{
+ return eytzinger1_next(i + 1, size) - 1;
+}
+
+static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
+{
+ return eytzinger1_prev(i + 1, size) - 1;
+}
+
+static inline unsigned eytzinger0_extra(unsigned size)
+{
+ return eytzinger1_extra(size);
+}
+
+static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
+ unsigned extra)
+{
+ return __eytzinger1_to_inorder(i + 1, size, extra) - 1;
+}
+
+static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size,
+ unsigned extra)
+{
+ return __inorder_to_eytzinger1(i + 1, size, extra) - 1;
+}
+
+static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size)
+{
+ return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size));
+}
+
+static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
+{
+ return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size));
+}
+
+#define eytzinger0_for_each(_i, _size) \
+ for ((_i) = eytzinger0_first((_size)); \
+ (_i) != -1; \
+ (_i) = eytzinger0_next((_i), (_size)))
+
+typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
+
+/* return greatest node <= @search, or -1 if not found */
+static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
+ eytzinger_cmp_fn cmp, const void *search)
+{
+ unsigned i, n = 0;
+
+ if (!nr)
+ return -1;
+
+ do {
+ i = n;
+ n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
+ } while (n < nr);
+
+ if (n & 1) {
+ /* @i was greater than @search, return previous node: */
+
+ if (i == eytzinger0_first(nr))
+ return -1;
+
+ return eytzinger0_prev(i, nr);
+ } else {
+ return i;
+ }
+}
+
+#define eytzinger0_find(base, nr, size, _cmp, search) \
+({ \
+ void *_base = (base); \
+ void *_search = (search); \
+ size_t _nr = (nr); \
+ size_t _size = (size); \
+ size_t _i = 0; \
+ int _res; \
+ \
+ while (_i < _nr && \
+ (_res = _cmp(_search, _base + _i * _size, _size))) \
+ _i = eytzinger0_child(_i, _res > 0); \
+ _i; \
+})
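
A standalone sketch (userspace, illustrative data, not part of the patch) of the descent eytzinger0_find() performs: at node i the comparison result picks child 2*i + 1 or 2*i + 2 - exactly eytzinger0_child(i, res > 0) - until the index runs past the array.

#include <stdio.h>
#include <stddef.h>

int main(void)
{
	/* eytzinger0 layout of the sorted array { 10, 20, 30, 40, 50, 60, 70 } */
	int eytz[] = { 40, 20, 60, 10, 30, 50, 70 };
	size_t nr = 7, i = 0;
	int search = 50;

	while (i < nr && eytz[i] != search)
		i = 2 * i + 1 + (search > eytz[i]);

	if (i < nr)
		printf("found %d at eytzinger index %zu\n", search, i);
	return 0;
}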
+
+void eytzinger0_sort(void *, size_t, size_t,
+ int (*cmp_func)(const void *, const void *, size_t),
+ void (*swap_func)(void *, void *, size_t));
+
+#endif /* _EYTZINGER_H */
diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h
new file mode 100644
index 000000000000..66b945be10c2
--- /dev/null
+++ b/fs/bcachefs/fifo.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FIFO_H
+#define _BCACHEFS_FIFO_H
+
+#include "util.h"
+
+#define FIFO(type) \
+struct { \
+ size_t front, back, size, mask; \
+ type *data; \
+}
+
+#define DECLARE_FIFO(type, name) FIFO(type) name
+
+#define fifo_buf_size(fifo) \
+ ((fifo)->size \
+ ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \
+ : 0)
+
+#define init_fifo(fifo, _size, _gfp) \
+({ \
+ (fifo)->front = (fifo)->back = 0; \
+ (fifo)->size = (_size); \
+ (fifo)->mask = (fifo)->size \
+ ? roundup_pow_of_two((fifo)->size) - 1 \
+ : 0; \
+ (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \
+})
+
+#define free_fifo(fifo) \
+do { \
+ kvpfree((fifo)->data, fifo_buf_size(fifo)); \
+ (fifo)->data = NULL; \
+} while (0)
+
+#define fifo_swap(l, r) \
+do { \
+ swap((l)->front, (r)->front); \
+ swap((l)->back, (r)->back); \
+ swap((l)->size, (r)->size); \
+ swap((l)->mask, (r)->mask); \
+ swap((l)->data, (r)->data); \
+} while (0)
+
+#define fifo_move(dest, src) \
+do { \
+ typeof(*((dest)->data)) _t; \
+ while (!fifo_full(dest) && \
+ fifo_pop(src, _t)) \
+ fifo_push(dest, _t); \
+} while (0)
+
+#define fifo_used(fifo) (((fifo)->back - (fifo)->front))
+#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo))
+
+#define fifo_empty(fifo) ((fifo)->front == (fifo)->back)
+#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size)
+
+#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask])
+#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
+
+#define fifo_entry_idx_abs(fifo, p) \
+ ((((p) >= &fifo_peek_front(fifo) \
+ ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \
+ (((p) - (fifo)->data)))
+
+#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
+#define fifo_idx_entry(fifo, i) ((fifo)->data[((fifo)->front + (i)) & (fifo)->mask])
+
+#define fifo_push_back_ref(f) \
+ (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask])
+
+#define fifo_push_front_ref(f) \
+ (fifo_full((f)) ? NULL : &(f)->data[--(f)->front & (f)->mask])
+
+#define fifo_push_back(fifo, new) \
+({ \
+ typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \
+ if (_r) \
+ *_r = (new); \
+ _r != NULL; \
+})
+
+#define fifo_push_front(fifo, new) \
+({ \
+ typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \
+ if (_r) \
+ *_r = (new); \
+ _r != NULL; \
+})
+
+#define fifo_pop_front(fifo, i) \
+({ \
+ bool _r = !fifo_empty((fifo)); \
+ if (_r) \
+ (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \
+ _r; \
+})
+
+#define fifo_pop_back(fifo, i) \
+({ \
+ bool _r = !fifo_empty((fifo)); \
+ if (_r) \
+ (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \
+ _r; \
+})
+
+#define fifo_push_ref(fifo) fifo_push_back_ref(fifo)
+#define fifo_push(fifo, i) fifo_push_back(fifo, (i))
+#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i))
+#define fifo_peek(fifo) fifo_peek_front(fifo)
+
+#define fifo_for_each_entry(_entry, _fifo, _iter) \
+ for (typecheck(typeof((_fifo)->front), _iter), \
+ (_iter) = (_fifo)->front; \
+ ((_iter != (_fifo)->back) && \
+ (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \
+ (_iter)++)
+
+#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \
+ for (typecheck(typeof((_fifo)->front), _iter), \
+ (_iter) = (_fifo)->front; \
+ ((_iter != (_fifo)->back) && \
+ (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \
+ (_iter)++)
+
+#endif /* _BCACHEFS_FIFO_H */
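
A usage sketch for these macros (kernel context assumed; fifo_demo() is a hypothetical function, not part of the patch): declare a FIFO of u64s, size it, then push and pop through it. init_fifo() rounds the backing buffer up to a power of two so the push/pop macros can index it with a simple mask.

static int fifo_demo(void)
{
	FIFO(u64) q;
	u64 v;

	if (!init_fifo(&q, 8, GFP_KERNEL))
		return -ENOMEM;

	fifo_push(&q, 1);
	fifo_push(&q, 2);

	while (fifo_pop(&q, v))		/* pops 1, then 2 (FIFO order) */
		pr_info("popped %llu\n", v);

	free_fifo(&q);
	return 0;
}
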
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
new file mode 100644
index 000000000000..4496cf91a4c1
--- /dev/null
+++ b/fs/bcachefs/fs-common.c
@@ -0,0 +1,501 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "acl.h"
+#include "btree_update.h"
+#include "dirent.h"
+#include "fs-common.h"
+#include "inode.h"
+#include "subvolume.h"
+#include "xattr.h"
+
+#include <linux/posix_acl.h>
+
+static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode)
+{
+ return S_ISDIR(inode->bi_mode) && !inode->bi_subvol;
+}
+
+int bch2_create_trans(struct btree_trans *trans,
+ subvol_inum dir,
+ struct bch_inode_unpacked *dir_u,
+ struct bch_inode_unpacked *new_inode,
+ const struct qstr *name,
+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+ struct posix_acl *default_acl,
+ struct posix_acl *acl,
+ subvol_inum snapshot_src,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter dir_iter = { NULL };
+ struct btree_iter inode_iter = { NULL };
+ subvol_inum new_inum = dir;
+ u64 now = bch2_current_time(c);
+ u64 cpu = raw_smp_processor_id();
+ u64 dir_target;
+ u32 snapshot;
+ unsigned dir_type = mode_to_type(mode);
+ int ret;
+
+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ if (!(flags & BCH_CREATE_SNAPSHOT)) {
+ /* Normal create path - allocate a new inode: */
+ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
+
+ if (flags & BCH_CREATE_TMPFILE)
+ new_inode->bi_flags |= BCH_INODE_unlinked;
+
+ ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
+ if (ret)
+ goto err;
+
+ snapshot_src = (subvol_inum) { 0 };
+ } else {
+ /*
+ * Creating a snapshot - we're not allocating a new inode, but
+ * we do have to look up the root inode of the subvolume we're
+ * snapshotting and update it (in the new snapshot):
+ */
+
+ if (!snapshot_src.inum) {
+ /* Inode wasn't specified, just snapshot: */
+ struct bch_subvolume s;
+
+ ret = bch2_subvolume_get(trans, snapshot_src.subvol, true,
+ BTREE_ITER_CACHED, &s);
+ if (ret)
+ goto err;
+
+ snapshot_src.inum = le64_to_cpu(s.inode);
+ }
+
+ ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ if (new_inode->bi_subvol != snapshot_src.subvol) {
+ /* Not a subvolume root: */
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /*
+ * If we're not root, we have to own the subvolume being
+ * snapshotted:
+ */
+ if (uid && new_inode->bi_uid != uid) {
+ ret = -EPERM;
+ goto err;
+ }
+
+ flags |= BCH_CREATE_SUBVOL;
+ }
+
+ new_inum.inum = new_inode->bi_inum;
+ dir_target = new_inode->bi_inum;
+
+ if (flags & BCH_CREATE_SUBVOL) {
+ u32 new_subvol, dir_snapshot;
+
+ ret = bch2_subvolume_create(trans, new_inode->bi_inum,
+ snapshot_src.subvol,
+ &new_subvol, &snapshot,
+ (flags & BCH_CREATE_SNAPSHOT_RO) != 0);
+ if (ret)
+ goto err;
+
+ new_inode->bi_parent_subvol = dir.subvol;
+ new_inode->bi_subvol = new_subvol;
+ new_inum.subvol = new_subvol;
+ dir_target = new_subvol;
+ dir_type = DT_SUBVOL;
+
+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot);
+ ret = bch2_btree_iter_traverse(&dir_iter);
+ if (ret)
+ goto err;
+ }
+
+ if (!(flags & BCH_CREATE_SNAPSHOT)) {
+ if (default_acl) {
+ ret = bch2_set_acl_trans(trans, new_inum, new_inode,
+ default_acl, ACL_TYPE_DEFAULT);
+ if (ret)
+ goto err;
+ }
+
+ if (acl) {
+ ret = bch2_set_acl_trans(trans, new_inum, new_inode,
+ acl, ACL_TYPE_ACCESS);
+ if (ret)
+ goto err;
+ }
+ }
+
+ if (!(flags & BCH_CREATE_TMPFILE)) {
+ struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u);
+ u64 dir_offset;
+
+ if (is_subdir_for_nlink(new_inode))
+ dir_u->bi_nlink++;
+ dir_u->bi_mtime = dir_u->bi_ctime = now;
+
+ ret = bch2_inode_write(trans, &dir_iter, dir_u);
+ if (ret)
+ goto err;
+
+ ret = bch2_dirent_create(trans, dir, &dir_hash,
+ dir_type,
+ name,
+ dir_target,
+ &dir_offset,
+ BCH_HASH_SET_MUST_CREATE);
+ if (ret)
+ goto err;
+
+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+ new_inode->bi_dir = dir_u->bi_inum;
+ new_inode->bi_dir_offset = dir_offset;
+ }
+ }
+
+ inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+ bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
+
+ ret = bch2_btree_iter_traverse(&inode_iter) ?:
+ bch2_inode_write(trans, &inode_iter, new_inode);
+err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+ bch2_trans_iter_exit(trans, &dir_iter);
+ return ret;
+}
+
+int bch2_link_trans(struct btree_trans *trans,
+ subvol_inum dir, struct bch_inode_unpacked *dir_u,
+ subvol_inum inum, struct bch_inode_unpacked *inode_u,
+ const struct qstr *name)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter dir_iter = { NULL };
+ struct btree_iter inode_iter = { NULL };
+ struct bch_hash_info dir_hash;
+ u64 now = bch2_current_time(c);
+ u64 dir_offset = 0;
+ int ret;
+
+ if (dir.subvol != inum.subvol)
+ return -EXDEV;
+
+ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ inode_u->bi_ctime = now;
+ ret = bch2_inode_nlink_inc(inode_u);
+ if (ret)
+ return ret;
+
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ if (bch2_reinherit_attrs(inode_u, dir_u)) {
+ ret = -EXDEV;
+ goto err;
+ }
+
+ dir_u->bi_mtime = dir_u->bi_ctime = now;
+
+ dir_hash = bch2_hash_info_init(c, dir_u);
+
+ ret = bch2_dirent_create(trans, dir, &dir_hash,
+ mode_to_type(inode_u->bi_mode),
+ name, inum.inum, &dir_offset,
+ BCH_HASH_SET_MUST_CREATE);
+ if (ret)
+ goto err;
+
+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+ inode_u->bi_dir = dir.inum;
+ inode_u->bi_dir_offset = dir_offset;
+ }
+
+ ret = bch2_inode_write(trans, &dir_iter, dir_u) ?:
+ bch2_inode_write(trans, &inode_iter, inode_u);
+err:
+ bch2_trans_iter_exit(trans, &dir_iter);
+ bch2_trans_iter_exit(trans, &inode_iter);
+ return ret;
+}
+
+int bch2_unlink_trans(struct btree_trans *trans,
+ subvol_inum dir,
+ struct bch_inode_unpacked *dir_u,
+ struct bch_inode_unpacked *inode_u,
+ const struct qstr *name,
+ bool deleting_snapshot)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter dir_iter = { NULL };
+ struct btree_iter dirent_iter = { NULL };
+ struct btree_iter inode_iter = { NULL };
+ struct bch_hash_info dir_hash;
+ subvol_inum inum;
+ u64 now = bch2_current_time(c);
+ struct bkey_s_c k;
+ int ret;
+
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ dir_hash = bch2_hash_info_init(c, dir_u);
+
+ ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
+ name, &inum, BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) {
+ ret = bch2_empty_dir_trans(trans, inum);
+ if (ret)
+ goto err;
+ }
+
+ if (deleting_snapshot && !inode_u->bi_subvol) {
+ ret = -BCH_ERR_ENOENT_not_subvol;
+ goto err;
+ }
+
+ if (deleting_snapshot || inode_u->bi_subvol) {
+ ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol);
+ if (ret)
+ goto err;
+
+ k = bch2_btree_iter_peek_slot(&dirent_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ /*
+ * If we're deleting a subvolume, we need to really delete the
+ * dirent, not just emit a whiteout in the current snapshot:
+ */
+ bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(&dirent_iter);
+ if (ret)
+ goto err;
+ } else {
+ bch2_inode_nlink_dec(trans, inode_u);
+ }
+
+ if (inode_u->bi_dir == dirent_iter.pos.inode &&
+ inode_u->bi_dir_offset == dirent_iter.pos.offset) {
+ inode_u->bi_dir = 0;
+ inode_u->bi_dir_offset = 0;
+ }
+
+ dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
+ dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
+
+ ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ &dir_hash, &dirent_iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_inode_write(trans, &dir_iter, dir_u) ?:
+ bch2_inode_write(trans, &inode_iter, inode_u);
+err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+ bch2_trans_iter_exit(trans, &dirent_iter);
+ bch2_trans_iter_exit(trans, &dir_iter);
+ return ret;
+}
+
+bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
+ struct bch_inode_unpacked *src_u)
+{
+ u64 src, dst;
+ unsigned id;
+ bool ret = false;
+
+ for (id = 0; id < Inode_opt_nr; id++) {
+ /* Skip attributes that were explicitly set on this inode */
+ if (dst_u->bi_fields_set & (1 << id))
+ continue;
+
+ src = bch2_inode_opt_get(src_u, id);
+ dst = bch2_inode_opt_get(dst_u, id);
+
+ if (src == dst)
+ continue;
+
+ bch2_inode_opt_set(dst_u, id, src);
+ ret = true;
+ }
+
+ return ret;
+}
+
+int bch2_rename_trans(struct btree_trans *trans,
+ subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
+ subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
+ struct bch_inode_unpacked *src_inode_u,
+ struct bch_inode_unpacked *dst_inode_u,
+ const struct qstr *src_name,
+ const struct qstr *dst_name,
+ enum bch_rename_mode mode)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter src_dir_iter = { NULL };
+ struct btree_iter dst_dir_iter = { NULL };
+ struct btree_iter src_inode_iter = { NULL };
+ struct btree_iter dst_inode_iter = { NULL };
+ struct bch_hash_info src_hash, dst_hash;
+ subvol_inum src_inum, dst_inum;
+ u64 src_offset, dst_offset;
+ u64 now = bch2_current_time(c);
+ int ret;
+
+ ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ src_hash = bch2_hash_info_init(c, src_dir_u);
+
+ if (dst_dir.inum != src_dir.inum ||
+ dst_dir.subvol != src_dir.subvol) {
+ ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ dst_hash = bch2_hash_info_init(c, dst_dir_u);
+ } else {
+ dst_dir_u = src_dir_u;
+ dst_hash = src_hash;
+ }
+
+ ret = bch2_dirent_rename(trans,
+ src_dir, &src_hash,
+ dst_dir, &dst_hash,
+ src_name, &src_inum, &src_offset,
+ dst_name, &dst_inum, &dst_offset,
+ mode);
+ if (ret)
+ goto err;
+
+ ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ if (dst_inum.inum) {
+ ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+ }
+
+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+ src_inode_u->bi_dir = dst_dir_u->bi_inum;
+ src_inode_u->bi_dir_offset = dst_offset;
+
+ if (mode == BCH_RENAME_EXCHANGE) {
+ dst_inode_u->bi_dir = src_dir_u->bi_inum;
+ dst_inode_u->bi_dir_offset = src_offset;
+ }
+
+ if (mode == BCH_RENAME_OVERWRITE &&
+ dst_inode_u->bi_dir == dst_dir_u->bi_inum &&
+ dst_inode_u->bi_dir_offset == src_offset) {
+ dst_inode_u->bi_dir = 0;
+ dst_inode_u->bi_dir_offset = 0;
+ }
+ }
+
+ if (mode == BCH_RENAME_OVERWRITE) {
+ if (S_ISDIR(src_inode_u->bi_mode) !=
+ S_ISDIR(dst_inode_u->bi_mode)) {
+ ret = -ENOTDIR;
+ goto err;
+ }
+
+ if (S_ISDIR(dst_inode_u->bi_mode) &&
+ bch2_empty_dir_trans(trans, dst_inum)) {
+ ret = -ENOTEMPTY;
+ goto err;
+ }
+ }
+
+ if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) &&
+ S_ISDIR(src_inode_u->bi_mode)) {
+ ret = -EXDEV;
+ goto err;
+ }
+
+ if (mode == BCH_RENAME_EXCHANGE &&
+ bch2_reinherit_attrs(dst_inode_u, src_dir_u) &&
+ S_ISDIR(dst_inode_u->bi_mode)) {
+ ret = -EXDEV;
+ goto err;
+ }
+
+ if (is_subdir_for_nlink(src_inode_u)) {
+ src_dir_u->bi_nlink--;
+ dst_dir_u->bi_nlink++;
+ }
+
+ if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
+ dst_dir_u->bi_nlink--;
+ src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
+ }
+
+ if (mode == BCH_RENAME_OVERWRITE)
+ bch2_inode_nlink_dec(trans, dst_inode_u);
+
+ src_dir_u->bi_mtime = now;
+ src_dir_u->bi_ctime = now;
+
+ if (src_dir.inum != dst_dir.inum) {
+ dst_dir_u->bi_mtime = now;
+ dst_dir_u->bi_ctime = now;
+ }
+
+ src_inode_u->bi_ctime = now;
+
+ if (dst_inum.inum)
+ dst_inode_u->bi_ctime = now;
+
+ ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?:
+ (src_dir.inum != dst_dir.inum
+ ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u)
+ : 0) ?:
+ bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?:
+ (dst_inum.inum
+ ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u)
+ : 0);
+err:
+ bch2_trans_iter_exit(trans, &dst_inode_iter);
+ bch2_trans_iter_exit(trans, &src_inode_iter);
+ bch2_trans_iter_exit(trans, &dst_dir_iter);
+ bch2_trans_iter_exit(trans, &src_dir_iter);
+ return ret;
+}
diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h
new file mode 100644
index 000000000000..dde237859514
--- /dev/null
+++ b/fs/bcachefs/fs-common.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_COMMON_H
+#define _BCACHEFS_FS_COMMON_H
+
+struct posix_acl;
+
+#define BCH_CREATE_TMPFILE (1U << 0)
+#define BCH_CREATE_SUBVOL (1U << 1)
+#define BCH_CREATE_SNAPSHOT (1U << 2)
+#define BCH_CREATE_SNAPSHOT_RO (1U << 3)
+
+int bch2_create_trans(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *,
+ const struct qstr *,
+ uid_t, gid_t, umode_t, dev_t,
+ struct posix_acl *,
+ struct posix_acl *,
+ subvol_inum, unsigned);
+
+int bch2_link_trans(struct btree_trans *,
+ subvol_inum, struct bch_inode_unpacked *,
+ subvol_inum, struct bch_inode_unpacked *,
+ const struct qstr *);
+
+int bch2_unlink_trans(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *,
+ const struct qstr *, bool);
+
+int bch2_rename_trans(struct btree_trans *,
+ subvol_inum, struct bch_inode_unpacked *,
+ subvol_inum, struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *,
+ const struct qstr *,
+ const struct qstr *,
+ enum bch_rename_mode);
+
+bool bch2_reinherit_attrs(struct bch_inode_unpacked *,
+ struct bch_inode_unpacked *);
+
+#endif /* _BCACHEFS_FS_COMMON_H */
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
new file mode 100644
index 000000000000..52f0e7acda3d
--- /dev/null
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -0,0 +1,1106 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "fs-io.h"
+#include "fs-io-buffered.h"
+#include "fs-io-direct.h"
+#include "fs-io-pagecache.h"
+#include "io_read.h"
+#include "io_write.h"
+
+#include <linux/backing-dev.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+
+static inline bool bio_full(struct bio *bio, unsigned len)
+{
+ if (bio->bi_vcnt >= bio->bi_max_vecs)
+ return true;
+ if (bio->bi_iter.bi_size > UINT_MAX - len)
+ return true;
+ return false;
+}
+
+/* readpage(s): */
+
+static void bch2_readpages_end_io(struct bio *bio)
+{
+ struct folio_iter fi;
+
+ bio_for_each_folio_all(fi, bio) {
+ if (!bio->bi_status) {
+ folio_mark_uptodate(fi.folio);
+ } else {
+ folio_clear_uptodate(fi.folio);
+ folio_set_error(fi.folio);
+ }
+ folio_unlock(fi.folio);
+ }
+
+ bio_put(bio);
+}
+
+struct readpages_iter {
+ struct address_space *mapping;
+ unsigned idx;
+ folios folios;
+};
+
+static int readpages_iter_init(struct readpages_iter *iter,
+ struct readahead_control *ractl)
+{
+ struct folio **fi;
+ int ret;
+
+ memset(iter, 0, sizeof(*iter));
+
+ iter->mapping = ractl->mapping;
+
+ ret = bch2_filemap_get_contig_folios_d(iter->mapping,
+ ractl->_index << PAGE_SHIFT,
+ (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT,
+ 0, mapping_gfp_mask(iter->mapping),
+ &iter->folios);
+ if (ret)
+ return ret;
+
+ darray_for_each(iter->folios, fi) {
+ ractl->_nr_pages -= 1U << folio_order(*fi);
+ __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL);
+ folio_put(*fi);
+ folio_put(*fi);
+ }
+
+ return 0;
+}
+
+static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
+{
+ if (iter->idx >= iter->folios.nr)
+ return NULL;
+ return iter->folios.data[iter->idx];
+}
+
+static inline void readpage_iter_advance(struct readpages_iter *iter)
+{
+ iter->idx++;
+}
+
+static bool extent_partial_reads_expensive(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_extent_crc_unpacked crc;
+ const union bch_extent_entry *i;
+
+ bkey_for_each_crc(k.k, ptrs, crc, i)
+ if (crc.csum_type || crc.compression_type)
+ return true;
+ return false;
+}
+
+static int readpage_bio_extend(struct btree_trans *trans,
+ struct readpages_iter *iter,
+ struct bio *bio,
+ unsigned sectors_this_extent,
+ bool get_more)
+{
+ /* Don't hold btree locks while allocating memory: */
+ bch2_trans_unlock(trans);
+
+ while (bio_sectors(bio) < sectors_this_extent &&
+ bio->bi_vcnt < bio->bi_max_vecs) {
+ struct folio *folio = readpage_iter_peek(iter);
+ int ret;
+
+ if (folio) {
+ readpage_iter_advance(iter);
+ } else {
+ pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
+
+ if (!get_more)
+ break;
+
+ folio = xa_load(&iter->mapping->i_pages, folio_offset);
+ if (folio && !xa_is_value(folio))
+ break;
+
+ folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
+ if (!folio)
+ break;
+
+ if (!__bch2_folio_create(folio, GFP_KERNEL)) {
+ folio_put(folio);
+ break;
+ }
+
+ ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL);
+ if (ret) {
+ __bch2_folio_release(folio);
+ folio_put(folio);
+ break;
+ }
+
+ folio_put(folio);
+ }
+
+ BUG_ON(folio_sector(folio) != bio_end_sector(bio));
+
+ BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0));
+ }
+
+ return bch2_trans_relock(trans);
+}
+
+static void bchfs_read(struct btree_trans *trans,
+ struct bch_read_bio *rbio,
+ subvol_inum inum,
+ struct readpages_iter *readpages_iter)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_buf sk;
+ int flags = BCH_READ_RETRY_IF_STALE|
+ BCH_READ_MAY_PROMOTE;
+ u32 snapshot;
+ int ret = 0;
+
+ rbio->c = c;
+ rbio->start_time = local_clock();
+ rbio->subvol = inum.subvol;
+
+ bch2_bkey_buf_init(&sk);
+retry:
+ bch2_trans_begin(trans);
+ iter = (struct btree_iter) { NULL };
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
+ BTREE_ITER_SLOTS);
+ while (1) {
+ struct bkey_s_c k;
+ unsigned bytes, sectors, offset_into_extent;
+ enum btree_id data_btree = BTREE_ID_extents;
+
+ /*
+ * read_extent -> io_time_reset may cause a transaction restart
+ * without returning an error; we need to check for that here:
+ */
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ break;
+
+ bch2_btree_iter_set_pos(&iter,
+ POS(inum.inum, rbio->bio.bi_iter.bi_sector));
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ break;
+
+ offset_into_extent = iter.pos.offset -
+ bkey_start_offset(k.k);
+ sectors = k.k->size - offset_into_extent;
+
+ bch2_bkey_buf_reassemble(&sk, c, k);
+
+ ret = bch2_read_indirect_extent(trans, &data_btree,
+ &offset_into_extent, &sk);
+ if (ret)
+ break;
+
+ k = bkey_i_to_s_c(sk.k);
+
+ sectors = min(sectors, k.k->size - offset_into_extent);
+
+ if (readpages_iter) {
+ ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
+ extent_partial_reads_expensive(k));
+ if (ret)
+ break;
+ }
+
+ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
+ swap(rbio->bio.bi_iter.bi_size, bytes);
+
+ if (rbio->bio.bi_iter.bi_size == bytes)
+ flags |= BCH_READ_LAST_FRAGMENT;
+
+ bch2_bio_page_state_set(&rbio->bio, k);
+
+ bch2_read_extent(trans, rbio, iter.pos,
+ data_btree, k, offset_into_extent, flags);
+
+ if (flags & BCH_READ_LAST_FRAGMENT)
+ break;
+
+ swap(rbio->bio.bi_iter.bi_size, bytes);
+ bio_advance(&rbio->bio, bytes);
+
+ ret = btree_trans_too_many_iters(trans);
+ if (ret)
+ break;
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ if (ret) {
+ bch_err_inum_offset_ratelimited(c,
+ iter.pos.inode,
+ iter.pos.offset << 9,
+ "read error %i from btree lookup", ret);
+ rbio->bio.bi_status = BLK_STS_IOERR;
+ bio_endio(&rbio->bio);
+ }
+
+ bch2_bkey_buf_exit(&sk, c);
+}
+
+void bch2_readahead(struct readahead_control *ractl)
+{
+ struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_io_opts opts;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct folio *folio;
+ struct readpages_iter readpages_iter;
+ int ret;
+
+ bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+ ret = readpages_iter_init(&readpages_iter, ractl);
+ BUG_ON(ret);
+
+ bch2_pagecache_add_get(inode);
+
+ while ((folio = readpage_iter_peek(&readpages_iter))) {
+ unsigned n = min_t(unsigned,
+ readpages_iter.folios.nr -
+ readpages_iter.idx,
+ BIO_MAX_VECS);
+ struct bch_read_bio *rbio =
+ rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
+ GFP_KERNEL, &c->bio_read),
+ opts);
+
+ readpage_iter_advance(&readpages_iter);
+
+ rbio->bio.bi_iter.bi_sector = folio_sector(folio);
+ rbio->bio.bi_end_io = bch2_readpages_end_io;
+ BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
+
+ bchfs_read(trans, rbio, inode_inum(inode),
+ &readpages_iter);
+ bch2_trans_unlock(trans);
+ }
+
+ bch2_pagecache_add_put(inode);
+
+ bch2_trans_put(trans);
+ darray_exit(&readpages_iter.folios);
+}
+
+static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio,
+ subvol_inum inum, struct folio *folio)
+{
+ bch2_folio_create(folio, __GFP_NOFAIL);
+
+ rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
+ rbio->bio.bi_iter.bi_sector = folio_sector(folio);
+ BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
+
+ bch2_trans_run(c, (bchfs_read(trans, rbio, inum, NULL), 0));
+}
+
+static void bch2_read_single_folio_end_io(struct bio *bio)
+{
+ complete(bio->bi_private);
+}
+
+int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
+{
+ struct bch_inode_info *inode = to_bch_ei(mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_read_bio *rbio;
+ struct bch_io_opts opts;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(done);
+
+ bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+ rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
+ opts);
+ rbio->bio.bi_private = &done;
+ rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
+
+ __bchfs_readfolio(c, rbio, inode_inum(inode), folio);
+ wait_for_completion(&done);
+
+ ret = blk_status_to_errno(rbio->bio.bi_status);
+ bio_put(&rbio->bio);
+
+ if (ret < 0)
+ return ret;
+
+ folio_mark_uptodate(folio);
+ return 0;
+}
+
+int bch2_read_folio(struct file *file, struct folio *folio)
+{
+ int ret;
+
+ ret = bch2_read_single_folio(folio, folio->mapping);
+ folio_unlock(folio);
+ return bch2_err_class(ret);
+}
+
+/* writepages: */
+
+struct bch_writepage_io {
+ struct bch_inode_info *inode;
+
+ /* must be last: */
+ struct bch_write_op op;
+};
+
+struct bch_writepage_state {
+ struct bch_writepage_io *io;
+ struct bch_io_opts opts;
+ struct bch_folio_sector *tmp;
+ unsigned tmp_sectors;
+};
+
+static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
+ struct bch_inode_info *inode)
+{
+ struct bch_writepage_state ret = { 0 };
+
+ bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
+ return ret;
+}
+
+/*
+ * Determine when a writepage io is full. We have to limit writepage bios to a
+ * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to
+ * what the bounce path in bch2_write_extent() can handle. In theory we could
+ * loosen this restriction for non-bounce I/O, but we don't have that context
+ * here. Ideally, we can up this limit and make it configurable in the future
+ * when the bounce path can be enhanced to accommodate larger source bios.
+ */
+static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len)
+{
+ struct bio *bio = &io->op.wbio.bio;
+ return bio_full(bio, len) ||
+ (bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE);
+}
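
Back-of-the-envelope for the limit described above (standalone sketch; the usual BIO_MAX_VECS of 256 and 4k pages are assumed, illustrative constants only):

#include <stdio.h>

int main(void)
{
	unsigned max_vecs = 256, page_size = 4096;

	printf("max writepage bio: %u KiB\n", max_vecs * page_size / 1024);
	return 0;
}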
+
+static void bch2_writepage_io_done(struct bch_write_op *op)
+{
+ struct bch_writepage_io *io =
+ container_of(op, struct bch_writepage_io, op);
+ struct bch_fs *c = io->op.c;
+ struct bio *bio = &io->op.wbio.bio;
+ struct folio_iter fi;
+ unsigned i;
+
+ if (io->op.error) {
+ set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
+
+ bio_for_each_folio_all(fi, bio) {
+ struct bch_folio *s;
+
+ folio_set_error(fi.folio);
+ mapping_set_error(fi.folio->mapping, -EIO);
+
+ s = __bch2_folio(fi.folio);
+ spin_lock(&s->lock);
+ for (i = 0; i < folio_sectors(fi.folio); i++)
+ s->s[i].nr_replicas = 0;
+ spin_unlock(&s->lock);
+ }
+ }
+
+ if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
+ bio_for_each_folio_all(fi, bio) {
+ struct bch_folio *s;
+
+ s = __bch2_folio(fi.folio);
+ spin_lock(&s->lock);
+ for (i = 0; i < folio_sectors(fi.folio); i++)
+ s->s[i].nr_replicas = 0;
+ spin_unlock(&s->lock);
+ }
+ }
+
+ /*
+ * racing with fallocate can cause us to add fewer sectors than
+ * expected - but we shouldn't add more sectors than expected:
+ */
+ WARN_ON_ONCE(io->op.i_sectors_delta > 0);
+
+ /*
+ * (error (due to going RO) halfway through a page can screw that up
+ * slightly)
+ * XXX wtf?
+ BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
+ */
+
+ /*
+ * PageWriteback is effectively our ref on the inode - fix up i_blocks
+ * before calling end_page_writeback:
+ */
+ bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
+
+ bio_for_each_folio_all(fi, bio) {
+ struct bch_folio *s = __bch2_folio(fi.folio);
+
+ if (atomic_dec_and_test(&s->write_count))
+ folio_end_writeback(fi.folio);
+ }
+
+ bio_put(&io->op.wbio.bio);
+}
+
+static void bch2_writepage_do_io(struct bch_writepage_state *w)
+{
+ struct bch_writepage_io *io = w->io;
+
+ w->io = NULL;
+ closure_call(&io->op.cl, bch2_write, NULL, NULL);
+}
+
+/*
+ * Get a bch_writepage_io and add @page to it - appending to an existing one if
+ * possible, else allocating a new one:
+ */
+static void bch2_writepage_io_alloc(struct bch_fs *c,
+ struct writeback_control *wbc,
+ struct bch_writepage_state *w,
+ struct bch_inode_info *inode,
+ u64 sector,
+ unsigned nr_replicas)
+{
+ struct bch_write_op *op;
+
+ w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
+ REQ_OP_WRITE,
+ GFP_KERNEL,
+ &c->writepage_bioset),
+ struct bch_writepage_io, op.wbio.bio);
+
+ w->io->inode = inode;
+ op = &w->io->op;
+ bch2_write_op_init(op, c, w->opts);
+ op->target = w->opts.foreground_target;
+ op->nr_replicas = nr_replicas;
+ op->res.nr_replicas = nr_replicas;
+ op->write_point = writepoint_hashed(inode->ei_last_dirtied);
+ op->subvol = inode->ei_subvol;
+ op->pos = POS(inode->v.i_ino, sector);
+ op->end_io = bch2_writepage_io_done;
+ op->devs_need_flush = &inode->ei_devs_need_flush;
+ op->wbio.bio.bi_iter.bi_sector = sector;
+ op->wbio.bio.bi_opf = wbc_to_write_flags(wbc);
+}
+
+static int __bch2_writepage(struct folio *folio,
+ struct writeback_control *wbc,
+ void *data)
+{
+ struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_writepage_state *w = data;
+ struct bch_folio *s;
+ unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX;
+ loff_t i_size = i_size_read(&inode->v);
+ int ret;
+
+ EBUG_ON(!folio_test_uptodate(folio));
+
+ /* Is the folio fully inside i_size? */
+ if (folio_end_pos(folio) <= i_size)
+ goto do_io;
+
+ /* Is the folio fully outside i_size? (truncate in progress) */
+ if (folio_pos(folio) >= i_size) {
+ folio_unlock(folio);
+ return 0;
+ }
+
+ /*
+ * The folio straddles i_size. It must be zeroed out on each and every
+ * writepage invocation because it may be mmapped. "A file is mapped
+ * in multiples of the folio size. For a file that is not a multiple of
+ * the folio size, the remaining memory is zeroed when mapped, and
+ * writes to that region are not written out to the file."
+ */
+ folio_zero_segment(folio,
+ i_size - folio_pos(folio),
+ folio_size(folio));
+do_io:
+ f_sectors = folio_sectors(folio);
+ s = bch2_folio(folio);
+
+ if (f_sectors > w->tmp_sectors) {
+ kfree(w->tmp);
+ w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL);
+ w->tmp_sectors = f_sectors;
+ }
+
+ /*
+ * Things get really hairy with errors during writeback:
+ */
+ ret = bch2_get_folio_disk_reservation(c, inode, folio, false);
+ BUG_ON(ret);
+
+ /* Before unlocking the page, get copy of reservations: */
+ spin_lock(&s->lock);
+ memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors);
+
+ for (i = 0; i < f_sectors; i++) {
+ if (s->s[i].state < SECTOR_dirty)
+ continue;
+
+ nr_replicas_this_write =
+ min_t(unsigned, nr_replicas_this_write,
+ s->s[i].nr_replicas +
+ s->s[i].replicas_reserved);
+ }
+
+ for (i = 0; i < f_sectors; i++) {
+ if (s->s[i].state < SECTOR_dirty)
+ continue;
+
+ s->s[i].nr_replicas = w->opts.compression
+ ? 0 : nr_replicas_this_write;
+
+ s->s[i].replicas_reserved = 0;
+ bch2_folio_sector_set(folio, s, i, SECTOR_allocated);
+ }
+ spin_unlock(&s->lock);
+
+ BUG_ON(atomic_read(&s->write_count));
+ atomic_set(&s->write_count, 1);
+
+ BUG_ON(folio_test_writeback(folio));
+ folio_start_writeback(folio);
+
+ folio_unlock(folio);
+
+ offset = 0;
+ while (1) {
+ unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
+ u64 sector;
+
+ while (offset < f_sectors &&
+ w->tmp[offset].state < SECTOR_dirty)
+ offset++;
+
+ if (offset == f_sectors)
+ break;
+
+ while (offset + sectors < f_sectors &&
+ w->tmp[offset + sectors].state >= SECTOR_dirty) {
+ reserved_sectors += w->tmp[offset + sectors].replicas_reserved;
+ dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty;
+ sectors++;
+ }
+ BUG_ON(!sectors);
+
+ sector = folio_sector(folio) + offset;
+
+ if (w->io &&
+ (w->io->op.res.nr_replicas != nr_replicas_this_write ||
+ bch_io_full(w->io, sectors << 9) ||
+ bio_end_sector(&w->io->op.wbio.bio) != sector))
+ bch2_writepage_do_io(w);
+
+ if (!w->io)
+ bch2_writepage_io_alloc(c, wbc, w, inode, sector,
+ nr_replicas_this_write);
+
+ atomic_inc(&s->write_count);
+
+ BUG_ON(inode != w->io->inode);
+ BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
+ sectors << 9, offset << 9));
+
+ /* Check for writing past i_size: */
+ WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
+ round_up(i_size, block_bytes(c)) &&
+ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags),
+ "writing past i_size: %llu > %llu (unrounded %llu)\n",
+ bio_end_sector(&w->io->op.wbio.bio) << 9,
+ round_up(i_size, block_bytes(c)),
+ i_size);
+
+ w->io->op.res.sectors += reserved_sectors;
+ w->io->op.i_sectors_delta -= dirty_sectors;
+ w->io->op.new_i_size = i_size;
+
+ offset += sectors;
+ }
+
+ if (atomic_dec_and_test(&s->write_count))
+ folio_end_writeback(folio);
+
+ return 0;
+}
+
+int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+ struct bch_fs *c = mapping->host->i_sb->s_fs_info;
+ struct bch_writepage_state w =
+ bch_writepage_state_init(c, to_bch_ei(mapping->host));
+ struct blk_plug plug;
+ int ret;
+
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
+ if (w.io)
+ bch2_writepage_do_io(&w);
+ blk_finish_plug(&plug);
+ kfree(w.tmp);
+ return bch2_err_class(ret);
+}
+
+/* buffered writes: */
+
+int bch2_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct page **pagep, void **fsdata)
+{
+ struct bch_inode_info *inode = to_bch_ei(mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch2_folio_reservation *res;
+ struct folio *folio;
+ unsigned offset;
+ int ret = -ENOMEM;
+
+ res = kmalloc(sizeof(*res), GFP_KERNEL);
+ if (!res)
+ return -ENOMEM;
+
+ bch2_folio_reservation_init(c, inode, res);
+ *fsdata = res;
+
+ bch2_pagecache_add_get(inode);
+
+ folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
+ FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR_OR_NULL(folio))
+ goto err_unlock;
+
+ offset = pos - folio_pos(folio);
+ len = min_t(size_t, len, folio_end_pos(folio) - pos);
+
+ if (folio_test_uptodate(folio))
+ goto out;
+
+ /* If we're writing entire folio, don't need to read it in first: */
+ if (!offset && len == folio_size(folio))
+ goto out;
+
+ if (!offset && pos + len >= inode->v.i_size) {
+ folio_zero_segment(folio, len, folio_size(folio));
+ flush_dcache_folio(folio);
+ goto out;
+ }
+
+ if (folio_pos(folio) >= inode->v.i_size) {
+ folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
+ flush_dcache_folio(folio);
+ goto out;
+ }
+readpage:
+ ret = bch2_read_single_folio(folio, mapping);
+ if (ret)
+ goto err;
+out:
+ ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
+ if (ret)
+ goto err;
+
+ ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
+ if (ret) {
+ if (!folio_test_uptodate(folio)) {
+ /*
+ * If the folio hasn't been read in, we won't know if we
+ * actually need a reservation - we don't actually need
+ * to read here, we just need to check if the folio is
+ * fully backed by uncompressed data:
+ */
+ goto readpage;
+ }
+
+ goto err;
+ }
+
+ *pagep = &folio->page;
+ return 0;
+err:
+ folio_unlock(folio);
+ folio_put(folio);
+ *pagep = NULL;
+err_unlock:
+ bch2_pagecache_add_put(inode);
+ kfree(res);
+ *fsdata = NULL;
+ return bch2_err_class(ret);
+}
+
+int bch2_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct bch_inode_info *inode = to_bch_ei(mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch2_folio_reservation *res = fsdata;
+ struct folio *folio = page_folio(page);
+ unsigned offset = pos - folio_pos(folio);
+
+ lockdep_assert_held(&inode->v.i_rwsem);
+ BUG_ON(offset + copied > folio_size(folio));
+
+ if (unlikely(copied < len && !folio_test_uptodate(folio))) {
+ /*
+ * The folio needs to be read in, but that would destroy
+ * our partial write - simplest thing is to just force
+ * userspace to redo the write:
+ */
+ folio_zero_range(folio, 0, folio_size(folio));
+ flush_dcache_folio(folio);
+ copied = 0;
+ }
+
+ spin_lock(&inode->v.i_lock);
+ if (pos + copied > inode->v.i_size)
+ i_size_write(&inode->v, pos + copied);
+ spin_unlock(&inode->v.i_lock);
+
+ if (copied) {
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
+
+ bch2_set_folio_dirty(c, inode, folio, res, offset, copied);
+
+ inode->ei_last_dirtied = (unsigned long) current;
+ }
+
+ folio_unlock(folio);
+ folio_put(folio);
+ bch2_pagecache_add_put(inode);
+
+ bch2_folio_reservation_put(c, inode, res);
+ kfree(res);
+
+ return copied;
+}
+
+static noinline void folios_trunc(folios *fs, struct folio **fi)
+{
+ while (fs->data + fs->nr > fi) {
+ struct folio *f = darray_pop(fs);
+
+ folio_unlock(f);
+ folio_put(f);
+ }
+}
+
+static int __bch2_buffered_write(struct bch_inode_info *inode,
+ struct address_space *mapping,
+ struct iov_iter *iter,
+ loff_t pos, unsigned len)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch2_folio_reservation res;
+ folios fs;
+ struct folio **fi, *f;
+ unsigned copied = 0, f_offset, f_copied;
+ u64 end = pos + len, f_pos, f_len;
+ loff_t last_folio_pos = inode->v.i_size;
+ int ret = 0;
+
+ BUG_ON(!len);
+
+ bch2_folio_reservation_init(c, inode, &res);
+ darray_init(&fs);
+
+ ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
+ FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT,
+ mapping_gfp_mask(mapping),
+ &fs);
+ if (ret)
+ goto out;
+
+ BUG_ON(!fs.nr);
+
+ f = darray_first(fs);
+ if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
+ ret = bch2_read_single_folio(f, mapping);
+ if (ret)
+ goto out;
+ }
+
+ f = darray_last(fs);
+ end = min(end, folio_end_pos(f));
+ last_folio_pos = folio_pos(f);
+ if (end != folio_end_pos(f) && !folio_test_uptodate(f)) {
+ if (end >= inode->v.i_size) {
+ folio_zero_range(f, 0, folio_size(f));
+ } else {
+ ret = bch2_read_single_folio(f, mapping);
+ if (ret)
+ goto out;
+ }
+ }
+
+ ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr);
+ if (ret)
+ goto out;
+
+ f_pos = pos;
+ f_offset = pos - folio_pos(darray_first(fs));
+ darray_for_each(fs, fi) {
+ f = *fi;
+ f_len = min(end, folio_end_pos(f)) - f_pos;
+
+ /*
+ * XXX: per POSIX and fstests generic/275, on -ENOSPC we're
+ * supposed to write as much as we have disk space for.
+ *
+ * On failure here we should still write out a partial page if
+ * we aren't completely out of disk space - we don't do that
+ * yet:
+ */
+ ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len);
+ if (unlikely(ret)) {
+ folios_trunc(&fs, fi);
+ if (!fs.nr)
+ goto out;
+
+ end = min(end, folio_end_pos(darray_last(fs)));
+ break;
+ }
+
+ f_pos = folio_end_pos(f);
+ f_offset = 0;
+ }
+
+ if (mapping_writably_mapped(mapping))
+ darray_for_each(fs, fi)
+ flush_dcache_folio(*fi);
+
+ f_pos = pos;
+ f_offset = pos - folio_pos(darray_first(fs));
+ darray_for_each(fs, fi) {
+ f = *fi;
+ f_len = min(end, folio_end_pos(f)) - f_pos;
+ f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter);
+ if (!f_copied) {
+ folios_trunc(&fs, fi);
+ break;
+ }
+
+ if (!folio_test_uptodate(f) &&
+ f_copied != folio_size(f) &&
+ pos + copied + f_copied < inode->v.i_size) {
+ iov_iter_revert(iter, f_copied);
+ folio_zero_range(f, 0, folio_size(f));
+ folios_trunc(&fs, fi);
+ break;
+ }
+
+ flush_dcache_folio(f);
+ copied += f_copied;
+
+ if (f_copied != f_len) {
+ folios_trunc(&fs, fi + 1);
+ break;
+ }
+
+ f_pos = folio_end_pos(f);
+ f_offset = 0;
+ }
+
+ if (!copied)
+ goto out;
+
+ end = pos + copied;
+
+ spin_lock(&inode->v.i_lock);
+ if (end > inode->v.i_size)
+ i_size_write(&inode->v, end);
+ spin_unlock(&inode->v.i_lock);
+
+ f_pos = pos;
+ f_offset = pos - folio_pos(darray_first(fs));
+ darray_for_each(fs, fi) {
+ f = *fi;
+ f_len = min(end, folio_end_pos(f)) - f_pos;
+
+ if (!folio_test_uptodate(f))
+ folio_mark_uptodate(f);
+
+ bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len);
+
+ f_pos = folio_end_pos(f);
+ f_offset = 0;
+ }
+
+ inode->ei_last_dirtied = (unsigned long) current;
+out:
+ darray_for_each(fs, fi) {
+ folio_unlock(*fi);
+ folio_put(*fi);
+ }
+
+ /*
+ * If the last folio added to the mapping starts beyond current EOF, we
+ * performed a short write but left around at least one post-EOF folio.
+ * Clean up the mapping before we return.
+ */
+ if (last_folio_pos >= inode->v.i_size)
+ truncate_pagecache(&inode->v, inode->v.i_size);
+
+ darray_exit(&fs);
+ bch2_folio_reservation_put(c, inode, &res);
+
+ return copied ?: ret;
+}
+
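+/*
+ * Outer buffered write loop: fault in the source pages before each copy (to
+ * avoid deadlocking against a fault on the folios we're writing to), call
+ * __bch2_buffered_write() for each chunk, and fall back to single segment
+ * copies when no progress could be made. Dirty page balancing happens per
+ * iteration.
+ */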
+static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ loff_t pos = iocb->ki_pos;
+ ssize_t written = 0;
+ int ret = 0;
+
+ bch2_pagecache_add_get(inode);
+
+ do {
+ unsigned offset = pos & (PAGE_SIZE - 1);
+ unsigned bytes = iov_iter_count(iter);
+again:
+ /*
+ * Bring in the user page that we will copy from _first_.
+ * Otherwise there's a nasty deadlock on copying from the
+ * same page as we're writing to, without it being marked
+ * up-to-date.
+ *
+ * Not only is this an optimisation, but it is also required
+ * to check that the address is actually valid, when atomic
+ * usercopies are used, below.
+ */
+ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
+ bytes = min_t(unsigned long, iov_iter_count(iter),
+ PAGE_SIZE - offset);
+
+ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
+ ret = -EFAULT;
+ break;
+ }
+ }
+
+ if (unlikely(fatal_signal_pending(current))) {
+ ret = -EINTR;
+ break;
+ }
+
+ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
+ if (unlikely(ret < 0))
+ break;
+
+ cond_resched();
+
+ if (unlikely(ret == 0)) {
+ /*
+ * If we were unable to copy any data at all, we must
+ * fall back to a single segment length write.
+ *
+ * If we didn't fall back here, we could livelock
+ * because not all segments in the iov can be copied at
+ * once without a pagefault.
+ */
+ bytes = min_t(unsigned long, PAGE_SIZE - offset,
+ iov_iter_single_seg_count(iter));
+ goto again;
+ }
+ pos += ret;
+ written += ret;
+ ret = 0;
+
+ balance_dirty_pages_ratelimited(mapping);
+ } while (iov_iter_count(iter));
+
+ bch2_pagecache_add_put(inode);
+
+ return written ? written : ret;
+}
+
+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ ssize_t ret;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = bch2_direct_write(iocb, from);
+ goto out;
+ }
+
+ inode_lock(&inode->v);
+
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto unlock;
+
+ ret = file_remove_privs(file);
+ if (ret)
+ goto unlock;
+
+ ret = file_update_time(file);
+ if (ret)
+ goto unlock;
+
+ ret = bch2_buffered_write(iocb, from);
+ if (likely(ret > 0))
+ iocb->ki_pos += ret;
+unlock:
+ inode_unlock(&inode->v);
+
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+out:
+ return bch2_err_class(ret);
+}
+
+void bch2_fs_fs_io_buffered_exit(struct bch_fs *c)
+{
+ bioset_exit(&c->writepage_bioset);
+}
+
+int bch2_fs_fs_io_buffered_init(struct bch_fs *c)
+{
+ if (bioset_init(&c->writepage_bioset,
+ 4, offsetof(struct bch_writepage_io, op.wbio.bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_writepage_bioset_init;
+
+ return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h
new file mode 100644
index 000000000000..a6126ff790e6
--- /dev/null
+++ b/fs/bcachefs/fs-io-buffered.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_BUFFERED_H
+#define _BCACHEFS_FS_IO_BUFFERED_H
+
+#ifndef NO_BCACHEFS_FS
+
+int bch2_read_single_folio(struct folio *, struct address_space *);
+int bch2_read_folio(struct file *, struct folio *);
+
+int bch2_writepages(struct address_space *, struct writeback_control *);
+void bch2_readahead(struct readahead_control *);
+
+int bch2_write_begin(struct file *, struct address_space *, loff_t,
+ unsigned, struct page **, void **);
+int bch2_write_end(struct file *, struct address_space *, loff_t,
+ unsigned, unsigned, struct page *, void *);
+
+ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
+
+void bch2_fs_fs_io_buffered_exit(struct bch_fs *);
+int bch2_fs_fs_io_buffered_init(struct bch_fs *);
+#else
+static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {}
+static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; }
+#endif
+
+#endif /* _BCACHEFS_FS_IO_BUFFERED_H */
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
new file mode 100644
index 000000000000..84e20c3ada6c
--- /dev/null
+++ b/fs/bcachefs/fs-io-direct.c
@@ -0,0 +1,677 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "fs.h"
+#include "fs-io.h"
+#include "fs-io-direct.h"
+#include "fs-io-pagecache.h"
+#include "io_read.h"
+#include "io_write.h"
+
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/prefetch.h>
+#include <linux/task_io_accounting_ops.h>
+
+/* O_DIRECT reads */
+
+struct dio_read {
+ struct closure cl;
+ struct kiocb *req;
+ long ret;
+ bool should_dirty;
+ struct bch_read_bio rbio;
+};
+
+static void bio_check_or_release(struct bio *bio, bool check_dirty)
+{
+ if (check_dirty) {
+ bio_check_pages_dirty(bio);
+ } else {
+ bio_release_pages(bio, false);
+ bio_put(bio);
+ }
+}
+
+static CLOSURE_CALLBACK(bch2_dio_read_complete)
+{
+ closure_type(dio, struct dio_read, cl);
+
+ dio->req->ki_complete(dio->req, dio->ret);
+ bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
+}
+
+static void bch2_direct_IO_read_endio(struct bio *bio)
+{
+ struct dio_read *dio = bio->bi_private;
+
+ if (bio->bi_status)
+ dio->ret = blk_status_to_errno(bio->bi_status);
+
+ closure_put(&dio->cl);
+}
+
+static void bch2_direct_IO_read_split_endio(struct bio *bio)
+{
+ struct dio_read *dio = bio->bi_private;
+ bool should_dirty = dio->should_dirty;
+
+ bch2_direct_IO_read_endio(bio);
+ bio_check_or_release(bio, should_dirty);
+}
+
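+/*
+ * O_DIRECT read: the request must be aligned to the filesystem block size and
+ * is shortened so it doesn't extend past EOF. The iter is split across one or
+ * more bios submitted via bch2_read(), with a closure tracking outstanding
+ * bios; synchronous kiocbs wait here for the result, asynchronous ones
+ * complete from bch2_dio_read_complete() and return -EIOCBQUEUED.
+ */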
+static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
+{
+ struct file *file = req->ki_filp;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_io_opts opts;
+ struct dio_read *dio;
+ struct bio *bio;
+ loff_t offset = req->ki_pos;
+ bool sync = is_sync_kiocb(req);
+ size_t shorten;
+ ssize_t ret;
+
+ bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+ if ((offset|iter->count) & (block_bytes(c) - 1))
+ return -EINVAL;
+
+ ret = min_t(loff_t, iter->count,
+ max_t(loff_t, 0, i_size_read(&inode->v) - offset));
+
+ if (!ret)
+ return ret;
+
+ shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
+ iter->count -= shorten;
+
+ bio = bio_alloc_bioset(NULL,
+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+ REQ_OP_READ,
+ GFP_KERNEL,
+ &c->dio_read_bioset);
+
+ bio->bi_end_io = bch2_direct_IO_read_endio;
+
+ dio = container_of(bio, struct dio_read, rbio.bio);
+ closure_init(&dio->cl, NULL);
+
+ /*
+ * this is a _really_ horrible hack just to avoid an atomic sub at the
+ * end:
+ */
+ if (!sync) {
+ set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
+ atomic_set(&dio->cl.remaining,
+ CLOSURE_REMAINING_INITIALIZER -
+ CLOSURE_RUNNING +
+ CLOSURE_DESTRUCTOR);
+ } else {
+ atomic_set(&dio->cl.remaining,
+ CLOSURE_REMAINING_INITIALIZER + 1);
+ dio->cl.closure_get_happened = true;
+ }
+
+ dio->req = req;
+ dio->ret = ret;
+ /*
+ * This is one of the sketchier things I've encountered: we have to skip
+ * the dirtying of requests that are internal to the kernel (i.e. from
+ * loopback), because we'll deadlock on page_lock.
+ */
+ dio->should_dirty = iter_is_iovec(iter);
+
+ goto start;
+ while (iter->count) {
+ bio = bio_alloc_bioset(NULL,
+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+ REQ_OP_READ,
+ GFP_KERNEL,
+ &c->bio_read);
+ bio->bi_end_io = bch2_direct_IO_read_split_endio;
+start:
+ bio->bi_opf = REQ_OP_READ|REQ_SYNC;
+ bio->bi_iter.bi_sector = offset >> 9;
+ bio->bi_private = dio;
+
+ ret = bio_iov_iter_get_pages(bio, iter);
+ if (ret < 0) {
+ /* XXX: fault inject this path */
+ bio->bi_status = BLK_STS_RESOURCE;
+ bio_endio(bio);
+ break;
+ }
+
+ offset += bio->bi_iter.bi_size;
+
+ if (dio->should_dirty)
+ bio_set_pages_dirty(bio);
+
+ if (iter->count)
+ closure_get(&dio->cl);
+
+ bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
+ }
+
+ iter->count += shorten;
+
+ if (sync) {
+ closure_sync(&dio->cl);
+ closure_debug_destroy(&dio->cl);
+ ret = dio->ret;
+ bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
+ return ret;
+ } else {
+ return -EIOCBQUEUED;
+ }
+}
+
+ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct address_space *mapping = file->f_mapping;
+ size_t count = iov_iter_count(iter);
+ ssize_t ret;
+
+ if (!count)
+ return 0; /* skip atime */
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ struct blk_plug plug;
+
+ if (unlikely(mapping->nrpages)) {
+ ret = filemap_write_and_wait_range(mapping,
+ iocb->ki_pos,
+ iocb->ki_pos + count - 1);
+ if (ret < 0)
+ goto out;
+ }
+
+ file_accessed(file);
+
+ blk_start_plug(&plug);
+ ret = bch2_direct_IO_read(iocb, iter);
+ blk_finish_plug(&plug);
+
+ if (ret >= 0)
+ iocb->ki_pos += ret;
+ } else {
+ bch2_pagecache_add_get(inode);
+ ret = generic_file_read_iter(iocb, iter);
+ bch2_pagecache_add_put(inode);
+ }
+out:
+ return bch2_err_class(ret);
+}
+
+/* O_DIRECT writes */
+
+struct dio_write {
+ struct kiocb *req;
+ struct address_space *mapping;
+ struct bch_inode_info *inode;
+ struct mm_struct *mm;
+ const struct iovec *iov;
+ unsigned loop:1,
+ extending:1,
+ sync:1,
+ flush:1;
+ struct quota_res quota_res;
+ u64 written;
+
+ struct iov_iter iter;
+ struct iovec inline_vecs[2];
+
+ /* must be last: */
+ struct bch_write_op op;
+};
+
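+/*
+ * Check whether the extents in [offset, offset + size) are already fully
+ * allocated with at least @nr_replicas replicas and, if @compressed is false,
+ * contain no compressed data - i.e. whether a dio write to that range can
+ * proceed without taking a new disk reservation.
+ */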
+static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
+ u64 offset, u64 size,
+ unsigned nr_replicas, bool compressed)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 end = offset + size;
+ u32 snapshot;
+ bool ret = true;
+ int err;
+retry:
+ bch2_trans_begin(trans);
+
+ err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (err)
+ goto err;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
+ SPOS(inum.inum, offset, snapshot),
+ BTREE_ITER_SLOTS, k, err) {
+ if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
+ break;
+
+ if (k.k->p.snapshot != snapshot ||
+ nr_replicas > bch2_bkey_replicas(c, k) ||
+ (!compressed && bch2_bkey_sectors_compressed(k))) {
+ ret = false;
+ break;
+ }
+ }
+
+ offset = iter.pos.offset;
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(err, BCH_ERR_transaction_restart))
+ goto retry;
+ bch2_trans_put(trans);
+
+ return err ? false : ret;
+}
+
+static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct bch_inode_info *inode = dio->inode;
+ struct bio *bio = &dio->op.wbio.bio;
+
+ return bch2_check_range_allocated(c, inode_inum(inode),
+ dio->op.pos.offset, bio_sectors(bio),
+ dio->op.opts.data_replicas,
+ dio->op.opts.compression != 0);
+}
+
+static void bch2_dio_write_loop_async(struct bch_write_op *);
+static __always_inline long bch2_dio_write_done(struct dio_write *dio);
+
+/*
+ * We're going to return -EIOCBQUEUED, but we haven't finished consuming the
+ * iov_iter yet, so we need to stash a copy of the iovec: it might be on the
+ * caller's stack, we're not guaranteed that it will live for the duration of
+ * the IO:
+ */
+static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
+{
+ struct iovec *iov = dio->inline_vecs;
+
+ /*
+ * the iov_iter is a ubuf iter with a single user buffer embedded in the
+ * iter itself - nothing to copy:
+ */
+ if (iter_is_ubuf(&dio->iter))
+ return 0;
+
+ /*
+ * We don't currently handle non-iovec iov_iters here - return an error,
+ * and we'll fall back to doing the IO synchronously:
+ */
+ if (!iter_is_iovec(&dio->iter))
+ return -1;
+
+ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
+ dio->iov = iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
+ GFP_KERNEL);
+ if (unlikely(!iov))
+ return -ENOMEM;
+ }
+
+ memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov));
+ dio->iter.__iov = iov;
+ return 0;
+}
+
+static CLOSURE_CALLBACK(bch2_dio_write_flush_done)
+{
+ closure_type(dio, struct dio_write, op.cl);
+ struct bch_fs *c = dio->op.c;
+
+ closure_debug_destroy(cl);
+
+ dio->op.error = bch2_journal_error(&c->journal);
+
+ bch2_dio_write_done(dio);
+}
+
+static noinline void bch2_dio_write_flush(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct bch_inode_unpacked inode;
+ int ret;
+
+ dio->flush = 0;
+
+ closure_init(&dio->op.cl, NULL);
+
+ if (!dio->op.error) {
+ ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
+ if (ret) {
+ dio->op.error = ret;
+ } else {
+ bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq,
+ &dio->op.cl);
+ bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
+ }
+ }
+
+ if (dio->sync) {
+ closure_sync(&dio->op.cl);
+ closure_debug_destroy(&dio->op.cl);
+ } else {
+ continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
+ }
+}
+
+static __always_inline long bch2_dio_write_done(struct dio_write *dio)
+{
+ struct kiocb *req = dio->req;
+ struct bch_inode_info *inode = dio->inode;
+ bool sync = dio->sync;
+ long ret;
+
+ if (unlikely(dio->flush)) {
+ bch2_dio_write_flush(dio);
+ if (!sync)
+ return -EIOCBQUEUED;
+ }
+
+ bch2_pagecache_block_put(inode);
+
+ kfree(dio->iov);
+
+ ret = dio->op.error ?: ((long) dio->written << 9);
+ bio_put(&dio->op.wbio.bio);
+
+ /* inode->i_dio_count is our ref on inode and thus bch_fs */
+ inode_dio_end(&inode->v);
+
+ if (ret < 0)
+ ret = bch2_err_class(ret);
+
+ if (!sync) {
+ req->ki_complete(req, ret);
+ ret = -EIOCBQUEUED;
+ }
+ return ret;
+}
+
+static __always_inline void bch2_dio_write_end(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct kiocb *req = dio->req;
+ struct bch_inode_info *inode = dio->inode;
+ struct bio *bio = &dio->op.wbio.bio;
+
+ req->ki_pos += (u64) dio->op.written << 9;
+ dio->written += dio->op.written;
+
+ if (dio->extending) {
+ spin_lock(&inode->v.i_lock);
+ if (req->ki_pos > inode->v.i_size)
+ i_size_write(&inode->v, req->ki_pos);
+ spin_unlock(&inode->v.i_lock);
+ }
+
+ if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
+ mutex_lock(&inode->ei_quota_lock);
+ __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
+ __bch2_quota_reservation_put(c, inode, &dio->quota_res);
+ mutex_unlock(&inode->ei_quota_lock);
+ }
+
+ bio_release_pages(bio, false);
+
+ if (unlikely(dio->op.error))
+ set_bit(EI_INODE_ERROR, &inode->ei_flags);
+}
+
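+/*
+ * Main dio write loop: pin the next batch of user pages into the bio, set up
+ * the write op, take quota and disk reservations (if the disk reservation
+ * fails, the write can still proceed when the range is already fully
+ * allocated), and submit via bch2_write. Synchronous writes loop here until
+ * the iter is drained; asynchronous writes return -EIOCBQUEUED and continue
+ * from bch2_dio_write_loop_async().
+ */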
+static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
+{
+ struct bch_fs *c = dio->op.c;
+ struct kiocb *req = dio->req;
+ struct address_space *mapping = dio->mapping;
+ struct bch_inode_info *inode = dio->inode;
+ struct bch_io_opts opts;
+ struct bio *bio = &dio->op.wbio.bio;
+ unsigned unaligned, iter_count;
+ bool sync = dio->sync, dropped_locks;
+ long ret;
+
+ bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+ while (1) {
+ iter_count = dio->iter.count;
+
+ EBUG_ON(current->faults_disabled_mapping);
+ current->faults_disabled_mapping = mapping;
+
+ ret = bio_iov_iter_get_pages(bio, &dio->iter);
+
+ dropped_locks = fdm_dropped_locks();
+
+ current->faults_disabled_mapping = NULL;
+
+ /*
+ * If the fault handler returned an error but also signalled
+ * that it dropped & retook ei_pagecache_lock, we just need to
+ * re-shoot down the page cache and retry:
+ */
+ if (dropped_locks && ret)
+ ret = 0;
+
+ if (unlikely(ret < 0))
+ goto err;
+
+ if (unlikely(dropped_locks)) {
+ ret = bch2_write_invalidate_inode_pages_range(mapping,
+ req->ki_pos,
+ req->ki_pos + iter_count - 1);
+ if (unlikely(ret))
+ goto err;
+
+ if (!bio->bi_iter.bi_size)
+ continue;
+ }
+
+ unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
+ bio->bi_iter.bi_size -= unaligned;
+ iov_iter_revert(&dio->iter, unaligned);
+
+ if (!bio->bi_iter.bi_size) {
+ /*
+ * bio_iov_iter_get_pages was only able to get <
+ * blocksize worth of pages:
+ */
+ ret = -EFAULT;
+ goto err;
+ }
+
+ bch2_write_op_init(&dio->op, c, opts);
+ dio->op.end_io = sync
+ ? NULL
+ : bch2_dio_write_loop_async;
+ dio->op.target = dio->op.opts.foreground_target;
+ dio->op.write_point = writepoint_hashed((unsigned long) current);
+ dio->op.nr_replicas = dio->op.opts.data_replicas;
+ dio->op.subvol = inode->ei_subvol;
+ dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
+ dio->op.devs_need_flush = &inode->ei_devs_need_flush;
+
+ if (sync)
+ dio->op.flags |= BCH_WRITE_SYNC;
+ dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
+
+ ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
+ bio_sectors(bio), true);
+ if (unlikely(ret))
+ goto err;
+
+ ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
+ dio->op.opts.data_replicas, 0);
+ if (unlikely(ret) &&
+ !bch2_dio_write_check_allocated(dio))
+ goto err;
+
+ task_io_account_write(bio->bi_iter.bi_size);
+
+ if (unlikely(dio->iter.count) &&
+ !dio->sync &&
+ !dio->loop &&
+ bch2_dio_write_copy_iov(dio))
+ dio->sync = sync = true;
+
+ dio->loop = true;
+ closure_call(&dio->op.cl, bch2_write, NULL, NULL);
+
+ if (!sync)
+ return -EIOCBQUEUED;
+
+ bch2_dio_write_end(dio);
+
+ if (likely(!dio->iter.count) || dio->op.error)
+ break;
+
+ bio_reset(bio, NULL, REQ_OP_WRITE);
+ }
+out:
+ return bch2_dio_write_done(dio);
+err:
+ dio->op.error = ret;
+
+ bio_release_pages(bio, false);
+
+ bch2_quota_reservation_put(c, inode, &dio->quota_res);
+ goto out;
+}
+
+static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
+{
+ struct mm_struct *mm = dio->mm;
+
+ bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
+
+ if (mm)
+ kthread_use_mm(mm);
+ bch2_dio_write_loop(dio);
+ if (mm)
+ kthread_unuse_mm(mm);
+}
+
+static void bch2_dio_write_loop_async(struct bch_write_op *op)
+{
+ struct dio_write *dio = container_of(op, struct dio_write, op);
+
+ bch2_dio_write_end(dio);
+
+ if (likely(!dio->iter.count) || dio->op.error)
+ bch2_dio_write_done(dio);
+ else
+ bch2_dio_write_continue(dio);
+}
+
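+/*
+ * O_DIRECT write entry point: perform the usual write checks under the inode
+ * lock, block page cache insertion, invalidate any cached pages overlapping
+ * the write, then hand off to bch2_dio_write_loop(). The inode lock is
+ * dropped early for writes that don't extend i_size.
+ */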
+ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
+{
+ struct file *file = req->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct dio_write *dio;
+ struct bio *bio;
+ bool locked = true, extending;
+ ssize_t ret;
+
+ prefetch(&c->opts);
+ prefetch((void *) &c->opts + 64);
+ prefetch(&inode->ei_inode);
+ prefetch((void *) &inode->ei_inode + 64);
+
+ inode_lock(&inode->v);
+
+ ret = generic_write_checks(req, iter);
+ if (unlikely(ret <= 0))
+ goto err;
+
+ ret = file_remove_privs(file);
+ if (unlikely(ret))
+ goto err;
+
+ ret = file_update_time(file);
+ if (unlikely(ret))
+ goto err;
+
+ /* dio writes must be aligned to the filesystem block size: */
+ if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ inode_dio_begin(&inode->v);
+ bch2_pagecache_block_get(inode);
+
+ extending = req->ki_pos + iter->count > inode->v.i_size;
+ if (!extending) {
+ inode_unlock(&inode->v);
+ locked = false;
+ }
+
+ bio = bio_alloc_bioset(NULL,
+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+ REQ_OP_WRITE,
+ GFP_KERNEL,
+ &c->dio_write_bioset);
+ dio = container_of(bio, struct dio_write, op.wbio.bio);
+ dio->req = req;
+ dio->mapping = mapping;
+ dio->inode = inode;
+ dio->mm = current->mm;
+ dio->iov = NULL;
+ dio->loop = false;
+ dio->extending = extending;
+ dio->sync = is_sync_kiocb(req) || extending;
+ dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
+ dio->quota_res.sectors = 0;
+ dio->written = 0;
+ dio->iter = *iter;
+ dio->op.c = c;
+
+ if (unlikely(mapping->nrpages)) {
+ ret = bch2_write_invalidate_inode_pages_range(mapping,
+ req->ki_pos,
+ req->ki_pos + iter->count - 1);
+ if (unlikely(ret))
+ goto err_put_bio;
+ }
+
+ ret = bch2_dio_write_loop(dio);
+err:
+ if (locked)
+ inode_unlock(&inode->v);
+ return ret;
+err_put_bio:
+ bch2_pagecache_block_put(inode);
+ bio_put(bio);
+ inode_dio_end(&inode->v);
+ goto err;
+}
+
+void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
+{
+ bioset_exit(&c->dio_write_bioset);
+ bioset_exit(&c->dio_read_bioset);
+}
+
+int bch2_fs_fs_io_direct_init(struct bch_fs *c)
+{
+ if (bioset_init(&c->dio_read_bioset,
+ 4, offsetof(struct dio_read, rbio.bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_dio_read_bioset_init;
+
+ if (bioset_init(&c->dio_write_bioset,
+ 4, offsetof(struct dio_write, op.wbio.bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_dio_write_bioset_init;
+
+ return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-direct.h b/fs/bcachefs/fs-io-direct.h
new file mode 100644
index 000000000000..814621ec7f81
--- /dev/null
+++ b/fs/bcachefs/fs-io-direct.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_DIRECT_H
+#define _BCACHEFS_FS_IO_DIRECT_H
+
+#ifndef NO_BCACHEFS_FS
+ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *);
+ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
+
+void bch2_fs_fs_io_direct_exit(struct bch_fs *);
+int bch2_fs_fs_io_direct_init(struct bch_fs *);
+#else
+static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {}
+static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; }
+#endif
+
+#endif /* _BCACHEFS_FS_IO_DIRECT_H */
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
new file mode 100644
index 000000000000..ff664fd0d8ef
--- /dev/null
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -0,0 +1,791 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "btree_iter.h"
+#include "extents.h"
+#include "fs-io.h"
+#include "fs-io-pagecache.h"
+#include "subvolume.h"
+
+#include <linux/pagevec.h>
+#include <linux/writeback.h>
+
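+/*
+ * Fill @fs with a contiguous run of locked folios from @mapping covering
+ * [start, end), creating them as needed and stopping at the first gap.
+ * FGP_CREAT is dropped once we're more than 1MB past @start, bounding how
+ * much new page cache a single call will create. Returns 0 if at least one
+ * folio was obtained.
+ */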
+int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
+ loff_t start, u64 end,
+ fgf_t fgp_flags, gfp_t gfp,
+ folios *fs)
+{
+ struct folio *f;
+ u64 pos = start;
+ int ret = 0;
+
+ while (pos < end) {
+ if ((u64) pos >= (u64) start + (1ULL << 20))
+ fgp_flags &= ~FGP_CREAT;
+
+ ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL);
+ if (ret)
+ break;
+
+ f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
+ if (IS_ERR_OR_NULL(f))
+ break;
+
+ BUG_ON(fs->nr && folio_pos(f) != pos);
+
+ pos = folio_end_pos(f);
+ darray_push(fs, f);
+ }
+
+ if (!fs->nr && !ret && (fgp_flags & FGP_CREAT))
+ ret = -ENOMEM;
+
+ return fs->nr ? 0 : ret;
+}
+
+/* pagecache_block must be held */
+int bch2_write_invalidate_inode_pages_range(struct address_space *mapping,
+ loff_t start, loff_t end)
+{
+ int ret;
+
+ /*
+ * XXX: the way this is currently implemented, we can spin if a process
+ * is continually redirtying a specific page
+ */
+ do {
+ if (!mapping->nrpages)
+ return 0;
+
+ ret = filemap_write_and_wait_range(mapping, start, end);
+ if (ret)
+ break;
+
+ if (!mapping->nrpages)
+ return 0;
+
+ ret = invalidate_inode_pages2_range(mapping,
+ start >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
+ } while (ret == -EBUSY);
+
+ return ret;
+}
+
+#if 0
+/* Useful for debug tracing: */
+static const char * const bch2_folio_sector_states[] = {
+#define x(n) #n,
+ BCH_FOLIO_SECTOR_STATE()
+#undef x
+ NULL
+};
+#endif
+
+static inline enum bch_folio_sector_state
+folio_sector_dirty(enum bch_folio_sector_state state)
+{
+ switch (state) {
+ case SECTOR_unallocated:
+ return SECTOR_dirty;
+ case SECTOR_reserved:
+ return SECTOR_dirty_reserved;
+ default:
+ return state;
+ }
+}
+
+static inline enum bch_folio_sector_state
+folio_sector_undirty(enum bch_folio_sector_state state)
+{
+ switch (state) {
+ case SECTOR_dirty:
+ return SECTOR_unallocated;
+ case SECTOR_dirty_reserved:
+ return SECTOR_reserved;
+ default:
+ return state;
+ }
+}
+
+static inline enum bch_folio_sector_state
+folio_sector_reserve(enum bch_folio_sector_state state)
+{
+ switch (state) {
+ case SECTOR_unallocated:
+ return SECTOR_reserved;
+ case SECTOR_dirty:
+ return SECTOR_dirty_reserved;
+ default:
+ return state;
+ }
+}
+
+/* for newly allocated folios: */
+struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
+{
+ struct bch_folio *s;
+
+ s = kzalloc(sizeof(*s) +
+ sizeof(struct bch_folio_sector) *
+ folio_sectors(folio), gfp);
+ if (!s)
+ return NULL;
+
+ spin_lock_init(&s->lock);
+ folio_attach_private(folio, s);
+ return s;
+}
+
+struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
+{
+ return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
+}
+
+static unsigned bkey_to_sector_state(struct bkey_s_c k)
+{
+ if (bkey_extent_is_reservation(k))
+ return SECTOR_reserved;
+ if (bkey_extent_is_allocation(k.k))
+ return SECTOR_allocated;
+ return SECTOR_unallocated;
+}
+
+static void __bch2_folio_set(struct folio *folio,
+ unsigned pg_offset, unsigned pg_len,
+ unsigned nr_ptrs, unsigned state)
+{
+ struct bch_folio *s = bch2_folio(folio);
+ unsigned i, sectors = folio_sectors(folio);
+
+ BUG_ON(pg_offset >= sectors);
+ BUG_ON(pg_offset + pg_len > sectors);
+
+ spin_lock(&s->lock);
+
+ for (i = pg_offset; i < pg_offset + pg_len; i++) {
+ s->s[i].nr_replicas = nr_ptrs;
+ bch2_folio_sector_set(folio, s, i, state);
+ }
+
+ if (i == sectors)
+ s->uptodate = true;
+
+ spin_unlock(&s->lock);
+}
+
+/*
+ * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
+ * extents btree:
+ */
+int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
+ struct folio **fs, unsigned nr_folios)
+{
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_folio *s;
+ u64 offset = folio_sector(fs[0]);
+ unsigned folio_idx;
+ u32 snapshot;
+ bool need_set = false;
+ int ret;
+
+ for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
+ s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ need_set |= !s->uptodate;
+ }
+
+ if (!need_set)
+ return 0;
+
+ folio_idx = 0;
+ trans = bch2_trans_get(c);
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
+ SPOS(inum.inum, offset, snapshot),
+ BTREE_ITER_SLOTS, k, ret) {
+ unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
+ unsigned state = bkey_to_sector_state(k);
+
+ while (folio_idx < nr_folios) {
+ struct folio *folio = fs[folio_idx];
+ u64 folio_start = folio_sector(folio);
+ u64 folio_end = folio_end_sector(folio);
+ unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
+ folio_start;
+ unsigned folio_len = min(k.k->p.offset, folio_end) -
+ folio_offset - folio_start;
+
+ BUG_ON(k.k->p.offset < folio_start);
+ BUG_ON(bkey_start_offset(k.k) > folio_end);
+
+ if (!bch2_folio(folio)->uptodate)
+ __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
+
+ if (k.k->p.offset < folio_end)
+ break;
+ folio_idx++;
+ }
+
+ if (folio_idx == nr_folios)
+ break;
+ }
+
+ offset = iter.pos.offset;
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+ bch2_trans_put(trans);
+
+ return ret;
+}
+
+void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
+{
+ struct bvec_iter iter;
+ struct folio_vec fv;
+ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
+ ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
+ unsigned state = bkey_to_sector_state(k);
+
+ bio_for_each_folio(fv, bio, iter)
+ __bch2_folio_set(fv.fv_folio,
+ fv.fv_offset >> 9,
+ fv.fv_len >> 9,
+ nr_ptrs, state);
+}
+
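+/*
+ * Walk the page cache for sectors [start, end) and zero nr_replicas in the
+ * per-folio sector state, so that subsequent writes to those sectors will
+ * take fresh disk reservations.
+ */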
+void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
+ u64 start, u64 end)
+{
+ pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+ struct folio_batch fbatch;
+ unsigned i, j;
+
+ if (end <= start)
+ return;
+
+ folio_batch_init(&fbatch);
+
+ while (filemap_get_folios(inode->v.i_mapping,
+ &index, end_index, &fbatch)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+ u64 folio_start = folio_sector(folio);
+ u64 folio_end = folio_end_sector(folio);
+ unsigned folio_offset = max(start, folio_start) - folio_start;
+ unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
+ struct bch_folio *s;
+
+ BUG_ON(end <= folio_start);
+
+ folio_lock(folio);
+ s = bch2_folio(folio);
+
+ if (s) {
+ spin_lock(&s->lock);
+ for (j = folio_offset; j < folio_offset + folio_len; j++)
+ s->s[j].nr_replicas = 0;
+ spin_unlock(&s->lock);
+ }
+
+ folio_unlock(folio);
+ }
+ folio_batch_release(&fbatch);
+ cond_resched();
+ }
+}
+
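+/*
+ * Walk the page cache for sectors [start, end) and move each cached sector's
+ * state to its reserved variant, adjusting i_sectors accounting for sectors
+ * that were previously dirty.
+ */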
+void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
+ u64 start, u64 end)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+ struct folio_batch fbatch;
+ s64 i_sectors_delta = 0;
+ unsigned i, j;
+
+ if (end <= start)
+ return;
+
+ folio_batch_init(&fbatch);
+
+ while (filemap_get_folios(inode->v.i_mapping,
+ &index, end_index, &fbatch)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+ u64 folio_start = folio_sector(folio);
+ u64 folio_end = folio_end_sector(folio);
+ unsigned folio_offset = max(start, folio_start) - folio_start;
+ unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
+ struct bch_folio *s;
+
+ BUG_ON(end <= folio_start);
+
+ folio_lock(folio);
+ s = bch2_folio(folio);
+
+ if (s) {
+ spin_lock(&s->lock);
+ for (j = folio_offset; j < folio_offset + folio_len; j++) {
+ i_sectors_delta -= s->s[j].state == SECTOR_dirty;
+ bch2_folio_sector_set(folio, s, j,
+ folio_sector_reserve(s->s[j].state));
+ }
+ spin_unlock(&s->lock);
+ }
+
+ folio_unlock(folio);
+ }
+ folio_batch_release(&fbatch);
+ cond_resched();
+ }
+
+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+}
+
+static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
+ unsigned nr_replicas)
+{
+ return max(0, (int) nr_replicas -
+ s->nr_replicas -
+ s->replicas_reserved);
+}
+
+int bch2_get_folio_disk_reservation(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct folio *folio, bool check_enospc)
+{
+ struct bch_folio *s = bch2_folio_create(folio, 0);
+ unsigned nr_replicas = inode_nr_replicas(c, inode);
+ struct disk_reservation disk_res = { 0 };
+ unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
+ int ret;
+
+ if (!s)
+ return -ENOMEM;
+
+ for (i = 0; i < sectors; i++)
+ disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
+
+ if (!disk_res_sectors)
+ return 0;
+
+ ret = bch2_disk_reservation_get(c, &disk_res,
+ disk_res_sectors, 1,
+ !check_enospc
+ ? BCH_DISK_RESERVATION_NOFAIL
+ : 0);
+ if (unlikely(ret))
+ return ret;
+
+ for (i = 0; i < sectors; i++)
+ s->s[i].replicas_reserved +=
+ sectors_to_reserve(&s->s[i], nr_replicas);
+
+ return 0;
+}
+
+void bch2_folio_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct bch2_folio_reservation *res)
+{
+ bch2_disk_reservation_put(c, &res->disk);
+ bch2_quota_reservation_put(c, inode, &res->quota);
+}
+
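+/*
+ * Take the disk and quota reservations needed to dirty [offset, offset + len)
+ * within @folio, based on the current per-sector state. If the quota
+ * reservation fails, the disk reservation taken here is released again before
+ * returning the error.
+ */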
+int bch2_folio_reservation_get(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct folio *folio,
+ struct bch2_folio_reservation *res,
+ unsigned offset, unsigned len)
+{
+ struct bch_folio *s = bch2_folio_create(folio, 0);
+ unsigned i, disk_sectors = 0, quota_sectors = 0;
+ int ret;
+
+ if (!s)
+ return -ENOMEM;
+
+ BUG_ON(!s->uptodate);
+
+ for (i = round_down(offset, block_bytes(c)) >> 9;
+ i < round_up(offset + len, block_bytes(c)) >> 9;
+ i++) {
+ disk_sectors += sectors_to_reserve(&s->s[i],
+ res->disk.nr_replicas);
+ quota_sectors += s->s[i].state == SECTOR_unallocated;
+ }
+
+ if (disk_sectors) {
+ ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ if (quota_sectors) {
+ ret = bch2_quota_reservation_add(c, inode, &res->quota,
+ quota_sectors, true);
+ if (unlikely(ret)) {
+ struct disk_reservation tmp = {
+ .sectors = disk_sectors
+ };
+
+ bch2_disk_reservation_put(c, &tmp);
+ res->disk.sectors -= disk_sectors;
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void bch2_clear_folio_bits(struct folio *folio)
+{
+ struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_folio *s = bch2_folio(folio);
+ struct disk_reservation disk_res = { 0 };
+ int i, sectors = folio_sectors(folio), dirty_sectors = 0;
+
+ if (!s)
+ return;
+
+ EBUG_ON(!folio_test_locked(folio));
+ EBUG_ON(folio_test_writeback(folio));
+
+ for (i = 0; i < sectors; i++) {
+ disk_res.sectors += s->s[i].replicas_reserved;
+ s->s[i].replicas_reserved = 0;
+
+ dirty_sectors -= s->s[i].state == SECTOR_dirty;
+ bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
+ }
+
+ bch2_disk_reservation_put(c, &disk_res);
+
+ bch2_i_sectors_acct(c, inode, NULL, dirty_sectors);
+
+ bch2_folio_release(folio);
+}
+
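+/*
+ * Mark [offset, offset + len) within @folio dirty: transfer reserved sectors
+ * from @res into each sector's replicas_reserved, move sector states to their
+ * dirty variants, account newly dirtied sectors against quota/i_sectors, and
+ * finally dirty the folio in the page cache if it isn't already.
+ */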
+void bch2_set_folio_dirty(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct folio *folio,
+ struct bch2_folio_reservation *res,
+ unsigned offset, unsigned len)
+{
+ struct bch_folio *s = bch2_folio(folio);
+ unsigned i, dirty_sectors = 0;
+
+ WARN_ON((u64) folio_pos(folio) + offset + len >
+ round_up((u64) i_size_read(&inode->v), block_bytes(c)));
+
+ BUG_ON(!s->uptodate);
+
+ spin_lock(&s->lock);
+
+ for (i = round_down(offset, block_bytes(c)) >> 9;
+ i < round_up(offset + len, block_bytes(c)) >> 9;
+ i++) {
+ unsigned sectors = sectors_to_reserve(&s->s[i],
+ res->disk.nr_replicas);
+
+ /*
+ * This can happen if we race with the error path in
+ * bch2_writepage_io_done():
+ */
+ sectors = min_t(unsigned, sectors, res->disk.sectors);
+
+ s->s[i].replicas_reserved += sectors;
+ res->disk.sectors -= sectors;
+
+ dirty_sectors += s->s[i].state == SECTOR_unallocated;
+
+ bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
+ }
+
+ spin_unlock(&s->lock);
+
+ bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors);
+
+ if (!folio_test_dirty(folio))
+ filemap_dirty_folio(inode->v.i_mapping, folio);
+}
+
+vm_fault_t bch2_page_fault(struct vm_fault *vmf)
+{
+ struct file *file = vmf->vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ struct address_space *fdm = faults_disabled_mapping();
+ struct bch_inode_info *inode = file_bch_inode(file);
+ vm_fault_t ret;
+
+ if (fdm == mapping)
+ return VM_FAULT_SIGBUS;
+
+ /* Lock ordering: */
+ if (fdm > mapping) {
+ struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
+
+ if (bch2_pagecache_add_tryget(inode))
+ goto got_lock;
+
+ bch2_pagecache_block_put(fdm_host);
+
+ bch2_pagecache_add_get(inode);
+ bch2_pagecache_add_put(inode);
+
+ bch2_pagecache_block_get(fdm_host);
+
+ /* Signal that lock has been dropped: */
+ set_fdm_dropped_locks();
+ return VM_FAULT_SIGBUS;
+ }
+
+ bch2_pagecache_add_get(inode);
+got_lock:
+ ret = filemap_fault(vmf);
+ bch2_pagecache_add_put(inode);
+
+ return ret;
+}
+
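+/*
+ * .page_mkwrite: called when a writable mapping of @folio is about to be
+ * written to. Take the pagecache add lock, revalidate the folio against the
+ * mapping and i_size, set up bch_folio state and disk/quota reservations for
+ * the part of the folio within i_size, and mark it dirty; if the reservation
+ * can't be taken, the fault gets SIGBUS.
+ */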
+vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
+{
+ struct folio *folio = page_folio(vmf->page);
+ struct file *file = vmf->vma->vm_file;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct address_space *mapping = file->f_mapping;
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch2_folio_reservation res;
+ unsigned len;
+ loff_t isize;
+ vm_fault_t ret;
+
+ bch2_folio_reservation_init(c, inode, &res);
+
+ sb_start_pagefault(inode->v.i_sb);
+ file_update_time(file);
+
+ /*
+ * Not strictly necessary, but helps avoid dio writes livelocking in
+ * bch2_write_invalidate_inode_pages_range() - this can be dropped if/when
+ * we get a version of that function that can invalidate a page without
+ * dropping the page lock first.
+ */
+ bch2_pagecache_add_get(inode);
+
+ folio_lock(folio);
+ isize = i_size_read(&inode->v);
+
+ if (folio->mapping != mapping || folio_pos(folio) >= isize) {
+ folio_unlock(folio);
+ ret = VM_FAULT_NOPAGE;
+ goto out;
+ }
+
+ len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));
+
+ if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
+ bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
+ folio_unlock(folio);
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+
+ bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
+ bch2_folio_reservation_put(c, inode, &res);
+
+ folio_wait_stable(folio);
+ ret = VM_FAULT_LOCKED;
+out:
+ bch2_pagecache_add_put(inode);
+ sb_end_pagefault(inode->v.i_sb);
+
+ return ret;
+}
+
+void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
+{
+ if (offset || length < folio_size(folio))
+ return;
+
+ bch2_clear_folio_bits(folio);
+}
+
+bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
+{
+ if (folio_test_dirty(folio) || folio_test_writeback(folio))
+ return false;
+
+ bch2_clear_folio_bits(folio);
+ return true;
+}
+
+/* fseek: */
+
+static int folio_data_offset(struct folio *folio, loff_t pos,
+ unsigned min_replicas)
+{
+ struct bch_folio *s = bch2_folio(folio);
+ unsigned i, sectors = folio_sectors(folio);
+
+ if (s)
+ for (i = folio_pos_to_s(folio, pos); i < sectors; i++)
+ if (s->s[i].state >= SECTOR_dirty &&
+ s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas)
+ return i << SECTOR_SHIFT;
+
+ return -1;
+}
+
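+/*
+ * Scan the page cache in [start_offset, end_offset) for the first offset
+ * backed by dirty or allocated data with at least @min_replicas replicas;
+ * returns end_offset if nothing is found, or -EAGAIN in nonblocking mode if
+ * a folio couldn't be locked without blocking.
+ */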
+loff_t bch2_seek_pagecache_data(struct inode *vinode,
+ loff_t start_offset,
+ loff_t end_offset,
+ unsigned min_replicas,
+ bool nonblock)
+{
+ struct folio_batch fbatch;
+ pgoff_t start_index = start_offset >> PAGE_SHIFT;
+ pgoff_t end_index = end_offset >> PAGE_SHIFT;
+ pgoff_t index = start_index;
+ unsigned i;
+ loff_t ret;
+ int offset;
+
+ folio_batch_init(&fbatch);
+
+ while (filemap_get_folios(vinode->i_mapping,
+ &index, end_index, &fbatch)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+
+ if (!nonblock) {
+ folio_lock(folio);
+ } else if (!folio_trylock(folio)) {
+ folio_batch_release(&fbatch);
+ return -EAGAIN;
+ }
+
+ offset = folio_data_offset(folio,
+ max(folio_pos(folio), start_offset),
+ min_replicas);
+ if (offset >= 0) {
+ ret = clamp(folio_pos(folio) + offset,
+ start_offset, end_offset);
+ folio_unlock(folio);
+ folio_batch_release(&fbatch);
+ return ret;
+ }
+ folio_unlock(folio);
+ }
+ folio_batch_release(&fbatch);
+ cond_resched();
+ }
+
+ return end_offset;
+}
+
+/*
+ * Search for a hole in a folio.
+ *
+ * The filemap layer returns -ENOENT if no folio exists, so reuse the same error
+ * code to indicate a pagecache hole exists at the returned offset. Otherwise
+ * return 0 if the folio is filled with data, or an error code. This function
+ * can return -EAGAIN if nonblock is specified.
+ */
+static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
+ unsigned min_replicas, bool nonblock)
+{
+ struct folio *folio;
+ struct bch_folio *s;
+ unsigned i, sectors;
+ int ret = -ENOENT;
+
+ folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
+ FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+
+ s = bch2_folio(folio);
+ if (!s)
+ goto unlock;
+
+ sectors = folio_sectors(folio);
+ for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
+ if (s->s[i].state < SECTOR_dirty ||
+ s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) {
+ *offset = max(*offset,
+ folio_pos(folio) + (i << SECTOR_SHIFT));
+ goto unlock;
+ }
+
+ *offset = folio_end_pos(folio);
+ ret = 0;
+unlock:
+ folio_unlock(folio);
+ folio_put(folio);
+ return ret;
+}
+
+loff_t bch2_seek_pagecache_hole(struct inode *vinode,
+ loff_t start_offset,
+ loff_t end_offset,
+ unsigned min_replicas,
+ bool nonblock)
+{
+ struct address_space *mapping = vinode->i_mapping;
+ loff_t offset = start_offset;
+ loff_t ret = 0;
+
+ while (!ret && offset < end_offset)
+ ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock);
+
+ if (ret && ret != -ENOENT)
+ return ret;
+ return min(offset, end_offset);
+}
+
+int bch2_clamp_data_hole(struct inode *inode,
+ u64 *hole_start,
+ u64 *hole_end,
+ unsigned min_replicas,
+ bool nonblock)
+{
+ loff_t ret;
+
+ ret = bch2_seek_pagecache_hole(inode,
+ *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
+ if (ret < 0)
+ return ret;
+
+ *hole_start = ret;
+
+ if (*hole_start == *hole_end)
+ return 0;
+
+ ret = bch2_seek_pagecache_data(inode,
+ *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
+ if (ret < 0)
+ return ret;
+
+ *hole_end = ret;
+ return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
new file mode 100644
index 000000000000..27f712ae37a6
--- /dev/null
+++ b/fs/bcachefs/fs-io-pagecache.h
@@ -0,0 +1,176 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_PAGECACHE_H
+#define _BCACHEFS_FS_IO_PAGECACHE_H
+
+#include <linux/pagemap.h>
+
+typedef DARRAY(struct folio *) folios;
+
+int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t,
+ u64, fgf_t, gfp_t, folios *);
+int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t);
+
+/*
+ * Use u64 for the end pos and sector helpers because if the folio covers the
+ * max supported range of the mapping, the start offset of the next folio
+ * overflows loff_t. This breaks much of the range based processing in the
+ * buffered write path.
+ */
+static inline u64 folio_end_pos(struct folio *folio)
+{
+ return folio_pos(folio) + folio_size(folio);
+}
+
+static inline size_t folio_sectors(struct folio *folio)
+{
+ return PAGE_SECTORS << folio_order(folio);
+}
+
+static inline loff_t folio_sector(struct folio *folio)
+{
+ return folio_pos(folio) >> 9;
+}
+
+static inline u64 folio_end_sector(struct folio *folio)
+{
+ return folio_end_pos(folio) >> 9;
+}
+
+#define BCH_FOLIO_SECTOR_STATE() \
+ x(unallocated) \
+ x(reserved) \
+ x(dirty) \
+ x(dirty_reserved) \
+ x(allocated)
+
+enum bch_folio_sector_state {
+#define x(n) SECTOR_##n,
+ BCH_FOLIO_SECTOR_STATE()
+#undef x
+};
+
+struct bch_folio_sector {
+ /* Uncompressed, fully allocated replicas (or on disk reservation): */
+ unsigned nr_replicas:4;
+
+ /* Owns a PAGE_SECTORS * replicas_reserved sized in-memory reservation: */
+ unsigned replicas_reserved:4;
+
+ /* i_sectors: */
+ enum bch_folio_sector_state state:8;
+};
+
+struct bch_folio {
+ spinlock_t lock;
+ atomic_t write_count;
+ /*
+ * Is the sector state up to date with the btree?
+ * (Not the data itself)
+ */
+ bool uptodate;
+ struct bch_folio_sector s[];
+};
+
+/* Helper for when we need to add debug instrumentation: */
+static inline void bch2_folio_sector_set(struct folio *folio,
+ struct bch_folio *s,
+ unsigned i, unsigned n)
+{
+ s->s[i].state = n;
+}
+
+/* file offset (to folio offset) to bch_folio_sector index */
+static inline int folio_pos_to_s(struct folio *folio, loff_t pos)
+{
+ u64 f_offset = pos - folio_pos(folio);
+
+ BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio));
+ return f_offset >> SECTOR_SHIFT;
+}
+
+/* variant of bch2_folio_release() without the folio-locked assertion: */
+static inline void __bch2_folio_release(struct folio *folio)
+{
+ kfree(folio_detach_private(folio));
+}
+
+static inline void bch2_folio_release(struct folio *folio)
+{
+ EBUG_ON(!folio_test_locked(folio));
+ __bch2_folio_release(folio);
+}
+
+static inline struct bch_folio *__bch2_folio(struct folio *folio)
+{
+ return folio_has_private(folio)
+ ? (struct bch_folio *) folio_get_private(folio)
+ : NULL;
+}
+
+static inline struct bch_folio *bch2_folio(struct folio *folio)
+{
+ EBUG_ON(!folio_test_locked(folio));
+
+ return __bch2_folio(folio);
+}
+
+struct bch_folio *__bch2_folio_create(struct folio *, gfp_t);
+struct bch_folio *bch2_folio_create(struct folio *, gfp_t);
+
+struct bch2_folio_reservation {
+ struct disk_reservation disk;
+ struct quota_res quota;
+};
+
+static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
+{
+ /* XXX: this should not be open coded */
+ return inode->ei_inode.bi_data_replicas
+ ? inode->ei_inode.bi_data_replicas - 1
+ : c->opts.data_replicas;
+}
+
+static inline void bch2_folio_reservation_init(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct bch2_folio_reservation *res)
+{
+ memset(res, 0, sizeof(*res));
+
+ res->disk.nr_replicas = inode_nr_replicas(c, inode);
+}
+
+int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned);
+void bch2_bio_page_state_set(struct bio *, struct bkey_s_c);
+
+void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64);
+void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64);
+
+int bch2_get_folio_disk_reservation(struct bch_fs *,
+ struct bch_inode_info *,
+ struct folio *, bool);
+
+void bch2_folio_reservation_put(struct bch_fs *,
+ struct bch_inode_info *,
+ struct bch2_folio_reservation *);
+int bch2_folio_reservation_get(struct bch_fs *,
+ struct bch_inode_info *,
+ struct folio *,
+ struct bch2_folio_reservation *,
+ unsigned, unsigned);
+
+void bch2_set_folio_dirty(struct bch_fs *,
+ struct bch_inode_info *,
+ struct folio *,
+ struct bch2_folio_reservation *,
+ unsigned, unsigned);
+
+vm_fault_t bch2_page_fault(struct vm_fault *);
+vm_fault_t bch2_page_mkwrite(struct vm_fault *);
+void bch2_invalidate_folio(struct folio *, size_t, size_t);
+bool bch2_release_folio(struct folio *, gfp_t);
+
+loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool);
+loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool);
+int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool);
+
+#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
new file mode 100644
index 000000000000..b0e8144ec550
--- /dev/null
+++ b/fs/bcachefs/fs-io.c
@@ -0,0 +1,1072 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "clock.h"
+#include "error.h"
+#include "extents.h"
+#include "extent_update.h"
+#include "fs.h"
+#include "fs-io.h"
+#include "fs-io-buffered.h"
+#include "fs-io-pagecache.h"
+#include "fsck.h"
+#include "inode.h"
+#include "journal.h"
+#include "io_misc.h"
+#include "keylist.h"
+#include "quota.h"
+#include "reflink.h"
+#include "trace.h"
+
+#include <linux/aio.h>
+#include <linux/backing-dev.h>
+#include <linux/falloc.h>
+#include <linux/migrate.h>
+#include <linux/mmu_context.h>
+#include <linux/pagevec.h>
+#include <linux/rmap.h>
+#include <linux/sched/signal.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/uio.h>
+
+#include <trace/events/writeback.h>
+
+struct nocow_flush {
+ struct closure *cl;
+ struct bch_dev *ca;
+ struct bio bio;
+};
+
+static void nocow_flush_endio(struct bio *_bio)
+{
+
+ struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
+
+ closure_put(bio->cl);
+ percpu_ref_put(&bio->ca->io_ref);
+ bio_put(&bio->bio);
+}
+
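+/*
+ * Issue a REQ_OP_FLUSH bio to every device recorded in ei_devs_need_flush
+ * (clearing the mask), with completion chained onto @cl - used on the fsync
+ * and dsync dio paths so that preceding nocow writes hit stable media.
+ */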
+void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct closure *cl)
+{
+ struct nocow_flush *bio;
+ struct bch_dev *ca;
+ struct bch_devs_mask devs;
+ unsigned dev;
+
+ dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX);
+ if (dev == BCH_SB_MEMBERS_MAX)
+ return;
+
+ devs = inode->ei_devs_need_flush;
+ memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
+
+ for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
+ rcu_read_lock();
+ ca = rcu_dereference(c->devs[dev]);
+ if (ca && !percpu_ref_tryget(&ca->io_ref))
+ ca = NULL;
+ rcu_read_unlock();
+
+ if (!ca)
+ continue;
+
+ bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
+ REQ_OP_FLUSH,
+ GFP_KERNEL,
+ &c->nocow_flush_bioset),
+ struct nocow_flush, bio);
+ bio->cl = cl;
+ bio->ca = ca;
+ bio->bio.bi_end_io = nocow_flush_endio;
+ closure_bio_submit(&bio->bio, cl);
+ }
+}
+
+static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
+ struct bch_inode_info *inode)
+{
+ struct closure cl;
+
+ closure_init_stack(&cl);
+ bch2_inode_flush_nocow_writes_async(c, inode, &cl);
+ closure_sync(&cl);
+
+ return 0;
+}
+
+/* i_size updates: */
+
+struct inode_new_size {
+ loff_t new_size;
+ u64 now;
+ unsigned fields;
+};
+
+static int inode_set_size(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct inode_new_size *s = p;
+
+ bi->bi_size = s->new_size;
+ if (s->fields & ATTR_ATIME)
+ bi->bi_atime = s->now;
+ if (s->fields & ATTR_MTIME)
+ bi->bi_mtime = s->now;
+ if (s->fields & ATTR_CTIME)
+ bi->bi_ctime = s->now;
+
+ return 0;
+}
+
+int __must_check bch2_write_inode_size(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ loff_t new_size, unsigned fields)
+{
+ struct inode_new_size s = {
+ .new_size = new_size,
+ .now = bch2_current_time(c),
+ .fields = fields,
+ };
+
+ return bch2_write_inode(c, inode, inode_set_size, &s, fields);
+}
+
+void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
+ struct quota_res *quota_res, s64 sectors)
+{
+ bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
+ "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
+ inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
+ inode->ei_inode.bi_sectors);
+ inode->v.i_blocks += sectors;
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+ if (quota_res &&
+ !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
+ sectors > 0) {
+ BUG_ON(sectors > quota_res->sectors);
+ BUG_ON(sectors > inode->ei_quota_reserved);
+
+ quota_res->sectors -= sectors;
+ inode->ei_quota_reserved -= sectors;
+ } else {
+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
+ }
+#endif
+}
+
+/* fsync: */
+
+/*
+ * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
+ * insert trigger: look up the btree inode instead
+ */
+static int bch2_flush_inode(struct bch_fs *c,
+ struct bch_inode_info *inode)
+{
+ struct bch_inode_unpacked u;
+ int ret;
+
+ if (c->opts.journal_flush_disabled)
+ return 0;
+
+ ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u);
+ if (ret)
+ return ret;
+
+ return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
+ bch2_inode_flush_nocow_writes(c, inode);
+}
+
+int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ int ret, ret2, ret3;
+
+ ret = file_write_and_wait_range(file, start, end);
+ ret2 = sync_inode_metadata(&inode->v, 1);
+ ret3 = bch2_flush_inode(c, inode);
+
+ return bch2_err_class(ret ?: ret2 ?: ret3);
+}
+
+/* truncate: */
+
+static inline int range_has_data(struct bch_fs *c, u32 subvol,
+ struct bpos start,
+ struct bpos end)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot);
+ if (ret)
+ goto err;
+
+ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
+ if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) {
+ ret = 1;
+ break;
+ }
+ start = iter.pos;
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_put(trans);
+ return ret;
+}
+
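+/*
+ * Zero the part of the folio at @index that falls within [start, end),
+ * reading the folio in first if needed and dropping reservations/i_sectors
+ * for the blocks that are now entirely zeroed. Returns > 0 if the folio will
+ * still be written out by writeback (and will thus handle the i_size update),
+ * 0 if not, or a negative error.
+ */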
+static int __bch2_truncate_folio(struct bch_inode_info *inode,
+ pgoff_t index, loff_t start, loff_t end)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct address_space *mapping = inode->v.i_mapping;
+ struct bch_folio *s;
+ unsigned start_offset;
+ unsigned end_offset;
+ unsigned i;
+ struct folio *folio;
+ s64 i_sectors_delta = 0;
+ int ret = 0;
+ u64 end_pos;
+
+ folio = filemap_lock_folio(mapping, index);
+ if (IS_ERR_OR_NULL(folio)) {
+ /*
+ * XXX: we're doing two index lookups when we end up reading the
+ * folio
+ */
+ ret = range_has_data(c, inode->ei_subvol,
+ POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
+ POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
+ if (ret <= 0)
+ return ret;
+
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK|FGP_CREAT, GFP_KERNEL);
+ if (IS_ERR_OR_NULL(folio)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ BUG_ON(start >= folio_end_pos(folio));
+ BUG_ON(end <= folio_pos(folio));
+
+ start_offset = max(start, folio_pos(folio)) - folio_pos(folio);
+ end_offset = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio);
+
+ /* Folio boundary? Nothing to do */
+ if (start_offset == 0 &&
+ end_offset == folio_size(folio)) {
+ ret = 0;
+ goto unlock;
+ }
+
+ s = bch2_folio_create(folio, 0);
+ if (!s) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ if (!folio_test_uptodate(folio)) {
+ ret = bch2_read_single_folio(folio, mapping);
+ if (ret)
+ goto unlock;
+ }
+
+ ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
+ if (ret)
+ goto unlock;
+
+ for (i = round_up(start_offset, block_bytes(c)) >> 9;
+ i < round_down(end_offset, block_bytes(c)) >> 9;
+ i++) {
+ s->s[i].nr_replicas = 0;
+
+ i_sectors_delta -= s->s[i].state == SECTOR_dirty;
+ bch2_folio_sector_set(folio, s, i, SECTOR_unallocated);
+ }
+
+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+
+ /*
+ * Caller needs to know whether this folio will be written out by
+ * writeback - doing an i_size update if necessary - or whether it will
+ * be responsible for the i_size update.
+ *
+ * Note that we shouldn't ever see a folio beyond EOF, but check and
+ * warn if so. This has been observed when folios weren't cleaned up
+ * after a short write, and there's still a chance reclaim will fix
+ * things up.
+ */
+ WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size);
+ end_pos = folio_end_pos(folio);
+ if (inode->v.i_size > folio_pos(folio))
+ end_pos = min_t(u64, inode->v.i_size, end_pos);
+ ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty;
+
+ folio_zero_segment(folio, start_offset, end_offset);
+
+ /*
+ * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
+ *
+ * XXX: because we aren't currently tracking whether the folio has actual
+ * data in it (vs. just 0s, or only partially written) this is wrong. ick.
+ */
+ BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false));
+
+ /*
+ * This removes any writeable userspace mappings; we need to force
+ * .page_mkwrite to be called again before any mmapped writes, to
+ * redirty the full page:
+ */
+ folio_mkclean(folio);
+ filemap_dirty_folio(mapping, folio);
+unlock:
+ folio_unlock(folio);
+ folio_put(folio);
+out:
+ return ret;
+}
+
+static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from)
+{
+ return __bch2_truncate_folio(inode, from >> PAGE_SHIFT,
+ from, ANYSINT_MAX(loff_t));
+}
+
+static int bch2_truncate_folios(struct bch_inode_info *inode,
+ loff_t start, loff_t end)
+{
+ int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT,
+ start, end);
+
+ if (ret >= 0 &&
+ start >> PAGE_SHIFT != end >> PAGE_SHIFT)
+ ret = __bch2_truncate_folio(inode,
+ (end - 1) >> PAGE_SHIFT,
+ start, end);
+ return ret;
+}
+
+static int bch2_extend(struct mnt_idmap *idmap,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *inode_u,
+ struct iattr *iattr)
+{
+ struct address_space *mapping = inode->v.i_mapping;
+ int ret;
+
+ /*
+ * sync appends:
+ *
+ * this has to be done _before_ extending i_size:
+ */
+ ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX);
+ if (ret)
+ return ret;
+
+ truncate_setsize(&inode->v, iattr->ia_size);
+
+ return bch2_setattr_nonsize(idmap, inode, iattr);
+}
+
+int bchfs_truncate(struct mnt_idmap *idmap,
+ struct bch_inode_info *inode, struct iattr *iattr)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct address_space *mapping = inode->v.i_mapping;
+ struct bch_inode_unpacked inode_u;
+ s64 i_sectors_delta = 0;
+ int ret = 0;
+
+ /*
+ * If the truncate call will change the size of the file, the
+ * cmtimes should be updated. If the size will not change, we
+ * do not need to update the cmtimes.
+ */
+ if (iattr->ia_size != inode->v.i_size) {
+ if (!(iattr->ia_valid & ATTR_MTIME))
+ ktime_get_coarse_real_ts64(&iattr->ia_mtime);
+ if (!(iattr->ia_valid & ATTR_CTIME))
+ ktime_get_coarse_real_ts64(&iattr->ia_ctime);
+ iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME;
+ }
+
+ inode_dio_wait(&inode->v);
+ bch2_pagecache_block_get(inode);
+
+ ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
+ if (ret)
+ goto err;
+
+ /*
+ * check this before next assertion; on filesystem error our normal
+ * invariants are a bit broken (truncate has to truncate the page cache
+ * before the inode).
+ */
+ ret = bch2_journal_error(&c->journal);
+ if (ret)
+ goto err;
+
+ WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
+ inode->v.i_size < inode_u.bi_size,
+ "truncate spotted in mem i_size < btree i_size: %llu < %llu\n",
+ (u64) inode->v.i_size, inode_u.bi_size);
+
+ if (iattr->ia_size > inode->v.i_size) {
+ ret = bch2_extend(idmap, inode, &inode_u, iattr);
+ goto err;
+ }
+
+ iattr->ia_valid &= ~ATTR_SIZE;
+
+ ret = bch2_truncate_folio(inode, iattr->ia_size);
+ if (unlikely(ret < 0))
+ goto err;
+
+ truncate_setsize(&inode->v, iattr->ia_size);
+
+ /*
+ * When extending, we're going to write the new i_size to disk
+ * immediately so we need to flush anything above the current on disk
+ * i_size first:
+ *
+ * Also, when extending we need to flush the page that i_size currently
+ * straddles - if it's mapped to userspace, we need to ensure that
+ * userspace redirties it and calls .mkwrite -> set_page_dirty again
+ * to allocate the part of the page that was extended.
+ */
+ if (iattr->ia_size > inode_u.bi_size)
+ ret = filemap_write_and_wait_range(mapping,
+ inode_u.bi_size,
+ iattr->ia_size - 1);
+ else if (iattr->ia_size & (PAGE_SIZE - 1))
+ ret = filemap_write_and_wait_range(mapping,
+ round_down(iattr->ia_size, PAGE_SIZE),
+ iattr->ia_size - 1);
+ if (ret)
+ goto err;
+
+ ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta);
+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+
+ if (unlikely(ret)) {
+ /*
+ * If we error here, VFS caches are now inconsistent with btree
+ */
+ set_bit(EI_INODE_ERROR, &inode->ei_flags);
+ goto err;
+ }
+
+ bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
+ !bch2_journal_error(&c->journal), c,
+ "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
+ inode->v.i_ino, (u64) inode->v.i_blocks,
+ inode->ei_inode.bi_sectors);
+
+ ret = bch2_setattr_nonsize(idmap, inode, iattr);
+err:
+ bch2_pagecache_block_put(inode);
+ return bch2_err_class(ret);
+}
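Before calling bch2_truncate(), bchfs_truncate() flushes only as much of the page cache as the on-disk i_size update requires: everything above the on-disk size when the in-memory size is larger, or just the final partial page when the new size is not page aligned. A small standalone sketch of that range selection (hypothetical helper, assuming 4 KiB pages):

```c
#include <stdio.h>

#define PAGE_SIZE 4096ULL

/*
 * Illustration of the flush-range selection above: returns 1 and fills
 * [*lstart, *lend] when a filemap_write_and_wait_range() call would be made.
 */
static int flush_range_for_truncate(unsigned long long new_size,
				    unsigned long long ondisk_size,
				    unsigned long long *lstart,
				    unsigned long long *lend)
{
	if (new_size > ondisk_size) {
		*lstart = ondisk_size;
		*lend = new_size - 1;
		return 1;
	}
	if (new_size & (PAGE_SIZE - 1)) {
		*lstart = new_size & ~(PAGE_SIZE - 1);
		*lend = new_size - 1;
		return 1;
	}
	return 0;
}

int main(void)
{
	unsigned long long s, e;

	if (flush_range_for_truncate(10000, 8192, &s, &e))
		printf("flush [%llu, %llu]\n", s, e);	/* [8192, 9999] */
	if (flush_range_for_truncate(5000, 8192, &s, &e))
		printf("flush [%llu, %llu]\n", s, e);	/* [4096, 4999] */
	return 0;
}
```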
+
+/* fallocate: */
+
+static int inode_update_times_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi, void *p)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+ bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
+ return 0;
+}
+
+static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ u64 end = offset + len;
+ u64 block_start = round_up(offset, block_bytes(c));
+ u64 block_end = round_down(end, block_bytes(c));
+ bool truncated_last_page;
+ int ret = 0;
+
+ ret = bch2_truncate_folios(inode, offset, end);
+ if (unlikely(ret < 0))
+ goto err;
+
+ truncated_last_page = ret;
+
+ truncate_pagecache_range(&inode->v, offset, end - 1);
+
+ if (block_start < block_end) {
+ s64 i_sectors_delta = 0;
+
+ ret = bch2_fpunch(c, inode_inum(inode),
+ block_start >> 9, block_end >> 9,
+ &i_sectors_delta);
+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+ }
+
+ mutex_lock(&inode->ei_update_lock);
+ if (end >= inode->v.i_size && !truncated_last_page) {
+ ret = bch2_write_inode_size(c, inode, inode->v.i_size,
+ ATTR_MTIME|ATTR_CTIME);
+ } else {
+ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+ ATTR_MTIME|ATTR_CTIME);
+ }
+ mutex_unlock(&inode->ei_update_lock);
+err:
+ return ret;
+}
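bchfs_fpunch() splits the requested hole into an unaligned head and tail (zeroed within the affected folios by bch2_truncate_folios()) and the fully covered middle blocks (dropped from the extent btree by bch2_fpunch()). A tiny standalone illustration of the split, assuming a hypothetical 4 KiB block size:

```c
#include <stdio.h>

#define BLOCK_BYTES 4096ULL	/* hypothetical block size */

#define ROUND_UP(x, a)   ((((x) + (a) - 1) / (a)) * (a))
#define ROUND_DOWN(x, a) (((x) / (a)) * (a))

int main(void)
{
	unsigned long long offset = 1000, len = 20000, end = offset + len;
	unsigned long long block_start = ROUND_UP(offset, BLOCK_BYTES);
	unsigned long long block_end   = ROUND_DOWN(end, BLOCK_BYTES);

	printf("zero head  [%llu, %llu)\n", offset, block_start);
	printf("punch      [%llu, %llu) via the extent btree\n",
	       block_start, block_end);
	printf("zero tail  [%llu, %llu)\n", block_end, end);
	return 0;
}
```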
+
+static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
+ loff_t offset, loff_t len,
+ bool insert)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct address_space *mapping = inode->v.i_mapping;
+ s64 i_sectors_delta = 0;
+ int ret = 0;
+
+ if ((offset | len) & (block_bytes(c) - 1))
+ return -EINVAL;
+
+ if (insert) {
+ if (offset >= inode->v.i_size)
+ return -EINVAL;
+ } else {
+ if (offset + len >= inode->v.i_size)
+ return -EINVAL;
+ }
+
+ ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
+ if (ret)
+ return ret;
+
+ if (insert)
+ i_size_write(&inode->v, inode->v.i_size + len);
+
+ ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9,
+ insert, &i_sectors_delta);
+ if (!ret && !insert)
+ i_size_write(&inode->v, inode->v.i_size - len);
+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+
+ return ret;
+}
+
+static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
+ u64 start_sector, u64 end_sector)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bpos end_pos = POS(inode->v.i_ino, end_sector);
+ struct bch_io_opts opts;
+ int ret = 0;
+
+ bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ POS(inode->v.i_ino, start_sector),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+ while (!ret && bkey_lt(iter.pos, end_pos)) {
+ s64 i_sectors_delta = 0;
+ struct quota_res quota_res = { 0 };
+ struct bkey_s_c k;
+ unsigned sectors;
+ bool is_allocation;
+ u64 hole_start, hole_end;
+ u32 snapshot;
+
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans,
+ inode->ei_subvol, &snapshot);
+ if (ret)
+ goto bkey_err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ if ((ret = bkey_err(k)))
+ goto bkey_err;
+
+ hole_start = iter.pos.offset;
+ hole_end = bpos_min(k.k->p, end_pos).offset;
+ is_allocation = bkey_extent_is_allocation(k.k);
+
+ /* already reserved */
+ if (bkey_extent_is_reservation(k) &&
+ bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
+ bch2_btree_iter_advance(&iter);
+ continue;
+ }
+
+ if (bkey_extent_is_data(k.k) &&
+ !(mode & FALLOC_FL_ZERO_RANGE)) {
+ bch2_btree_iter_advance(&iter);
+ continue;
+ }
+
+ if (!(mode & FALLOC_FL_ZERO_RANGE)) {
+ /*
+ * Lock ordering - can't be holding btree locks while
+ * blocking on a folio lock:
+ */
+ if (bch2_clamp_data_hole(&inode->v,
+ &hole_start,
+ &hole_end,
+ opts.data_replicas, true))
+ ret = drop_locks_do(trans,
+ (bch2_clamp_data_hole(&inode->v,
+ &hole_start,
+ &hole_end,
+ opts.data_replicas, false), 0));
+ bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));
+
+ if (ret)
+ goto bkey_err;
+
+ if (hole_start == hole_end)
+ continue;
+ }
+
+ sectors = hole_end - hole_start;
+
+ if (!is_allocation) {
+ ret = bch2_quota_reservation_add(c, inode,
+ &quota_res, sectors, true);
+ if (unlikely(ret))
+ goto bkey_err;
+ }
+
+ ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter,
+ sectors, opts, &i_sectors_delta,
+ writepoint_hashed((unsigned long) current));
+ if (ret)
+ goto bkey_err;
+
+ bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
+
+ drop_locks_do(trans,
+ (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0));
+bkey_err:
+ bch2_quota_reservation_put(c, inode, &quota_res);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
+ }
+
+ if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) {
+ struct quota_res quota_res = { 0 };
+ s64 i_sectors_delta = 0;
+
+ bch2_fpunch_at(trans, &iter, inode_inum(inode),
+ end_sector, &i_sectors_delta);
+ bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
+ bch2_quota_reservation_put(c, inode, &quota_res);
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
+ loff_t offset, loff_t len)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ u64 end = offset + len;
+ u64 block_start = round_down(offset, block_bytes(c));
+ u64 block_end = round_up(end, block_bytes(c));
+ bool truncated_last_page = false;
+ int ret, ret2 = 0;
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
+ ret = inode_newsize_ok(&inode->v, end);
+ if (ret)
+ return ret;
+ }
+
+ if (mode & FALLOC_FL_ZERO_RANGE) {
+ ret = bch2_truncate_folios(inode, offset, end);
+ if (unlikely(ret < 0))
+ return ret;
+
+ truncated_last_page = ret;
+
+ truncate_pagecache_range(&inode->v, offset, end - 1);
+
+ block_start = round_up(offset, block_bytes(c));
+ block_end = round_down(end, block_bytes(c));
+ }
+
+ ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);
+
+ /*
+ * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update,
+ * so that the VFS cache i_size is consistent with the btree i_size:
+ */
+ if (ret &&
+ !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)))
+ return ret;
+
+ if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
+ end = inode->v.i_size;
+
+ if (end >= inode->v.i_size &&
+ (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
+ !(mode & FALLOC_FL_KEEP_SIZE))) {
+ spin_lock(&inode->v.i_lock);
+ i_size_write(&inode->v, end);
+ spin_unlock(&inode->v.i_lock);
+
+ mutex_lock(&inode->ei_update_lock);
+ ret2 = bch2_write_inode_size(c, inode, end, 0);
+ mutex_unlock(&inode->ei_update_lock);
+ }
+
+ return ret ?: ret2;
+}
+
+long bch2_fallocate_dispatch(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ long ret;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate))
+ return -EROFS;
+
+ inode_lock(&inode->v);
+ inode_dio_wait(&inode->v);
+ bch2_pagecache_block_get(inode);
+
+ ret = file_modified(file);
+ if (ret)
+ goto err;
+
+ if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
+ ret = bchfs_fallocate(inode, mode, offset, len);
+ else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
+ ret = bchfs_fpunch(inode, offset, len);
+ else if (mode == FALLOC_FL_INSERT_RANGE)
+ ret = bchfs_fcollapse_finsert(inode, offset, len, true);
+ else if (mode == FALLOC_FL_COLLAPSE_RANGE)
+ ret = bchfs_fcollapse_finsert(inode, offset, len, false);
+ else
+ ret = -EOPNOTSUPP;
+err:
+ bch2_pagecache_block_put(inode);
+ inode_unlock(&inode->v);
+ bch2_write_ref_put(c, BCH_WRITE_REF_fallocate);
+
+ return bch2_err_class(ret);
+}
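From userspace these paths are reached through fallocate(2); per the dispatch above, plain preallocation (with or without FALLOC_FL_KEEP_SIZE), FALLOC_FL_ZERO_RANGE, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, FALLOC_FL_INSERT_RANGE and FALLOC_FL_COLLAPSE_RANGE are accepted, and anything else returns EOPNOTSUPP. A minimal usage example (file path is hypothetical, error handling kept short):

```c
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/bcachefs/testfile", O_RDWR | O_CREAT, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* preallocate 1 MiB, extending i_size */
	if (fallocate(fd, 0, 0, 1 << 20))
		perror("fallocate");

	/* punch a hole in the middle; KEEP_SIZE is mandatory for punch */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      64 << 10, 128 << 10))
		perror("punch hole");

	/* zero a range, leaving reserved/unwritten extents behind */
	if (fallocate(fd, FALLOC_FL_ZERO_RANGE, 256 << 10, 64 << 10))
		perror("zero range");

	close(fd);
	return 0;
}
```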
+
+/*
+ * Take a quota reservation for unallocated blocks in a given file range.
+ * Does not check the pagecache.
+ */
+static int quota_reserve_range(struct bch_inode_info *inode,
+ struct quota_res *res,
+ u64 start, u64 end)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u32 snapshot;
+ u64 sectors = end - start;
+ u64 pos = start;
+ int ret;
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ SPOS(inode->v.i_ino, pos, snapshot), 0);
+
+ while (!(ret = btree_trans_too_many_iters(trans)) &&
+ (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
+ !(ret = bkey_err(k))) {
+ if (bkey_extent_is_allocation(k.k)) {
+ u64 s = min(end, k.k->p.offset) -
+ max(start, bkey_start_offset(k.k));
+ BUG_ON(s > sectors);
+ sectors -= s;
+ }
+ bch2_btree_iter_advance(&iter);
+ }
+ pos = iter.pos.offset;
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_put(trans);
+
+ return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
+}
+
+loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
+ struct file *file_dst, loff_t pos_dst,
+ loff_t len, unsigned remap_flags)
+{
+ struct bch_inode_info *src = file_bch_inode(file_src);
+ struct bch_inode_info *dst = file_bch_inode(file_dst);
+ struct bch_fs *c = src->v.i_sb->s_fs_info;
+ struct quota_res quota_res = { 0 };
+ s64 i_sectors_delta = 0;
+ u64 aligned_len;
+ loff_t ret = 0;
+
+ if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
+ return -EINVAL;
+
+ if (remap_flags & REMAP_FILE_DEDUP)
+ return -EOPNOTSUPP;
+
+ if ((pos_src & (block_bytes(c) - 1)) ||
+ (pos_dst & (block_bytes(c) - 1)))
+ return -EINVAL;
+
+ if (src == dst &&
+ abs(pos_src - pos_dst) < len)
+ return -EINVAL;
+
+ bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+
+ inode_dio_wait(&src->v);
+ inode_dio_wait(&dst->v);
+
+ ret = generic_remap_file_range_prep(file_src, pos_src,
+ file_dst, pos_dst,
+ &len, remap_flags);
+ if (ret < 0 || len == 0)
+ goto err;
+
+ aligned_len = round_up((u64) len, block_bytes(c));
+
+ ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping,
+ pos_dst, pos_dst + len - 1);
+ if (ret)
+ goto err;
+
+ ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9,
+ (pos_dst + aligned_len) >> 9);
+ if (ret)
+ goto err;
+
+ file_update_time(file_dst);
+
+ bch2_mark_pagecache_unallocated(src, pos_src >> 9,
+ (pos_src + aligned_len) >> 9);
+
+ ret = bch2_remap_range(c,
+ inode_inum(dst), pos_dst >> 9,
+ inode_inum(src), pos_src >> 9,
+ aligned_len >> 9,
+ pos_dst + len, &i_sectors_delta);
+ if (ret < 0)
+ goto err;
+
+ /*
+ * due to alignment, we might have remapped slightly more than requested
+ */
+ ret = min((u64) ret << 9, (u64) len);
+
+ bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
+
+ spin_lock(&dst->v.i_lock);
+ if (pos_dst + ret > dst->v.i_size)
+ i_size_write(&dst->v, pos_dst + ret);
+ spin_unlock(&dst->v.i_lock);
+
+ if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
+ IS_SYNC(file_inode(file_dst)))
+ ret = bch2_flush_inode(c, dst);
+err:
+ bch2_quota_reservation_put(c, dst, &quota_res);
+ bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+
+ return bch2_err_class(ret);
+}
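This is the backend for reflink-style clones: pos_src and pos_dst must be block aligned, and REMAP_FILE_DEDUP is rejected. From userspace the usual entry points are copy_file_range(2) or the FICLONE/FICLONERANGE ioctls; a minimal FICLONERANGE example (paths hypothetical, error handling minimal):

```c
#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int src = open("/mnt/bcachefs/src", O_RDONLY);
	int dst = open("/mnt/bcachefs/dst", O_RDWR | O_CREAT, 0644);
	if (src < 0 || dst < 0) {
		perror("open");
		return 1;
	}

	struct file_clone_range fcr = {
		.src_fd      = src,
		.src_offset  = 0,
		.src_length  = 0,	/* 0 means "up to EOF of the source" */
		.dest_offset = 0,
	};

	if (ioctl(dst, FICLONERANGE, &fcr))
		perror("FICLONERANGE");

	close(src);
	close(dst);
	return 0;
}
```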
+
+/* fseek: */
+
+static loff_t bch2_seek_data(struct file *file, u64 offset)
+{
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ subvol_inum inum = inode_inum(inode);
+ u64 isize, next_data = MAX_LFS_FILESIZE;
+ u32 snapshot;
+ int ret;
+
+ isize = i_size_read(&inode->v);
+ if (offset >= isize)
+ return -ENXIO;
+
+ trans = bch2_trans_get(c);
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents,
+ SPOS(inode->v.i_ino, offset >> 9, snapshot),
+ POS(inode->v.i_ino, U64_MAX),
+ 0, k, ret) {
+ if (bkey_extent_is_data(k.k)) {
+ next_data = max(offset, bkey_start_offset(k.k) << 9);
+ break;
+ } else if (k.k->p.offset >> 9 > isize)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_put(trans);
+ if (ret)
+ return ret;
+
+ if (next_data > offset)
+ next_data = bch2_seek_pagecache_data(&inode->v,
+ offset, next_data, 0, false);
+
+ if (next_data >= isize)
+ return -ENXIO;
+
+ return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
+}
+
+static loff_t bch2_seek_hole(struct file *file, u64 offset)
+{
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ subvol_inum inum = inode_inum(inode);
+ u64 isize, next_hole = MAX_LFS_FILESIZE;
+ u32 snapshot;
+ int ret;
+
+ isize = i_size_read(&inode->v);
+ if (offset >= isize)
+ return -ENXIO;
+
+ trans = bch2_trans_get(c);
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
+ SPOS(inode->v.i_ino, offset >> 9, snapshot),
+ BTREE_ITER_SLOTS, k, ret) {
+ if (k.k->p.inode != inode->v.i_ino) {
+ next_hole = bch2_seek_pagecache_hole(&inode->v,
+ offset, MAX_LFS_FILESIZE, 0, false);
+ break;
+ } else if (!bkey_extent_is_data(k.k)) {
+ next_hole = bch2_seek_pagecache_hole(&inode->v,
+ max(offset, bkey_start_offset(k.k) << 9),
+ k.k->p.offset << 9, 0, false);
+
+ if (next_hole < k.k->p.offset << 9)
+ break;
+ } else {
+ offset = max(offset, bkey_start_offset(k.k) << 9);
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_put(trans);
+ if (ret)
+ return ret;
+
+ if (next_hole > isize)
+ next_hole = isize;
+
+ return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
+}
+
+loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
+{
+ loff_t ret;
+
+ switch (whence) {
+ case SEEK_SET:
+ case SEEK_CUR:
+ case SEEK_END:
+ ret = generic_file_llseek(file, offset, whence);
+ break;
+ case SEEK_DATA:
+ ret = bch2_seek_data(file, offset);
+ break;
+ case SEEK_HOLE:
+ ret = bch2_seek_hole(file, offset);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return bch2_err_class(ret);
+}
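SEEK_DATA/SEEK_HOLE consult both the extent btree and the page cache, so dirty data that has not been written back yet still counts as data. A small userspace walker that enumerates the data segments of a file with lseek(2):

```c
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd = open(argc > 1 ? argv[1] : "testfile", O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	off_t data = 0;
	for (;;) {
		data = lseek(fd, data, SEEK_DATA);
		if (data < 0)
			break;			/* ENXIO: no more data */
		off_t hole = lseek(fd, data, SEEK_HOLE);
		if (hole < 0)
			break;
		printf("data [%lld, %lld)\n", (long long) data, (long long) hole);
		data = hole;
	}
	close(fd);
	return 0;
}
```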
+
+void bch2_fs_fsio_exit(struct bch_fs *c)
+{
+ bioset_exit(&c->nocow_flush_bioset);
+}
+
+int bch2_fs_fsio_init(struct bch_fs *c)
+{
+ if (bioset_init(&c->nocow_flush_bioset,
+ 1, offsetof(struct nocow_flush, bio), 0))
+ return -BCH_ERR_ENOMEM_nocow_flush_bioset_init;
+
+ return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
new file mode 100644
index 000000000000..ca70346e68dc
--- /dev/null
+++ b/fs/bcachefs/fs-io.h
@@ -0,0 +1,184 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_H
+#define _BCACHEFS_FS_IO_H
+
+#ifndef NO_BCACHEFS_FS
+
+#include "buckets.h"
+#include "fs.h"
+#include "io_write_types.h"
+#include "quota.h"
+
+#include <linux/uio.h>
+
+struct folio_vec {
+ struct folio *fv_folio;
+ size_t fv_offset;
+ size_t fv_len;
+};
+
+static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv)
+{
+
+ struct folio *folio = page_folio(bv.bv_page);
+ size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) +
+ bv.bv_offset;
+ size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len);
+
+ return (struct folio_vec) {
+ .fv_folio = folio,
+ .fv_offset = offset,
+ .fv_len = len,
+ };
+}
+
+static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio,
+ struct bvec_iter iter)
+{
+ return biovec_to_foliovec(bio_iter_iovec(bio, iter));
+}
+
+#define __bio_for_each_folio(bvl, bio, iter, start) \
+ for (iter = (start); \
+ (iter).bi_size && \
+ ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \
+ bio_advance_iter_single((bio), &(iter), (bvl).fv_len))
+
+/**
+ * bio_for_each_folio - iterate over folios within a bio
+ *
+ * Like other non-_all versions, this iterates over what bio->bi_iter currently
+ * points to. This version is for drivers, where the bio may have previously
+ * been split or cloned.
+ */
+#define bio_for_each_folio(bvl, bio, iter) \
+ __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter)
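A minimal sketch of how the iterator above would be used, assuming the usual block-layer headers are available and that `bio` is one the caller already owns; the loop only logs each folio segment:

```c
/* Illustrative only: walk the folios covered by a bio's current iterator. */
static void log_bio_folios(struct bio *bio)
{
	struct folio_vec fv;
	struct bvec_iter iter;

	bio_for_each_folio(fv, bio, iter)
		pr_debug("folio segment: offset %zu, len %zu\n",
			 fv.fv_offset, fv.fv_len);
}
```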
+
+struct quota_res {
+ u64 sectors;
+};
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+static inline void __bch2_quota_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res)
+{
+ BUG_ON(res->sectors > inode->ei_quota_reserved);
+
+ bch2_quota_acct(c, inode->ei_qid, Q_SPC,
+ -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
+ inode->ei_quota_reserved -= res->sectors;
+ res->sectors = 0;
+}
+
+static inline void bch2_quota_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res)
+{
+ if (res->sectors) {
+ mutex_lock(&inode->ei_quota_lock);
+ __bch2_quota_reservation_put(c, inode, res);
+ mutex_unlock(&inode->ei_quota_lock);
+ }
+}
+
+static inline int bch2_quota_reservation_add(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res,
+ u64 sectors,
+ bool check_enospc)
+{
+ int ret;
+
+ if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
+ return 0;
+
+ mutex_lock(&inode->ei_quota_lock);
+ ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
+ check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
+ if (likely(!ret)) {
+ inode->ei_quota_reserved += sectors;
+ res->sectors += sectors;
+ }
+ mutex_unlock(&inode->ei_quota_lock);
+
+ return ret;
+}
+
+#else
+
+static inline void __bch2_quota_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res) {}
+
+static inline void bch2_quota_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res) {}
+
+static inline int bch2_quota_reservation_add(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res,
+ unsigned sectors,
+ bool check_enospc)
+{
+ return 0;
+}
+
+#endif
+
+void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *,
+ struct quota_res *, s64);
+
+static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
+ struct quota_res *quota_res, s64 sectors)
+{
+ if (sectors) {
+ mutex_lock(&inode->ei_quota_lock);
+ __bch2_i_sectors_acct(c, inode, quota_res, sectors);
+ mutex_unlock(&inode->ei_quota_lock);
+ }
+}
+
+static inline struct address_space *faults_disabled_mapping(void)
+{
+ return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
+}
+
+static inline void set_fdm_dropped_locks(void)
+{
+ current->faults_disabled_mapping =
+ (void *) (((unsigned long) current->faults_disabled_mapping)|1);
+}
+
+static inline bool fdm_dropped_locks(void)
+{
+ return ((unsigned long) current->faults_disabled_mapping) & 1;
+}
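faults_disabled_mapping packs a flag into bit 0 of the stored pointer: mapping pointers are at least word aligned, so the low bit is always free to record "locks were dropped". A tiny standalone demonstration of the same tagging trick (hypothetical helper names, not the kernel API):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Store a one-bit flag in the (always zero) low bit of an aligned pointer. */
static void *tag_set(void *p)   { return (void *) ((uintptr_t) p | 1); }
static void *tag_clear(void *p) { return (void *) ((uintptr_t) p & ~(uintptr_t) 1); }
static int   tag_test(void *p)  { return (int) ((uintptr_t) p & 1); }

int main(void)
{
	long x;				/* any word-aligned object */
	void *p = &x;

	assert(!tag_test(p));
	p = tag_set(p);
	assert(tag_test(p));
	assert(tag_clear(p) == (void *) &x);
	printf("tagged pointer trick ok\n");
	return 0;
}
```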
+
+void bch2_inode_flush_nocow_writes_async(struct bch_fs *,
+ struct bch_inode_info *, struct closure *);
+
+int __must_check bch2_write_inode_size(struct bch_fs *,
+ struct bch_inode_info *,
+ loff_t, unsigned);
+
+int bch2_fsync(struct file *, loff_t, loff_t, int);
+
+int bchfs_truncate(struct mnt_idmap *,
+ struct bch_inode_info *, struct iattr *);
+long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
+
+loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
+ loff_t, loff_t, unsigned);
+
+loff_t bch2_llseek(struct file *, loff_t, int);
+
+void bch2_fs_fsio_exit(struct bch_fs *);
+int bch2_fs_fsio_init(struct bch_fs *);
+#else
+static inline void bch2_fs_fsio_exit(struct bch_fs *c) {}
+static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; }
+#endif
+
+#endif /* _BCACHEFS_FS_IO_H */
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
new file mode 100644
index 000000000000..14d5cc6f90d7
--- /dev/null
+++ b/fs/bcachefs/fs-ioctl.c
@@ -0,0 +1,570 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "chardev.h"
+#include "dirent.h"
+#include "fs.h"
+#include "fs-common.h"
+#include "fs-ioctl.h"
+#include "quota.h"
+
+#include <linux/compat.h>
+#include <linux/fsnotify.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include <linux/writeback.h>
+
+#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32)
+#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
+#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
+#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
+
+struct flags_set {
+ unsigned mask;
+ unsigned flags;
+
+ unsigned projid;
+
+ bool set_projinherit;
+ bool projinherit;
+};
+
+static int bch2_inode_flags_set(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ /*
+ * We're relying on btree locking here for exclusion with other ioctl
+ * calls - use the flags in the btree (@bi), not inode->i_flags:
+ */
+ struct flags_set *s = p;
+ unsigned newflags = s->flags;
+ unsigned oldflags = bi->bi_flags & s->mask;
+
+ if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) &&
+ !capable(CAP_LINUX_IMMUTABLE))
+ return -EPERM;
+
+ if (!S_ISREG(bi->bi_mode) &&
+ !S_ISDIR(bi->bi_mode) &&
+ (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags)
+ return -EINVAL;
+
+ if (s->set_projinherit) {
+ bi->bi_fields_set &= ~(1 << Inode_opt_project);
+ bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project);
+ }
+
+ bi->bi_flags &= ~s->mask;
+ bi->bi_flags |= newflags;
+
+ bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
+ return 0;
+}
+
+static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
+{
+ unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);
+
+ return put_user(flags, arg);
+}
+
+static int bch2_ioc_setflags(struct bch_fs *c,
+ struct file *file,
+ struct bch_inode_info *inode,
+ void __user *arg)
+{
+ struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) };
+ unsigned uflags;
+ int ret;
+
+ if (get_user(uflags, (int __user *) arg))
+ return -EFAULT;
+
+ s.flags = map_flags_rev(bch_flags_to_uflags, uflags);
+ if (uflags)
+ return -EOPNOTSUPP;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ inode_lock(&inode->v);
+ if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
+ ret = -EACCES;
+ goto setflags_out;
+ }
+
+ mutex_lock(&inode->ei_update_lock);
+ ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ bch2_write_inode(c, inode, bch2_inode_flags_set, &s,
+ ATTR_CTIME);
+ mutex_unlock(&inode->ei_update_lock);
+
+setflags_out:
+ inode_unlock(&inode->v);
+ mnt_drop_write_file(file);
+ return ret;
+}
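These handlers implement the standard FS_IOC_GETFLAGS/FS_IOC_SETFLAGS interface also used by chattr/lsattr. A minimal userspace example toggling the append-only flag (path hypothetical; per the check above, changing append/immutable needs CAP_LINUX_IMMUTABLE):

```c
#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/bcachefs/logfile", O_RDONLY);
	int flags;

	if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags)) {
		perror("FS_IOC_GETFLAGS");
		return 1;
	}

	flags |= FS_APPEND_FL;
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags))
		perror("FS_IOC_SETFLAGS");

	close(fd);
	return 0;
}
```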
+
+static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
+ struct fsxattr __user *arg)
+{
+ struct fsxattr fa = { 0 };
+
+ fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
+
+ if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
+ fa.fsx_xflags |= FS_XFLAG_PROJINHERIT;
+
+ fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
+
+ if (copy_to_user(arg, &fa, sizeof(fa)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int fssetxattr_inode_update_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct flags_set *s = p;
+
+ if (s->projid != bi->bi_project) {
+ bi->bi_fields_set |= 1U << Inode_opt_project;
+ bi->bi_project = s->projid;
+ }
+
+ return bch2_inode_flags_set(trans, inode, bi, p);
+}
+
+static int bch2_ioc_fssetxattr(struct bch_fs *c,
+ struct file *file,
+ struct bch_inode_info *inode,
+ struct fsxattr __user *arg)
+{
+ struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) };
+ struct fsxattr fa;
+ int ret;
+
+ if (copy_from_user(&fa, arg, sizeof(fa)))
+ return -EFAULT;
+
+ s.set_projinherit = true;
+ s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0;
+ fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
+
+ s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags);
+ if (fa.fsx_xflags)
+ return -EOPNOTSUPP;
+
+ if (fa.fsx_projid >= U32_MAX)
+ return -EINVAL;
+
+ /*
+ * inode fields accessible via the xattr interface are stored with a +1
+ * bias, so that 0 means unset:
+ */
+ s.projid = fa.fsx_projid + 1;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ inode_lock(&inode->v);
+ if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
+ ret = -EACCES;
+ goto err;
+ }
+
+ mutex_lock(&inode->ei_update_lock);
+ ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ bch2_set_projid(c, inode, fa.fsx_projid) ?:
+ bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
+ ATTR_CTIME);
+ mutex_unlock(&inode->ei_update_lock);
+err:
+ inode_unlock(&inode->v);
+ mnt_drop_write_file(file);
+ return ret;
+}
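FS_IOC_FSGETXATTR/FS_IOC_FSSETXATTR use the generic struct fsxattr from linux/fs.h; note that the +1 bias on the project ID is applied inside the handler, so userspace passes the raw ID. A minimal example assigning a project ID (path and ID hypothetical):

```c
#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/bcachefs/projdir", O_RDONLY);
	struct fsxattr fa;

	if (fd < 0 || ioctl(fd, FS_IOC_FSGETXATTR, &fa)) {
		perror("FS_IOC_FSGETXATTR");
		return 1;
	}

	fa.fsx_projid = 42;			/* hypothetical project id */
	fa.fsx_xflags |= FS_XFLAG_PROJINHERIT;	/* new children inherit it */

	if (ioctl(fd, FS_IOC_FSSETXATTR, &fa))
		perror("FS_IOC_FSSETXATTR");

	close(fd);
	return 0;
}
```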
+
+static int bch2_reinherit_attrs_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct bch_inode_info *dir = p;
+
+ return !bch2_reinherit_attrs(bi, &dir->ei_inode);
+}
+
+static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
+ struct file *file,
+ struct bch_inode_info *src,
+ const char __user *name)
+{
+ struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode);
+ struct bch_inode_info *dst;
+ struct inode *vinode = NULL;
+ char *kname = NULL;
+ struct qstr qstr;
+ int ret = 0;
+ subvol_inum inum;
+
+ kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
+ if (!kname)
+ return -ENOMEM;
+
+ ret = strncpy_from_user(kname, name, BCH_NAME_MAX);
+ if (unlikely(ret < 0))
+ goto err1;
+
+ qstr.len = ret;
+ qstr.name = kname;
+
+ ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum);
+ if (ret)
+ goto err1;
+
+ vinode = bch2_vfs_inode_get(c, inum);
+ ret = PTR_ERR_OR_ZERO(vinode);
+ if (ret)
+ goto err1;
+
+ dst = to_bch_ei(vinode);
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto err2;
+
+ bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst);
+
+ if (inode_attr_changing(src, dst, Inode_opt_project)) {
+ ret = bch2_fs_quota_transfer(c, dst,
+ src->ei_qid,
+ 1 << QTYP_PRJ,
+ KEY_TYPE_QUOTA_PREALLOC);
+ if (ret)
+ goto err3;
+ }
+
+ ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0);
+err3:
+ bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst);
+
+ /* return true if we did work */
+ if (ret >= 0)
+ ret = !ret;
+
+ mnt_drop_write_file(file);
+err2:
+ iput(vinode);
+err1:
+ kfree(kname);
+
+ return ret;
+}
+
+static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
+{
+ u32 flags;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (get_user(flags, arg))
+ return -EFAULT;
+
+ bch_notice(c, "shutdown by ioctl type %u", flags);
+
+ down_write(&c->vfs_sb->s_umount);
+
+ switch (flags) {
+ case FSOP_GOING_FLAGS_DEFAULT:
+ ret = freeze_bdev(c->vfs_sb->s_bdev);
+ if (ret)
+ goto err;
+
+ bch2_journal_flush(&c->journal);
+ c->vfs_sb->s_flags |= SB_RDONLY;
+ bch2_fs_emergency_read_only(c);
+ thaw_bdev(c->vfs_sb->s_bdev);
+ break;
+
+ case FSOP_GOING_FLAGS_LOGFLUSH:
+ bch2_journal_flush(&c->journal);
+ fallthrough;
+
+ case FSOP_GOING_FLAGS_NOLOGFLUSH:
+ c->vfs_sb->s_flags |= SB_RDONLY;
+ bch2_fs_emergency_read_only(c);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+err:
+ up_write(&c->vfs_sb->s_umount);
+ return ret;
+}
+
+static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+ struct bch_ioctl_subvolume arg)
+{
+ struct inode *dir;
+ struct bch_inode_info *inode;
+ struct user_namespace *s_user_ns;
+ struct dentry *dst_dentry;
+ struct path src_path, dst_path;
+ int how = LOOKUP_FOLLOW;
+ int error;
+ subvol_inum snapshot_src = { 0 };
+ unsigned lookup_flags = 0;
+ unsigned create_flags = BCH_CREATE_SUBVOL;
+
+ if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE|
+ BCH_SUBVOL_SNAPSHOT_RO))
+ return -EINVAL;
+
+ if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+ (arg.src_ptr ||
+ (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)))
+ return -EINVAL;
+
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+ create_flags |= BCH_CREATE_SNAPSHOT;
+
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
+ create_flags |= BCH_CREATE_SNAPSHOT_RO;
+
+ /* why do we need this lock? */
+ down_read(&c->vfs_sb->s_umount);
+
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+ sync_inodes_sb(c->vfs_sb);
+retry:
+ if (arg.src_ptr) {
+ error = user_path_at(arg.dirfd,
+ (const char __user *)(unsigned long)arg.src_ptr,
+ how, &src_path);
+ if (error)
+ goto err1;
+
+ if (src_path.dentry->d_sb->s_fs_info != c) {
+ path_put(&src_path);
+ error = -EXDEV;
+ goto err1;
+ }
+
+ snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode));
+ }
+
+ dst_dentry = user_path_create(arg.dirfd,
+ (const char __user *)(unsigned long)arg.dst_ptr,
+ &dst_path, lookup_flags);
+ error = PTR_ERR_OR_ZERO(dst_dentry);
+ if (error)
+ goto err2;
+
+ if (dst_dentry->d_sb->s_fs_info != c) {
+ error = -EXDEV;
+ goto err3;
+ }
+
+ if (dst_dentry->d_inode) {
+ error = -EEXIST;
+ goto err3;
+ }
+
+ dir = dst_path.dentry->d_inode;
+ if (IS_DEADDIR(dir)) {
+ error = -BCH_ERR_ENOENT_directory_dead;
+ goto err3;
+ }
+
+ s_user_ns = dir->i_sb->s_user_ns;
+ if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
+ !kgid_has_mapping(s_user_ns, current_fsgid())) {
+ error = -EOVERFLOW;
+ goto err3;
+ }
+
+ error = inode_permission(file_mnt_idmap(filp),
+ dir, MAY_WRITE | MAY_EXEC);
+ if (error)
+ goto err3;
+
+ if (!IS_POSIXACL(dir))
+ arg.mode &= ~current_umask();
+
+ error = security_path_mkdir(&dst_path, dst_dentry, arg.mode);
+ if (error)
+ goto err3;
+
+ if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+ !arg.src_ptr)
+ snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol;
+
+ inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir),
+ dst_dentry, arg.mode|S_IFDIR,
+ 0, snapshot_src, create_flags);
+ error = PTR_ERR_OR_ZERO(inode);
+ if (error)
+ goto err3;
+
+ d_instantiate(dst_dentry, &inode->v);
+ fsnotify_mkdir(dir, dst_dentry);
+err3:
+ done_path_create(&dst_path, dst_dentry);
+err2:
+ if (arg.src_ptr)
+ path_put(&src_path);
+
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
+err1:
+ up_read(&c->vfs_sb->s_umount);
+
+ return error;
+}
+
+static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+ struct bch_ioctl_subvolume arg)
+{
+ down_write(&c->snapshot_create_lock);
+ long ret = __bch2_ioctl_subvolume_create(c, filp, arg);
+ up_write(&c->snapshot_create_lock);
+
+ return ret;
+}
+
+static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
+ struct bch_ioctl_subvolume arg)
+{
+ struct path path;
+ struct inode *dir;
+ int ret = 0;
+
+ if (arg.flags)
+ return -EINVAL;
+
+ ret = user_path_at(arg.dirfd,
+ (const char __user *)(unsigned long)arg.dst_ptr,
+ LOOKUP_FOLLOW, &path);
+ if (ret)
+ return ret;
+
+ if (path.dentry->d_sb->s_fs_info != c) {
+ ret = -EXDEV;
+ goto err;
+ }
+
+ dir = path.dentry->d_parent->d_inode;
+
+ ret = __bch2_unlink(dir, path.dentry, true);
+ if (ret)
+ goto err;
+
+ fsnotify_rmdir(dir, path.dentry);
+ d_delete(path.dentry);
+err:
+ path_put(&path);
+ return ret;
+}
+
+long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ long ret;
+
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ ret = bch2_ioc_getflags(inode, (int __user *) arg);
+ break;
+
+ case FS_IOC_SETFLAGS:
+ ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg);
+ break;
+
+ case FS_IOC_FSGETXATTR:
+ ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg);
+ break;
+
+ case FS_IOC_FSSETXATTR:
+ ret = bch2_ioc_fssetxattr(c, file, inode,
+ (void __user *) arg);
+ break;
+
+ case BCHFS_IOC_REINHERIT_ATTRS:
+ ret = bch2_ioc_reinherit_attrs(c, file, inode,
+ (void __user *) arg);
+ break;
+
+ case FS_IOC_GETVERSION:
+ ret = -ENOTTY;
+ break;
+
+ case FS_IOC_SETVERSION:
+ ret = -ENOTTY;
+ break;
+
+ case FS_IOC_GOINGDOWN:
+ ret = bch2_ioc_goingdown(c, (u32 __user *) arg);
+ break;
+
+ case BCH_IOCTL_SUBVOLUME_CREATE: {
+ struct bch_ioctl_subvolume i;
+
+ ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
+ ? -EFAULT
+ : bch2_ioctl_subvolume_create(c, file, i);
+ break;
+ }
+
+ case BCH_IOCTL_SUBVOLUME_DESTROY: {
+ struct bch_ioctl_subvolume i;
+
+ ret = copy_from_user(&i, (void __user *) arg, sizeof(i))
+ ? -EFAULT
+ : bch2_ioctl_subvolume_destroy(c, file, i);
+ break;
+ }
+
+ default:
+ ret = bch2_fs_ioctl(c, cmd, (void __user *) arg);
+ break;
+ }
+
+ return bch2_err_class(ret);
+}
+
+#ifdef CONFIG_COMPAT
+long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+ /* These are just misnamed, they actually get/put from/to user an int */
+ switch (cmd) {
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
+ break;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+ return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h
new file mode 100644
index 000000000000..d30f9bb056fd
--- /dev/null
+++ b/fs/bcachefs/fs-ioctl.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IOCTL_H
+#define _BCACHEFS_FS_IOCTL_H
+
+/* Inode flags: */
+
+/* bcachefs inode flags -> vfs inode flags: */
+static const __maybe_unused unsigned bch_flags_to_vfs[] = {
+ [__BCH_INODE_sync] = S_SYNC,
+ [__BCH_INODE_immutable] = S_IMMUTABLE,
+ [__BCH_INODE_append] = S_APPEND,
+ [__BCH_INODE_noatime] = S_NOATIME,
+};
+
+/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
+static const __maybe_unused unsigned bch_flags_to_uflags[] = {
+ [__BCH_INODE_sync] = FS_SYNC_FL,
+ [__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
+ [__BCH_INODE_append] = FS_APPEND_FL,
+ [__BCH_INODE_nodump] = FS_NODUMP_FL,
+ [__BCH_INODE_noatime] = FS_NOATIME_FL,
+};
+
+/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
+static const __maybe_unused unsigned bch_flags_to_xflags[] = {
+ [__BCH_INODE_sync] = FS_XFLAG_SYNC,
+ [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE,
+ [__BCH_INODE_append] = FS_XFLAG_APPEND,
+ [__BCH_INODE_nodump] = FS_XFLAG_NODUMP,
+ [__BCH_INODE_noatime] = FS_XFLAG_NOATIME,
+ //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
+};
+
+#define set_flags(_map, _in, _out) \
+do { \
+ unsigned _i; \
+ \
+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
+ if ((_in) & (1 << _i)) \
+ (_out) |= _map[_i]; \
+ else \
+ (_out) &= ~_map[_i]; \
+} while (0)
+
+#define map_flags(_map, _in) \
+({ \
+ unsigned _out = 0; \
+ \
+ set_flags(_map, _in, _out); \
+ _out; \
+})
+
+#define map_flags_rev(_map, _in) \
+({ \
+ unsigned _i, _out = 0; \
+ \
+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
+ if ((_in) & _map[_i]) { \
+ (_out) |= 1 << _i; \
+ (_in) &= ~_map[_i]; \
+ } \
+ (_out); \
+})
+
+#define map_defined(_map) \
+({ \
+ unsigned _in = ~0; \
+ \
+ map_flags_rev(_map, _in); \
+})
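These macros just translate between bit positions in one flag word and arbitrary bit values in another, using the arrays above as lookup tables; bits left over in map_flags_rev()'s input are how the ioctl handlers detect unsupported flags. A standalone miniature of the same idea (toy flag values, not the real tables):

```c
#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* toy mapping: internal flag bit i -> external flag value map[i] */
static const unsigned map[] = { 0x10, 0x200, 0x4000 };

static unsigned map_fwd(unsigned in)
{
	unsigned i, out = 0;

	for (i = 0; i < ARRAY_SIZE(map); i++)
		if (in & (1U << i))
			out |= map[i];
	return out;
}

static unsigned map_rev(unsigned *in)
{
	unsigned i, out = 0;

	for (i = 0; i < ARRAY_SIZE(map); i++)
		if (*in & map[i]) {
			out |= 1U << i;
			*in &= ~map[i];	/* leftover bits = unsupported flags */
		}
	return out;
}

int main(void)
{
	unsigned external = 0x10 | 0x4000;
	unsigned internal = map_rev(&external);

	printf("internal %#x, unsupported leftover %#x\n", internal, external);
	printf("back to external %#x\n", map_fwd(internal));
	return 0;
}
```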
+
+/* Set VFS inode flags from bcachefs inode: */
+static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
+{
+ set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
+}
+
+long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
+long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);
+
+#endif /* _BCACHEFS_FS_IOCTL_H */
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
new file mode 100644
index 000000000000..49da8db1d9e9
--- /dev/null
+++ b/fs/bcachefs/fs.c
@@ -0,0 +1,2010 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "acl.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "chardev.h"
+#include "dirent.h"
+#include "errcode.h"
+#include "extents.h"
+#include "fs.h"
+#include "fs-common.h"
+#include "fs-io.h"
+#include "fs-ioctl.h"
+#include "fs-io-buffered.h"
+#include "fs-io-direct.h"
+#include "fs-io-pagecache.h"
+#include "fsck.h"
+#include "inode.h"
+#include "io_read.h"
+#include "journal.h"
+#include "keylist.h"
+#include "quota.h"
+#include "snapshot.h"
+#include "super.h"
+#include "xattr.h"
+
+#include <linux/aio.h>
+#include <linux/backing-dev.h>
+#include <linux/exportfs.h>
+#include <linux/fiemap.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/posix_acl.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+#include <linux/xattr.h>
+
+static struct kmem_cache *bch2_inode_cache;
+
+static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
+ struct bch_inode_info *,
+ struct bch_inode_unpacked *,
+ struct bch_subvolume *);
+
+void bch2_inode_update_after_write(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ unsigned fields)
+{
+ struct bch_fs *c = trans->c;
+
+ BUG_ON(bi->bi_inum != inode->v.i_ino);
+
+ bch2_assert_pos_locked(trans, BTREE_ID_inodes,
+ POS(0, bi->bi_inum),
+ c->opts.inodes_use_key_cache);
+
+ set_nlink(&inode->v, bch2_inode_nlink_get(bi));
+ i_uid_write(&inode->v, bi->bi_uid);
+ i_gid_write(&inode->v, bi->bi_gid);
+ inode->v.i_mode = bi->bi_mode;
+
+ if (fields & ATTR_ATIME)
+ inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
+ if (fields & ATTR_MTIME)
+ inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
+ if (fields & ATTR_CTIME)
+ inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
+
+ inode->ei_inode = *bi;
+
+ bch2_inode_flags_to_vfs(inode);
+}
+
+int __must_check bch2_write_inode(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ inode_set_fn set,
+ void *p, unsigned fields)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter = { NULL };
+ struct bch_inode_unpacked inode_u;
+ int ret;
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
+ BTREE_ITER_INTENT) ?:
+ (set ? set(trans, inode, &inode_u, p) : 0) ?:
+ bch2_inode_write(trans, &iter, &inode_u) ?:
+ bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+
+ /*
+ * the btree node lock protects inode->ei_inode, not ei_update_lock;
+ * this is important for inode updates via bchfs_write_index_update
+ */
+ if (!ret)
+ bch2_inode_update_after_write(trans, inode, &inode_u, fields);
+
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
+ "inode %u:%llu not found when updating",
+ inode_inum(inode).subvol,
+ inode_inum(inode).inum);
+
+ bch2_trans_put(trans);
+ return ret < 0 ? ret : 0;
+}
+
+int bch2_fs_quota_transfer(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct bch_qid new_qid,
+ unsigned qtypes,
+ enum quota_acct_mode mode)
+{
+ unsigned i;
+ int ret;
+
+ qtypes &= enabled_qtypes(c);
+
+ for (i = 0; i < QTYP_NR; i++)
+ if (new_qid.q[i] == inode->ei_qid.q[i])
+ qtypes &= ~(1U << i);
+
+ if (!qtypes)
+ return 0;
+
+ mutex_lock(&inode->ei_quota_lock);
+
+ ret = bch2_quota_transfer(c, qtypes, new_qid,
+ inode->ei_qid,
+ inode->v.i_blocks +
+ inode->ei_quota_reserved,
+ mode);
+ if (!ret)
+ for (i = 0; i < QTYP_NR; i++)
+ if (qtypes & (1 << i))
+ inode->ei_qid.q[i] = new_qid.q[i];
+
+ mutex_unlock(&inode->ei_quota_lock);
+
+ return ret;
+}
+
+static int bch2_iget5_test(struct inode *vinode, void *p)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ subvol_inum *inum = p;
+
+ return inode->ei_subvol == inum->subvol &&
+ inode->ei_inode.bi_inum == inum->inum;
+}
+
+static int bch2_iget5_set(struct inode *vinode, void *p)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ subvol_inum *inum = p;
+
+ inode->v.i_ino = inum->inum;
+ inode->ei_subvol = inum->subvol;
+ inode->ei_inode.bi_inum = inum->inum;
+ return 0;
+}
+
+static unsigned bch2_inode_hash(subvol_inum inum)
+{
+ return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
+}
+
+struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
+{
+ struct bch_inode_unpacked inode_u;
+ struct bch_inode_info *inode;
+ struct btree_trans *trans;
+ struct bch_subvolume subvol;
+ int ret;
+
+ inode = to_bch_ei(iget5_locked(c->vfs_sb,
+ bch2_inode_hash(inum),
+ bch2_iget5_test,
+ bch2_iget5_set,
+ &inum));
+ if (unlikely(!inode))
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->v.i_state & I_NEW))
+ return &inode->v;
+
+ trans = bch2_trans_get(c);
+ ret = lockrestart_do(trans,
+ bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+ bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
+
+ if (!ret)
+ bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+ bch2_trans_put(trans);
+
+ if (ret) {
+ iget_failed(&inode->v);
+ return ERR_PTR(bch2_err_class(ret));
+ }
+
+ mutex_lock(&c->vfs_inodes_lock);
+ list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
+ mutex_unlock(&c->vfs_inodes_lock);
+
+ unlock_new_inode(&inode->v);
+
+ return &inode->v;
+}
+
+struct bch_inode_info *
+__bch2_create(struct mnt_idmap *idmap,
+ struct bch_inode_info *dir, struct dentry *dentry,
+ umode_t mode, dev_t rdev, subvol_inum snapshot_src,
+ unsigned flags)
+{
+ struct bch_fs *c = dir->v.i_sb->s_fs_info;
+ struct btree_trans *trans;
+ struct bch_inode_unpacked dir_u;
+ struct bch_inode_info *inode, *old;
+ struct bch_inode_unpacked inode_u;
+ struct posix_acl *default_acl = NULL, *acl = NULL;
+ subvol_inum inum;
+ struct bch_subvolume subvol;
+ u64 journal_seq = 0;
+ int ret;
+
+ /*
+ * preallocate acls + vfs inode before btree transaction, so that
+ * nothing can fail after the transaction succeeds:
+ */
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+ ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
+ if (ret)
+ return ERR_PTR(ret);
+#endif
+ inode = to_bch_ei(new_inode(c->vfs_sb));
+ if (unlikely(!inode)) {
+ inode = ERR_PTR(-ENOMEM);
+ goto err;
+ }
+
+ bch2_inode_init_early(c, &inode_u);
+
+ if (!(flags & BCH_CREATE_TMPFILE))
+ mutex_lock(&dir->ei_update_lock);
+
+ trans = bch2_trans_get(c);
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?:
+ bch2_create_trans(trans,
+ inode_inum(dir), &dir_u, &inode_u,
+ !(flags & BCH_CREATE_TMPFILE)
+ ? &dentry->d_name : NULL,
+ from_kuid(i_user_ns(&dir->v), current_fsuid()),
+ from_kgid(i_user_ns(&dir->v), current_fsgid()),
+ mode, rdev,
+ default_acl, acl, snapshot_src, flags) ?:
+ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
+ KEY_TYPE_QUOTA_PREALLOC);
+ if (unlikely(ret))
+ goto err_before_quota;
+
+ inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
+ inum.inum = inode_u.bi_inum;
+
+ ret = bch2_subvolume_get(trans, inum.subvol, true,
+ BTREE_ITER_WITH_UPDATES, &subvol) ?:
+ bch2_trans_commit(trans, NULL, &journal_seq, 0);
+ if (unlikely(ret)) {
+ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
+ KEY_TYPE_QUOTA_WARN);
+err_before_quota:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+ goto err_trans;
+ }
+
+ if (!(flags & BCH_CREATE_TMPFILE)) {
+ bch2_inode_update_after_write(trans, dir, &dir_u,
+ ATTR_MTIME|ATTR_CTIME);
+ mutex_unlock(&dir->ei_update_lock);
+ }
+
+ bch2_iget5_set(&inode->v, &inum);
+ bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+
+ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
+ set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
+
+ /*
+ * we must insert the new inode into the inode cache before calling
+ * bch2_trans_exit() and dropping locks, else we could race with another
+ * thread pulling the inode in and modifying it:
+ */
+
+ inode->v.i_state |= I_CREATING;
+
+ old = to_bch_ei(inode_insert5(&inode->v,
+ bch2_inode_hash(inum),
+ bch2_iget5_test,
+ bch2_iget5_set,
+ &inum));
+ BUG_ON(!old);
+
+ if (unlikely(old != inode)) {
+ /*
+ * We raced, another process pulled the new inode into cache
+ * before us:
+ */
+ make_bad_inode(&inode->v);
+ iput(&inode->v);
+
+ inode = old;
+ } else {
+ mutex_lock(&c->vfs_inodes_lock);
+ list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
+ mutex_unlock(&c->vfs_inodes_lock);
+ /*
+ * we really don't want insert_inode_locked2() to be setting
+ * I_NEW...
+ */
+ unlock_new_inode(&inode->v);
+ }
+
+ bch2_trans_put(trans);
+err:
+ posix_acl_release(default_acl);
+ posix_acl_release(acl);
+ return inode;
+err_trans:
+ if (!(flags & BCH_CREATE_TMPFILE))
+ mutex_unlock(&dir->ei_update_lock);
+
+ bch2_trans_put(trans);
+ make_bad_inode(&inode->v);
+ iput(&inode->v);
+ inode = ERR_PTR(ret);
+ goto err;
+}
+
+/* methods */
+
+static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct bch_fs *c = vdir->i_sb->s_fs_info;
+ struct bch_inode_info *dir = to_bch_ei(vdir);
+ struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
+ struct inode *vinode = NULL;
+ subvol_inum inum = { .subvol = 1 };
+ int ret;
+
+ ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
+ &dentry->d_name, &inum);
+
+ if (!ret)
+ vinode = bch2_vfs_inode_get(c, inum);
+
+ return d_splice_alias(vinode, dentry);
+}
+
+static int bch2_mknod(struct mnt_idmap *idmap,
+ struct inode *vdir, struct dentry *dentry,
+ umode_t mode, dev_t rdev)
+{
+ struct bch_inode_info *inode =
+ __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev,
+ (subvol_inum) { 0 }, 0);
+
+ if (IS_ERR(inode))
+ return bch2_err_class(PTR_ERR(inode));
+
+ d_instantiate(dentry, &inode->v);
+ return 0;
+}
+
+static int bch2_create(struct mnt_idmap *idmap,
+ struct inode *vdir, struct dentry *dentry,
+ umode_t mode, bool excl)
+{
+ return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0);
+}
+
+static int __bch2_link(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct bch_inode_info *dir,
+ struct dentry *dentry)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bch_inode_unpacked dir_u, inode_u;
+ int ret;
+
+ mutex_lock(&inode->ei_update_lock);
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_link_trans(trans,
+ inode_inum(dir), &dir_u,
+ inode_inum(inode), &inode_u,
+ &dentry->d_name));
+
+ if (likely(!ret)) {
+ bch2_inode_update_after_write(trans, dir, &dir_u,
+ ATTR_MTIME|ATTR_CTIME);
+ bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
+ }
+
+ bch2_trans_put(trans);
+ mutex_unlock(&inode->ei_update_lock);
+ return ret;
+}
+
+static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
+ struct dentry *dentry)
+{
+ struct bch_fs *c = vdir->i_sb->s_fs_info;
+ struct bch_inode_info *dir = to_bch_ei(vdir);
+ struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
+ int ret;
+
+ lockdep_assert_held(&inode->v.i_rwsem);
+
+ ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
+ bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ __bch2_link(c, inode, dir, dentry);
+ if (unlikely(ret))
+ return ret;
+
+ ihold(&inode->v);
+ d_instantiate(dentry, &inode->v);
+ return 0;
+}
+
+int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
+ bool deleting_snapshot)
+{
+ struct bch_fs *c = vdir->i_sb->s_fs_info;
+ struct bch_inode_info *dir = to_bch_ei(vdir);
+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+ struct bch_inode_unpacked dir_u, inode_u;
+ struct btree_trans *trans = bch2_trans_get(c);
+ int ret;
+
+ bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
+
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL,
+ bch2_unlink_trans(trans,
+ inode_inum(dir), &dir_u,
+ &inode_u, &dentry->d_name,
+ deleting_snapshot));
+ if (unlikely(ret))
+ goto err;
+
+ bch2_inode_update_after_write(trans, dir, &dir_u,
+ ATTR_MTIME|ATTR_CTIME);
+ bch2_inode_update_after_write(trans, inode, &inode_u,
+ ATTR_MTIME);
+
+ if (inode_u.bi_subvol) {
+ /*
+ * Subvolume deletion is asynchronous, but we still want to tell
+ * the VFS that it's been deleted here:
+ */
+ set_nlink(&inode->v, 0);
+ }
+err:
+ bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
+ bch2_trans_put(trans);
+
+ return ret;
+}
+
+static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
+{
+ struct bch_inode_info *dir = to_bch_ei(vdir);
+ struct bch_fs *c = dir->v.i_sb->s_fs_info;
+
+ return bch2_subvol_is_ro(c, dir->ei_subvol) ?:
+ __bch2_unlink(vdir, dentry, false);
+}
+
+static int bch2_symlink(struct mnt_idmap *idmap,
+ struct inode *vdir, struct dentry *dentry,
+ const char *symname)
+{
+ struct bch_fs *c = vdir->i_sb->s_fs_info;
+ struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
+ int ret;
+
+ inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
+ (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
+ if (IS_ERR(inode))
+ return bch2_err_class(PTR_ERR(inode));
+
+ inode_lock(&inode->v);
+ ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
+ inode_unlock(&inode->v);
+
+ if (unlikely(ret))
+ goto err;
+
+ ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
+ if (unlikely(ret))
+ goto err;
+
+ ret = __bch2_link(c, inode, dir, dentry);
+ if (unlikely(ret))
+ goto err;
+
+ d_instantiate(dentry, &inode->v);
+ return 0;
+err:
+ iput(&inode->v);
+ return ret;
+}
+
+static int bch2_mkdir(struct mnt_idmap *idmap,
+ struct inode *vdir, struct dentry *dentry, umode_t mode)
+{
+ return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
+}
+
+static int bch2_rename2(struct mnt_idmap *idmap,
+ struct inode *src_vdir, struct dentry *src_dentry,
+ struct inode *dst_vdir, struct dentry *dst_dentry,
+ unsigned flags)
+{
+ struct bch_fs *c = src_vdir->i_sb->s_fs_info;
+ struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
+ struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
+ struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
+ struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
+ struct bch_inode_unpacked dst_dir_u, src_dir_u;
+ struct bch_inode_unpacked src_inode_u, dst_inode_u;
+ struct btree_trans *trans;
+ enum bch_rename_mode mode = flags & RENAME_EXCHANGE
+ ? BCH_RENAME_EXCHANGE
+ : dst_dentry->d_inode
+ ? BCH_RENAME_OVERWRITE : BCH_RENAME;
+ int ret;
+
+ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
+ return -EINVAL;
+
+ if (mode == BCH_RENAME_OVERWRITE) {
+ ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
+ 0, LLONG_MAX);
+ if (ret)
+ return ret;
+ }
+
+ trans = bch2_trans_get(c);
+
+ bch2_lock_inodes(INODE_UPDATE_LOCK,
+ src_dir,
+ dst_dir,
+ src_inode,
+ dst_inode);
+
+ ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?:
+ bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol);
+ if (ret)
+ goto err;
+
+ if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
+ ret = bch2_fs_quota_transfer(c, src_inode,
+ dst_dir->ei_qid,
+ 1 << QTYP_PRJ,
+ KEY_TYPE_QUOTA_PREALLOC);
+ if (ret)
+ goto err;
+ }
+
+ if (mode == BCH_RENAME_EXCHANGE &&
+ inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
+ ret = bch2_fs_quota_transfer(c, dst_inode,
+ src_dir->ei_qid,
+ 1 << QTYP_PRJ,
+ KEY_TYPE_QUOTA_PREALLOC);
+ if (ret)
+ goto err;
+ }
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_rename_trans(trans,
+ inode_inum(src_dir), &src_dir_u,
+ inode_inum(dst_dir), &dst_dir_u,
+ &src_inode_u,
+ &dst_inode_u,
+ &src_dentry->d_name,
+ &dst_dentry->d_name,
+ mode));
+ if (unlikely(ret))
+ goto err;
+
+ BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
+ BUG_ON(dst_inode &&
+ dst_inode->v.i_ino != dst_inode_u.bi_inum);
+
+ bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
+ ATTR_MTIME|ATTR_CTIME);
+
+ if (src_dir != dst_dir)
+ bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
+ ATTR_MTIME|ATTR_CTIME);
+
+ bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
+ ATTR_CTIME);
+
+ if (dst_inode)
+ bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
+ ATTR_CTIME);
+err:
+ bch2_trans_put(trans);
+
+ bch2_fs_quota_transfer(c, src_inode,
+ bch_qid(&src_inode->ei_inode),
+ 1 << QTYP_PRJ,
+ KEY_TYPE_QUOTA_NOCHECK);
+ if (dst_inode)
+ bch2_fs_quota_transfer(c, dst_inode,
+ bch_qid(&dst_inode->ei_inode),
+ 1 << QTYP_PRJ,
+ KEY_TYPE_QUOTA_NOCHECK);
+
+ bch2_unlock_inodes(INODE_UPDATE_LOCK,
+ src_dir,
+ dst_dir,
+ src_inode,
+ dst_inode);
+
+ return ret;
+}
+
+static void bch2_setattr_copy(struct mnt_idmap *idmap,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ struct iattr *attr)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ unsigned int ia_valid = attr->ia_valid;
+
+ if (ia_valid & ATTR_UID)
+ bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
+ if (ia_valid & ATTR_GID)
+ bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
+
+ if (ia_valid & ATTR_SIZE)
+ bi->bi_size = attr->ia_size;
+
+ if (ia_valid & ATTR_ATIME)
+ bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
+ if (ia_valid & ATTR_MTIME)
+ bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
+ if (ia_valid & ATTR_CTIME)
+ bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
+
+ if (ia_valid & ATTR_MODE) {
+ umode_t mode = attr->ia_mode;
+ kgid_t gid = ia_valid & ATTR_GID
+ ? attr->ia_gid
+ : inode->v.i_gid;
+
+ if (!in_group_p(gid) &&
+ !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
+ mode &= ~S_ISGID;
+ bi->bi_mode = mode;
+ }
+}
+
+int bch2_setattr_nonsize(struct mnt_idmap *idmap,
+ struct bch_inode_info *inode,
+ struct iattr *attr)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_qid qid;
+ struct btree_trans *trans;
+ struct btree_iter inode_iter = { NULL };
+ struct bch_inode_unpacked inode_u;
+ struct posix_acl *acl = NULL;
+ int ret;
+
+ mutex_lock(&inode->ei_update_lock);
+
+ qid = inode->ei_qid;
+
+ if (attr->ia_valid & ATTR_UID)
+ qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
+
+ if (attr->ia_valid & ATTR_GID)
+ qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
+
+ ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
+ KEY_TYPE_QUOTA_PREALLOC);
+ if (ret)
+ goto err;
+
+ trans = bch2_trans_get(c);
+retry:
+ bch2_trans_begin(trans);
+ kfree(acl);
+ acl = NULL;
+
+ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto btree_err;
+
+ bch2_setattr_copy(idmap, inode, &inode_u, attr);
+
+ if (attr->ia_valid & ATTR_MODE) {
+ ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
+ inode_u.bi_mode, &acl);
+ if (ret)
+ goto btree_err;
+ }
+
+ ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+btree_err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+ if (unlikely(ret))
+ goto err_trans;
+
+ bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
+
+ if (acl)
+ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
+err_trans:
+ bch2_trans_put(trans);
+err:
+ mutex_unlock(&inode->ei_update_lock);
+
+ return bch2_err_class(ret);
+}
+
+static int bch2_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned query_flags)
+{
+ struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+ stat->dev = inode->v.i_sb->s_dev;
+ stat->ino = inode->v.i_ino;
+ stat->mode = inode->v.i_mode;
+ stat->nlink = inode->v.i_nlink;
+ stat->uid = inode->v.i_uid;
+ stat->gid = inode->v.i_gid;
+ stat->rdev = inode->v.i_rdev;
+ stat->size = i_size_read(&inode->v);
+ stat->atime = inode_get_atime(&inode->v);
+ stat->mtime = inode_get_mtime(&inode->v);
+ stat->ctime = inode_get_ctime(&inode->v);
+ stat->blksize = block_bytes(c);
+ stat->blocks = inode->v.i_blocks;
+
+ if (request_mask & STATX_BTIME) {
+ stat->result_mask |= STATX_BTIME;
+ stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
+ }
+
+ if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ stat->attributes_mask |= STATX_ATTR_IMMUTABLE;
+
+ if (inode->ei_inode.bi_flags & BCH_INODE_append)
+ stat->attributes |= STATX_ATTR_APPEND;
+ stat->attributes_mask |= STATX_ATTR_APPEND;
+
+ if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
+ stat->attributes |= STATX_ATTR_NODUMP;
+ stat->attributes_mask |= STATX_ATTR_NODUMP;
+
+ return 0;
+}
+
+static int bch2_setattr(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct iattr *iattr)
+{
+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ int ret;
+
+ lockdep_assert_held(&inode->v.i_rwsem);
+
+ ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ setattr_prepare(idmap, dentry, iattr);
+ if (ret)
+ return ret;
+
+ return iattr->ia_valid & ATTR_SIZE
+ ? bchfs_truncate(idmap, inode, iattr)
+ : bch2_setattr_nonsize(idmap, inode, iattr);
+}
+
+static int bch2_tmpfile(struct mnt_idmap *idmap,
+ struct inode *vdir, struct file *file, umode_t mode)
+{
+ struct bch_inode_info *inode =
+ __bch2_create(idmap, to_bch_ei(vdir),
+ file->f_path.dentry, mode, 0,
+ (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
+
+ if (IS_ERR(inode))
+ return bch2_err_class(PTR_ERR(inode));
+
+ d_mark_tmpfile(file, &inode->v);
+ d_instantiate(file->f_path.dentry, &inode->v);
+ return finish_open_simple(file, 0);
+}
+
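+/*
+ * Report a single bkey to the fiemap interface: extents with direct data emit
+ * one fiemap extent per pointer, while inline data and reservations are
+ * reported as a single extent with the appropriate flags.
+ */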
+static int bch2_fill_extent(struct bch_fs *c,
+ struct fiemap_extent_info *info,
+ struct bkey_s_c k, unsigned flags)
+{
+ if (bkey_extent_is_direct_data(k.k)) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ int ret;
+
+ if (k.k->type == KEY_TYPE_reflink_v)
+ flags |= FIEMAP_EXTENT_SHARED;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ int flags2 = 0;
+ u64 offset = p.ptr.offset;
+
+ if (p.ptr.unwritten)
+ flags2 |= FIEMAP_EXTENT_UNWRITTEN;
+
+ if (p.crc.compression_type)
+ flags2 |= FIEMAP_EXTENT_ENCODED;
+ else
+ offset += p.crc.offset;
+
+ if ((offset & (block_sectors(c) - 1)) ||
+ (k.k->size & (block_sectors(c) - 1)))
+ flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
+
+ ret = fiemap_fill_next_extent(info,
+ bkey_start_offset(k.k) << 9,
+ offset << 9,
+ k.k->size << 9, flags|flags2);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+ } else if (bkey_extent_is_inline_data(k.k)) {
+ return fiemap_fill_next_extent(info,
+ bkey_start_offset(k.k) << 9,
+ 0, k.k->size << 9,
+ flags|
+ FIEMAP_EXTENT_DATA_INLINE);
+ } else if (k.k->type == KEY_TYPE_reservation) {
+ return fiemap_fill_next_extent(info,
+ bkey_start_offset(k.k) << 9,
+ 0, k.k->size << 9,
+ flags|
+ FIEMAP_EXTENT_DELALLOC|
+ FIEMAP_EXTENT_UNWRITTEN);
+ } else {
+ BUG();
+ }
+}
+
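+/*
+ * Walk the extents btree for the fiemap ioctl, resolving indirect (reflink)
+ * extents as we go. Extents are reported one behind the iterator so that the
+ * final one can be flagged FIEMAP_EXTENT_LAST.
+ */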
+static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
+ u64 start, u64 len)
+{
+ struct bch_fs *c = vinode->i_sb->s_fs_info;
+ struct bch_inode_info *ei = to_bch_ei(vinode);
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_buf cur, prev;
+ struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
+ unsigned offset_into_extent, sectors;
+ bool have_extent = false;
+ u32 snapshot;
+ int ret = 0;
+
+ ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
+ if (ret)
+ return ret;
+
+ if (start + len < start)
+ return -EINVAL;
+
+ start >>= 9;
+
+ bch2_bkey_buf_init(&cur);
+ bch2_bkey_buf_init(&prev);
+ trans = bch2_trans_get(c);
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ SPOS(ei->v.i_ino, start, snapshot), 0);
+
+ while (!(ret = btree_trans_too_many_iters(trans)) &&
+ (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
+ !(ret = bkey_err(k))) {
+ enum btree_id data_btree = BTREE_ID_extents;
+
+ if (!bkey_extent_is_data(k.k) &&
+ k.k->type != KEY_TYPE_reservation) {
+ bch2_btree_iter_advance(&iter);
+ continue;
+ }
+
+ offset_into_extent = iter.pos.offset -
+ bkey_start_offset(k.k);
+ sectors = k.k->size - offset_into_extent;
+
+ bch2_bkey_buf_reassemble(&cur, c, k);
+
+ ret = bch2_read_indirect_extent(trans, &data_btree,
+ &offset_into_extent, &cur);
+ if (ret)
+ break;
+
+ k = bkey_i_to_s_c(cur.k);
+ bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
+
+ sectors = min(sectors, k.k->size - offset_into_extent);
+
+ bch2_cut_front(POS(k.k->p.inode,
+ bkey_start_offset(k.k) +
+ offset_into_extent),
+ cur.k);
+ bch2_key_resize(&cur.k->k, sectors);
+ cur.k->k.p = iter.pos;
+ cur.k->k.p.offset += cur.k->k.size;
+
+ if (have_extent) {
+ bch2_trans_unlock(trans);
+ ret = bch2_fill_extent(c, info,
+ bkey_i_to_s_c(prev.k), 0);
+ if (ret)
+ break;
+ }
+
+ bkey_copy(prev.k, cur.k);
+ have_extent = true;
+
+ bch2_btree_iter_set_pos(&iter,
+ POS(iter.pos.inode, iter.pos.offset + sectors));
+ }
+ start = iter.pos.offset;
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ if (!ret && have_extent) {
+ bch2_trans_unlock(trans);
+ ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
+ FIEMAP_EXTENT_LAST);
+ }
+
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&cur, c);
+ bch2_bkey_buf_exit(&prev, c);
+ return ret < 0 ? ret : 0;
+}
+
+static const struct vm_operations_struct bch_vm_ops = {
+ .fault = bch2_page_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = bch2_page_mkwrite,
+};
+
+static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ file_accessed(file);
+
+ vma->vm_ops = &bch_vm_ops;
+ return 0;
+}
+
+/* Directories: */
+
+static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ return generic_file_llseek_size(file, offset, whence,
+ S64_MAX, S64_MAX);
+}
+
+static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ int ret;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ ret = bch2_readdir(c, inode_inum(inode), ctx);
+ if (ret)
+ bch_err_fn(c, ret);
+
+ return bch2_err_class(ret);
+}
+
+static int bch2_open(struct inode *vinode, struct file *file)
+{
+ if (file->f_flags & (O_WRONLY|O_RDWR)) {
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+ int ret = bch2_subvol_is_ro(c, inode->ei_subvol);
+ if (ret)
+ return ret;
+ }
+
+ return generic_file_open(vinode, file);
+}
+
+static const struct file_operations bch_file_operations = {
+ .open = bch2_open,
+ .llseek = bch2_llseek,
+ .read_iter = bch2_read_iter,
+ .write_iter = bch2_write_iter,
+ .mmap = bch2_mmap,
+ .fsync = bch2_fsync,
+ .splice_read = filemap_splice_read,
+ .splice_write = iter_file_splice_write,
+ .fallocate = bch2_fallocate_dispatch,
+ .unlocked_ioctl = bch2_fs_file_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = bch2_compat_fs_ioctl,
+#endif
+ .remap_file_range = bch2_remap_file_range,
+};
+
+static const struct inode_operations bch_file_inode_operations = {
+ .getattr = bch2_getattr,
+ .setattr = bch2_setattr,
+ .fiemap = bch2_fiemap,
+ .listxattr = bch2_xattr_list,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+ .get_acl = bch2_get_acl,
+ .set_acl = bch2_set_acl,
+#endif
+};
+
+static const struct inode_operations bch_dir_inode_operations = {
+ .lookup = bch2_lookup,
+ .create = bch2_create,
+ .link = bch2_link,
+ .unlink = bch2_unlink,
+ .symlink = bch2_symlink,
+ .mkdir = bch2_mkdir,
+ .rmdir = bch2_unlink,
+ .mknod = bch2_mknod,
+ .rename = bch2_rename2,
+ .getattr = bch2_getattr,
+ .setattr = bch2_setattr,
+ .tmpfile = bch2_tmpfile,
+ .listxattr = bch2_xattr_list,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+ .get_acl = bch2_get_acl,
+ .set_acl = bch2_set_acl,
+#endif
+};
+
+static const struct file_operations bch_dir_file_operations = {
+ .llseek = bch2_dir_llseek,
+ .read = generic_read_dir,
+ .iterate_shared = bch2_vfs_readdir,
+ .fsync = bch2_fsync,
+ .unlocked_ioctl = bch2_fs_file_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = bch2_compat_fs_ioctl,
+#endif
+};
+
+static const struct inode_operations bch_symlink_inode_operations = {
+ .get_link = page_get_link,
+ .getattr = bch2_getattr,
+ .setattr = bch2_setattr,
+ .listxattr = bch2_xattr_list,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+ .get_acl = bch2_get_acl,
+ .set_acl = bch2_set_acl,
+#endif
+};
+
+static const struct inode_operations bch_special_inode_operations = {
+ .getattr = bch2_getattr,
+ .setattr = bch2_setattr,
+ .listxattr = bch2_xattr_list,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+ .get_acl = bch2_get_acl,
+ .set_acl = bch2_set_acl,
+#endif
+};
+
+static const struct address_space_operations bch_address_space_operations = {
+ .read_folio = bch2_read_folio,
+ .writepages = bch2_writepages,
+ .readahead = bch2_readahead,
+ .dirty_folio = filemap_dirty_folio,
+ .write_begin = bch2_write_begin,
+ .write_end = bch2_write_end,
+ .invalidate_folio = bch2_invalidate_folio,
+ .release_folio = bch2_release_folio,
+ .direct_IO = noop_direct_IO,
+#ifdef CONFIG_MIGRATION
+ .migrate_folio = filemap_migrate_folio,
+#endif
+ .error_remove_page = generic_error_remove_page,
+};
+
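+/*
+ * NFS export support: a bcachefs file handle encodes the inode number, the
+ * subvolume it lives in and the inode generation, optionally followed by the
+ * same triple for the parent directory.
+ */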
+struct bcachefs_fid {
+ u64 inum;
+ u32 subvol;
+ u32 gen;
+} __packed;
+
+struct bcachefs_fid_with_parent {
+ struct bcachefs_fid fid;
+ struct bcachefs_fid dir;
+} __packed;
+
+static int bcachefs_fid_valid(int fh_len, int fh_type)
+{
+ switch (fh_type) {
+ case FILEID_BCACHEFS_WITHOUT_PARENT:
+ return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
+ case FILEID_BCACHEFS_WITH_PARENT:
+ return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
+ default:
+ return false;
+ }
+}
+
+static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
+{
+ return (struct bcachefs_fid) {
+ .inum = inode->ei_inode.bi_inum,
+ .subvol = inode->ei_subvol,
+ .gen = inode->ei_inode.bi_generation,
+ };
+}
+
+static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
+ struct inode *vdir)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_inode_info *dir = to_bch_ei(vdir);
+ int min_len;
+
+ if (!S_ISDIR(inode->v.i_mode) && dir) {
+ struct bcachefs_fid_with_parent *fid = (void *) fh;
+
+ min_len = sizeof(*fid) / sizeof(u32);
+ if (*len < min_len) {
+ *len = min_len;
+ return FILEID_INVALID;
+ }
+
+ fid->fid = bch2_inode_to_fid(inode);
+ fid->dir = bch2_inode_to_fid(dir);
+
+ *len = min_len;
+ return FILEID_BCACHEFS_WITH_PARENT;
+ } else {
+ struct bcachefs_fid *fid = (void *) fh;
+
+ min_len = sizeof(*fid) / sizeof(u32);
+ if (*len < min_len) {
+ *len = min_len;
+ return FILEID_INVALID;
+ }
+ *fid = bch2_inode_to_fid(inode);
+
+ *len = min_len;
+ return FILEID_BCACHEFS_WITHOUT_PARENT;
+ }
+}
+
+static struct inode *bch2_nfs_get_inode(struct super_block *sb,
+ struct bcachefs_fid fid)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
+ .subvol = fid.subvol,
+ .inum = fid.inum,
+ });
+ if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
+ iput(vinode);
+ vinode = ERR_PTR(-ESTALE);
+ }
+ return vinode;
+}
+
+static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
+ int fh_len, int fh_type)
+{
+ struct bcachefs_fid *fid = (void *) _fid;
+
+ if (!bcachefs_fid_valid(fh_len, fh_type))
+ return NULL;
+
+ return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
+}
+
+static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
+ int fh_len, int fh_type)
+{
+ struct bcachefs_fid_with_parent *fid = (void *) _fid;
+
+ if (!bcachefs_fid_valid(fh_len, fh_type) ||
+ fh_type != FILEID_BCACHEFS_WITH_PARENT)
+ return NULL;
+
+ return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
+}
+
+static struct dentry *bch2_get_parent(struct dentry *child)
+{
+ struct bch_inode_info *inode = to_bch_ei(child->d_inode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ subvol_inum parent_inum = {
+ .subvol = inode->ei_inode.bi_parent_subvol ?:
+ inode->ei_subvol,
+ .inum = inode->ei_inode.bi_dir,
+ };
+
+ return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
+}
+
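+/*
+ * export_operations->get_name: find @child's name in @parent. The common case
+ * uses the dirent backpointer stored in the inode (bi_dir/bi_dir_offset); for
+ * hardlinked files whose backpointer refers to a different directory we fall
+ * back to scanning @parent's dirents.
+ */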
+static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
+{
+ struct bch_inode_info *inode = to_bch_ei(child->d_inode);
+ struct bch_inode_info *dir = to_bch_ei(parent->d_inode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct btree_trans *trans;
+ struct btree_iter iter1;
+ struct btree_iter iter2;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent d;
+ struct bch_inode_unpacked inode_u;
+ subvol_inum target;
+ u32 snapshot;
+ struct qstr dirent_name;
+ unsigned name_len = 0;
+ int ret;
+
+ if (!S_ISDIR(dir->v.i_mode))
+ return -EINVAL;
+
+ trans = bch2_trans_get(c);
+
+ bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
+ POS(dir->ei_inode.bi_inum, 0), 0);
+ bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
+ POS(dir->ei_inode.bi_inum, 0), 0);
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter1, snapshot);
+ bch2_btree_iter_set_snapshot(&iter2, snapshot);
+
+ ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
+ if (ret)
+ goto err;
+
+ if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
+ bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
+
+ k = bch2_btree_iter_peek_slot(&iter1);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_dirent) {
+ ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+ goto err;
+ }
+
+ d = bkey_s_c_to_dirent(k);
+ ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
+ if (ret > 0)
+ ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+ if (ret)
+ goto err;
+
+ if (target.subvol == inode->ei_subvol &&
+ target.inum == inode->ei_inode.bi_inum)
+ goto found;
+ } else {
+ /*
+ * The file has multiple hardlinks and our backref points to the wrong
+ * directory - fall back to a linear search:
+ */
+ for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
+ if (k.k->p.inode > dir->ei_inode.bi_inum)
+ break;
+
+ if (k.k->type != KEY_TYPE_dirent)
+ continue;
+
+ d = bkey_s_c_to_dirent(k);
+ ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
+ if (ret < 0)
+ break;
+ if (ret)
+ continue;
+
+ if (target.subvol == inode->ei_subvol &&
+ target.inum == inode->ei_inode.bi_inum)
+ goto found;
+ }
+ }
+
+ ret = -ENOENT;
+ goto err;
+found:
+ dirent_name = bch2_dirent_get_name(d);
+
+ name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
+ memcpy(name, dirent_name.name, name_len);
+ name[name_len] = '\0';
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_iter_exit(trans, &iter1);
+ bch2_trans_iter_exit(trans, &iter2);
+ bch2_trans_put(trans);
+
+ return ret;
+}
+
+static const struct export_operations bch_export_ops = {
+ .encode_fh = bch2_encode_fh,
+ .fh_to_dentry = bch2_fh_to_dentry,
+ .fh_to_parent = bch2_fh_to_parent,
+ .get_parent = bch2_get_parent,
+ .get_name = bch2_get_name,
+};
+
+static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ struct bch_subvolume *subvol)
+{
+ bch2_inode_update_after_write(trans, inode, bi, ~0);
+
+ if (BCH_SUBVOLUME_SNAP(subvol))
+ set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
+ else
+ clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
+
+ inode->v.i_blocks = bi->bi_sectors;
+ inode->v.i_ino = bi->bi_inum;
+ inode->v.i_rdev = bi->bi_dev;
+ inode->v.i_generation = bi->bi_generation;
+ inode->v.i_size = bi->bi_size;
+
+ inode->ei_flags = 0;
+ inode->ei_quota_reserved = 0;
+ inode->ei_qid = bch_qid(bi);
+ inode->ei_subvol = inum.subvol;
+
+ inode->v.i_mapping->a_ops = &bch_address_space_operations;
+
+ switch (inode->v.i_mode & S_IFMT) {
+ case S_IFREG:
+ inode->v.i_op = &bch_file_inode_operations;
+ inode->v.i_fop = &bch_file_operations;
+ break;
+ case S_IFDIR:
+ inode->v.i_op = &bch_dir_inode_operations;
+ inode->v.i_fop = &bch_dir_file_operations;
+ break;
+ case S_IFLNK:
+ inode_nohighmem(&inode->v);
+ inode->v.i_op = &bch_symlink_inode_operations;
+ break;
+ default:
+ init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
+ inode->v.i_op = &bch_special_inode_operations;
+ break;
+ }
+
+ mapping_set_large_folios(inode->v.i_mapping);
+}
+
+static struct inode *bch2_alloc_inode(struct super_block *sb)
+{
+ struct bch_inode_info *inode;
+
+ inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
+ if (!inode)
+ return NULL;
+
+ inode_init_once(&inode->v);
+ mutex_init(&inode->ei_update_lock);
+ two_state_lock_init(&inode->ei_pagecache_lock);
+ INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
+ mutex_init(&inode->ei_quota_lock);
+
+ return &inode->v;
+}
+
+static void bch2_i_callback(struct rcu_head *head)
+{
+ struct inode *vinode = container_of(head, struct inode, i_rcu);
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+
+ kmem_cache_free(bch2_inode_cache, inode);
+}
+
+static void bch2_destroy_inode(struct inode *vinode)
+{
+ call_rcu(&vinode->i_rcu, bch2_i_callback);
+}
+
+static int inode_update_times_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+ bi->bi_atime = timespec_to_bch2_time(c, inode_get_atime(&inode->v));
+ bi->bi_mtime = timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
+ bi->bi_ctime = timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
+
+ return 0;
+}
+
+static int bch2_vfs_write_inode(struct inode *vinode,
+ struct writeback_control *wbc)
+{
+ struct bch_fs *c = vinode->i_sb->s_fs_info;
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ int ret;
+
+ mutex_lock(&inode->ei_update_lock);
+ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+ ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
+ mutex_unlock(&inode->ei_update_lock);
+
+ return bch2_err_class(ret);
+}
+
+static void bch2_evict_inode(struct inode *vinode)
+{
+ struct bch_fs *c = vinode->i_sb->s_fs_info;
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+
+ truncate_inode_pages_final(&inode->v.i_data);
+
+ clear_inode(&inode->v);
+
+ BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
+
+ if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
+ KEY_TYPE_QUOTA_WARN);
+ bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
+ KEY_TYPE_QUOTA_WARN);
+ bch2_inode_rm(c, inode_inum(inode));
+ }
+
+ mutex_lock(&c->vfs_inodes_lock);
+ list_del_init(&inode->ei_vfs_inode_list);
+ mutex_unlock(&c->vfs_inodes_lock);
+}
+
+void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
+{
+ struct bch_inode_info *inode, **i;
+ DARRAY(struct bch_inode_info *) grabbed;
+ bool clean_pass = false, this_pass_clean;
+
+ /*
+ * Initially, we scan for inodes without I_DONTCACHE, then mark them to
+ * be pruned with d_mark_dontcache().
+ *
+ * Once we've had a clean pass where we didn't find any inodes without
+ * I_DONTCACHE, we wait for them to be freed:
+ */
+
+ darray_init(&grabbed);
+ darray_make_room(&grabbed, 1024);
+again:
+ cond_resched();
+ this_pass_clean = true;
+
+ mutex_lock(&c->vfs_inodes_lock);
+ list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
+ if (!snapshot_list_has_id(s, inode->ei_subvol))
+ continue;
+
+ if (!(inode->v.i_state & I_DONTCACHE) &&
+ !(inode->v.i_state & I_FREEING) &&
+ igrab(&inode->v)) {
+ this_pass_clean = false;
+
+ if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
+ iput(&inode->v);
+ break;
+ }
+ } else if (clean_pass && this_pass_clean) {
+ wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
+ DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
+
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&c->vfs_inodes_lock);
+
+ schedule();
+ finish_wait(wq, &wait.wq_entry);
+ goto again;
+ }
+ }
+ mutex_unlock(&c->vfs_inodes_lock);
+
+ darray_for_each(grabbed, i) {
+ inode = *i;
+ d_mark_dontcache(&inode->v);
+ d_prune_aliases(&inode->v);
+ iput(&inode->v);
+ }
+ grabbed.nr = 0;
+
+ if (!clean_pass || !this_pass_clean) {
+ clean_pass = this_pass_clean;
+ goto again;
+ }
+
+ darray_exit(&grabbed);
+}
+
+static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct bch_fs *c = sb->s_fs_info;
+ struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
+ unsigned shift = sb->s_blocksize_bits - 9;
+ /*
+ * this assumes inodes take up 64 bytes, which is a decent average
+ * number: capacity and used are in 512-byte sectors, so shifting the
+ * free sectors left by 3 (512 / 64 == 8) estimates how many more
+ * inodes could be created
+ */
+ u64 avail_inodes = ((usage.capacity - usage.used) << 3);
+ u64 fsid;
+
+ buf->f_type = BCACHEFS_STATFS_MAGIC;
+ buf->f_bsize = sb->s_blocksize;
+ buf->f_blocks = usage.capacity >> shift;
+ buf->f_bfree = usage.free >> shift;
+ buf->f_bavail = avail_factor(usage.free) >> shift;
+
+ buf->f_files = usage.nr_inodes + avail_inodes;
+ buf->f_ffree = avail_inodes;
+
+ fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
+ le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
+ buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
+ buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+ buf->f_namelen = BCH_NAME_MAX;
+
+ return 0;
+}
+
+static int bch2_sync_fs(struct super_block *sb, int wait)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ int ret;
+
+ if (c->opts.journal_flush_disabled)
+ return 0;
+
+ if (!wait) {
+ bch2_journal_flush_async(&c->journal, NULL);
+ return 0;
+ }
+
+ ret = bch2_journal_flush(&c->journal);
+ return bch2_err_class(ret);
+}
+
+static struct bch_fs *bch2_path_to_fs(const char *path)
+{
+ struct bch_fs *c;
+ dev_t dev;
+ int ret;
+
+ ret = lookup_bdev(path, &dev);
+ if (ret)
+ return ERR_PTR(ret);
+
+ c = bch2_dev_to_fs(dev);
+ if (c)
+ closure_put(&c->cl);
+ return c ?: ERR_PTR(-ENOENT);
+}
+
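+/*
+ * Multi-device filesystems are mounted as "dev1:dev2:dev3": split the mount
+ * string into a NULL-terminated array of device paths, all backed by a single
+ * kstrdup()'d buffer - which is why callers free devs[0] and then devs.
+ */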
+static char **split_devs(const char *_dev_name, unsigned *nr)
+{
+ char *dev_name = NULL, **devs = NULL, *s;
+ size_t i = 0, nr_devs = 0;
+
+ dev_name = kstrdup(_dev_name, GFP_KERNEL);
+ if (!dev_name)
+ return NULL;
+
+ for (s = dev_name; s; s = strchr(s + 1, ':'))
+ nr_devs++;
+
+ devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL);
+ if (!devs) {
+ kfree(dev_name);
+ return NULL;
+ }
+
+ while ((s = strsep(&dev_name, ":")))
+ devs[i++] = s;
+
+ *nr = nr_devs;
+ return devs;
+}
+
+static int bch2_remount(struct super_block *sb, int *flags, char *data)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct bch_opts opts = bch2_opts_empty();
+ int ret;
+
+ opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
+
+ ret = bch2_parse_mount_opts(c, &opts, data);
+ if (ret)
+ goto err;
+
+ if (opts.read_only != c->opts.read_only) {
+ down_write(&c->state_lock);
+
+ if (opts.read_only) {
+ bch2_fs_read_only(c);
+
+ sb->s_flags |= SB_RDONLY;
+ } else {
+ ret = bch2_fs_read_write(c);
+ if (ret) {
+ bch_err(c, "error going rw: %i", ret);
+ up_write(&c->state_lock);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ sb->s_flags &= ~SB_RDONLY;
+ }
+
+ c->opts.read_only = opts.read_only;
+
+ up_write(&c->state_lock);
+ }
+
+ if (opt_defined(opts, errors))
+ c->opts.errors = opts.errors;
+err:
+ return bch2_err_class(ret);
+}
+
+static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
+{
+ struct bch_fs *c = root->d_sb->s_fs_info;
+ struct bch_dev *ca;
+ unsigned i;
+ bool first = true;
+
+ for_each_online_member(ca, c, i) {
+ if (!first)
+ seq_putc(seq, ':');
+ first = false;
+ seq_puts(seq, ca->disk_sb.sb_name);
+ }
+
+ return 0;
+}
+
+static int bch2_show_options(struct seq_file *seq, struct dentry *root)
+{
+ struct bch_fs *c = root->d_sb->s_fs_info;
+ enum bch_opt_id i;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ for (i = 0; i < bch2_opts_nr; i++) {
+ const struct bch_option *opt = &bch2_opt_table[i];
+ u64 v = bch2_opt_get_by_id(&c->opts, i);
+
+ if (!(opt->flags & OPT_MOUNT))
+ continue;
+
+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
+ continue;
+
+ printbuf_reset(&buf);
+ bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
+ OPT_SHOW_MOUNT_STYLE);
+ seq_putc(seq, ',');
+ seq_puts(seq, buf.buf);
+ }
+
+ if (buf.allocation_failure)
+ ret = -ENOMEM;
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static void bch2_put_super(struct super_block *sb)
+{
+ struct bch_fs *c = sb->s_fs_info;
+
+ __bch2_fs_stop(c);
+}
+
+/*
+ * bcachefs doesn't currently integrate intwrite freeze protection but the
+ * internal write references serve the same purpose. Therefore reuse the
+ * read-only transition code to perform the quiesce. The caveat is that we don't
+ * currently have the ability to block tasks that want a write reference while
+ * the superblock is frozen. This is fine for now, but we should either add
+ * blocking support or find a way to integrate sb_start_intwrite() and friends.
+ */
+static int bch2_freeze(struct super_block *sb)
+{
+ struct bch_fs *c = sb->s_fs_info;
+
+ down_write(&c->state_lock);
+ bch2_fs_read_only(c);
+ up_write(&c->state_lock);
+ return 0;
+}
+
+static int bch2_unfreeze(struct super_block *sb)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ int ret;
+
+ if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+ return 0;
+
+ down_write(&c->state_lock);
+ ret = bch2_fs_read_write(c);
+ up_write(&c->state_lock);
+ return ret;
+}
+
+static const struct super_operations bch_super_operations = {
+ .alloc_inode = bch2_alloc_inode,
+ .destroy_inode = bch2_destroy_inode,
+ .write_inode = bch2_vfs_write_inode,
+ .evict_inode = bch2_evict_inode,
+ .sync_fs = bch2_sync_fs,
+ .statfs = bch2_statfs,
+ .show_devname = bch2_show_devname,
+ .show_options = bch2_show_options,
+ .remount_fs = bch2_remount,
+ .put_super = bch2_put_super,
+ .freeze_fs = bch2_freeze,
+ .unfreeze_fs = bch2_unfreeze,
+};
+
+static int bch2_set_super(struct super_block *s, void *data)
+{
+ s->s_fs_info = data;
+ return 0;
+}
+
+static int bch2_noset_super(struct super_block *s, void *data)
+{
+ return -EBUSY;
+}
+
+static int bch2_test_super(struct super_block *s, void *data)
+{
+ struct bch_fs *c = s->s_fs_info;
+ struct bch_fs **devs = data;
+ unsigned i;
+
+ if (!c)
+ return false;
+
+ for (i = 0; devs[i]; i++)
+ if (c != devs[i])
+ return false;
+ return true;
+}
+
+static struct dentry *bch2_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ struct bch_fs *c;
+ struct bch_dev *ca;
+ struct super_block *sb;
+ struct inode *vinode;
+ struct bch_opts opts = bch2_opts_empty();
+ char **devs;
+ struct bch_fs **devs_to_fs = NULL;
+ unsigned i, nr_devs;
+ int ret;
+
+ opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
+
+ ret = bch2_parse_mount_opts(NULL, &opts, data);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (!dev_name || strlen(dev_name) == 0)
+ return ERR_PTR(-EINVAL);
+
+ devs = split_devs(dev_name, &nr_devs);
+ if (!devs)
+ return ERR_PTR(-ENOMEM);
+
+ devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL);
+ if (!devs_to_fs) {
+ sb = ERR_PTR(-ENOMEM);
+ goto got_sb;
+ }
+
+ for (i = 0; i < nr_devs; i++)
+ devs_to_fs[i] = bch2_path_to_fs(devs[i]);
+
+ sb = sget(fs_type, bch2_test_super, bch2_noset_super,
+ flags|SB_NOSEC, devs_to_fs);
+ if (!IS_ERR(sb))
+ goto got_sb;
+
+ c = bch2_fs_open(devs, nr_devs, opts);
+ if (IS_ERR(c)) {
+ sb = ERR_CAST(c);
+ goto got_sb;
+ }
+
+ /* Some options can't be parsed until after the fs is started: */
+ ret = bch2_parse_mount_opts(c, &opts, data);
+ if (ret) {
+ bch2_fs_stop(c);
+ sb = ERR_PTR(ret);
+ goto got_sb;
+ }
+
+ bch2_opts_apply(&c->opts, opts);
+
+ sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
+ if (IS_ERR(sb))
+ bch2_fs_stop(c);
+got_sb:
+ kfree(devs_to_fs);
+ kfree(devs[0]);
+ kfree(devs);
+
+ if (IS_ERR(sb)) {
+ ret = PTR_ERR(sb);
+ ret = bch2_err_class(ret);
+ return ERR_PTR(ret);
+ }
+
+ c = sb->s_fs_info;
+
+ if (sb->s_root) {
+ if ((flags ^ sb->s_flags) & SB_RDONLY) {
+ ret = -EBUSY;
+ goto err_put_super;
+ }
+ goto out;
+ }
+
+ sb->s_blocksize = block_bytes(c);
+ sb->s_blocksize_bits = ilog2(block_bytes(c));
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_op = &bch_super_operations;
+ sb->s_export_op = &bch_export_ops;
+#ifdef CONFIG_BCACHEFS_QUOTA
+ sb->s_qcop = &bch2_quotactl_operations;
+ sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
+#endif
+ sb->s_xattr = bch2_xattr_handlers;
+ sb->s_magic = BCACHEFS_STATFS_MAGIC;
+ sb->s_time_gran = c->sb.nsec_per_time_unit;
+ sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
+ sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
+ c->vfs_sb = sb;
+ strscpy(sb->s_id, c->name, sizeof(sb->s_id));
+
+ ret = super_setup_bdi(sb);
+ if (ret)
+ goto err_put_super;
+
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
+
+ for_each_online_member(ca, c, i) {
+ struct block_device *bdev = ca->disk_sb.bdev;
+
+ /* XXX: create an anonymous device for multi device filesystems */
+ sb->s_bdev = bdev;
+ sb->s_dev = bdev->bd_dev;
+ percpu_ref_put(&ca->io_ref);
+ break;
+ }
+
+ c->dev = sb->s_dev;
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+ if (c->opts.acl)
+ sb->s_flags |= SB_POSIXACL;
+#endif
+
+ sb->s_shrink->seeks = 0;
+
+ vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
+ ret = PTR_ERR_OR_ZERO(vinode);
+ if (ret) {
+ bch_err_msg(c, ret, "mounting: error getting root inode");
+ goto err_put_super;
+ }
+
+ sb->s_root = d_make_root(vinode);
+ if (!sb->s_root) {
+ bch_err(c, "error mounting: error allocating root dentry");
+ ret = -ENOMEM;
+ goto err_put_super;
+ }
+
+ sb->s_flags |= SB_ACTIVE;
+out:
+ return dget(sb->s_root);
+
+err_put_super:
+ deactivate_locked_super(sb);
+ return ERR_PTR(bch2_err_class(ret));
+}
+
+static void bch2_kill_sb(struct super_block *sb)
+{
+ struct bch_fs *c = sb->s_fs_info;
+
+ generic_shutdown_super(sb);
+ bch2_fs_free(c);
+}
+
+static struct file_system_type bcache_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "bcachefs",
+ .mount = bch2_mount,
+ .kill_sb = bch2_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+
+MODULE_ALIAS_FS("bcachefs");
+
+void bch2_vfs_exit(void)
+{
+ unregister_filesystem(&bcache_fs_type);
+ kmem_cache_destroy(bch2_inode_cache);
+}
+
+int __init bch2_vfs_init(void)
+{
+ int ret = -ENOMEM;
+
+ bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT);
+ if (!bch2_inode_cache)
+ goto err;
+
+ ret = register_filesystem(&bcache_fs_type);
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ bch2_vfs_exit();
+ return ret;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
new file mode 100644
index 000000000000..5edf1d4b9e6b
--- /dev/null
+++ b/fs/bcachefs/fs.h
@@ -0,0 +1,209 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_H
+#define _BCACHEFS_FS_H
+
+#include "inode.h"
+#include "opts.h"
+#include "str_hash.h"
+#include "quota_types.h"
+#include "two_state_shared_lock.h"
+
+#include <linux/seqlock.h>
+#include <linux/stat.h>
+
+struct bch_inode_info {
+ struct inode v;
+ struct list_head ei_vfs_inode_list;
+ unsigned long ei_flags;
+
+ struct mutex ei_update_lock;
+ u64 ei_quota_reserved;
+ unsigned long ei_last_dirtied;
+ two_state_lock_t ei_pagecache_lock;
+
+ struct mutex ei_quota_lock;
+ struct bch_qid ei_qid;
+
+ u32 ei_subvol;
+
+ /*
+ * When we've been doing nocow writes we'll need to issue flushes to the
+ * underlying block devices
+ *
+ * XXX: a device may have had a flush issued by some other codepath. It
+ * would be better to keep for each device a sequence number that's
+ * incremented when we issue a cache flush, and track here the sequence
+ * number that needs flushing.
+ */
+ struct bch_devs_mask ei_devs_need_flush;
+
+ /* copy of inode in btree: */
+ struct bch_inode_unpacked ei_inode;
+};
+
+#define bch2_pagecache_add_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 0)
+#define bch2_pagecache_add_tryget(i) bch2_two_state_trylock(&i->ei_pagecache_lock, 0)
+#define bch2_pagecache_add_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 0)
+
+#define bch2_pagecache_block_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 1)
+#define bch2_pagecache_block_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 1)
+
+static inline subvol_inum inode_inum(struct bch_inode_info *inode)
+{
+ return (subvol_inum) {
+ .subvol = inode->ei_subvol,
+ .inum = inode->ei_inode.bi_inum,
+ };
+}
+
+/*
+ * Set if we've gotten a btree error for this inode, and thus the vfs inode and
+ * btree inode may be inconsistent:
+ */
+#define EI_INODE_ERROR 0
+
+/*
+ * Set if the inode is in a snapshot subvolume - we don't do quota accounting in
+ * those:
+ */
+#define EI_INODE_SNAPSHOT 1
+
+#define to_bch_ei(_inode) \
+ container_of_or_null(_inode, struct bch_inode_info, v)
+
+static inline int ptrcmp(void *l, void *r)
+{
+ return cmp_int(l, r);
+}
+
+enum bch_inode_lock_op {
+ INODE_LOCK = (1U << 0),
+ INODE_PAGECACHE_BLOCK = (1U << 1),
+ INODE_UPDATE_LOCK = (1U << 2),
+};
+
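+/*
+ * Lock (and unlock) several inodes at once, e.g. for rename: sorting the
+ * pointers with bubble_sort() gives every caller the same lock order, and
+ * duplicate pointers are skipped so the same inode may be passed twice.
+ */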
+#define bch2_lock_inodes(_locks, ...) \
+do { \
+ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \
+ unsigned i; \
+ \
+ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \
+ \
+ for (i = 1; i < ARRAY_SIZE(a); i++) \
+ if (a[i] != a[i - 1]) { \
+ if ((_locks) & INODE_LOCK) \
+ down_write_nested(&a[i]->v.i_rwsem, i); \
+ if ((_locks) & INODE_PAGECACHE_BLOCK) \
+ bch2_pagecache_block_get(a[i]);\
+ if ((_locks) & INODE_UPDATE_LOCK) \
+ mutex_lock_nested(&a[i]->ei_update_lock, i);\
+ } \
+} while (0)
+
+#define bch2_unlock_inodes(_locks, ...) \
+do { \
+ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \
+ unsigned i; \
+ \
+ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \
+ \
+ for (i = 1; i < ARRAY_SIZE(a); i++) \
+ if (a[i] != a[i - 1]) { \
+ if ((_locks) & INODE_LOCK) \
+ up_write(&a[i]->v.i_rwsem); \
+ if ((_locks) & INODE_PAGECACHE_BLOCK) \
+ bch2_pagecache_block_put(a[i]);\
+ if ((_locks) & INODE_UPDATE_LOCK) \
+ mutex_unlock(&a[i]->ei_update_lock); \
+ } \
+} while (0)
+
+static inline struct bch_inode_info *file_bch_inode(struct file *file)
+{
+ return to_bch_ei(file_inode(file));
+}
+
+static inline bool inode_attr_changing(struct bch_inode_info *dir,
+ struct bch_inode_info *inode,
+ enum inode_opt_id id)
+{
+ return !(inode->ei_inode.bi_fields_set & (1 << id)) &&
+ bch2_inode_opt_get(&dir->ei_inode, id) !=
+ bch2_inode_opt_get(&inode->ei_inode, id);
+}
+
+static inline bool inode_attrs_changing(struct bch_inode_info *dir,
+ struct bch_inode_info *inode)
+{
+ unsigned id;
+
+ for (id = 0; id < Inode_opt_nr; id++)
+ if (inode_attr_changing(dir, inode, id))
+ return true;
+
+ return false;
+}
+
+struct bch_inode_unpacked;
+
+#ifndef NO_BCACHEFS_FS
+
+struct bch_inode_info *
+__bch2_create(struct mnt_idmap *, struct bch_inode_info *,
+ struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
+
+int bch2_fs_quota_transfer(struct bch_fs *,
+ struct bch_inode_info *,
+ struct bch_qid,
+ unsigned,
+ enum quota_acct_mode);
+
+static inline int bch2_set_projid(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ u32 projid)
+{
+ struct bch_qid qid = inode->ei_qid;
+
+ qid.q[QTYP_PRJ] = projid;
+
+ return bch2_fs_quota_transfer(c, inode, qid,
+ 1 << QTYP_PRJ,
+ KEY_TYPE_QUOTA_PREALLOC);
+}
+
+struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum);
+
+/* returns 0 if we want to do the update, or the error is passed up */
+typedef int (*inode_set_fn)(struct btree_trans *,
+ struct bch_inode_info *,
+ struct bch_inode_unpacked *, void *);
+
+void bch2_inode_update_after_write(struct btree_trans *,
+ struct bch_inode_info *,
+ struct bch_inode_unpacked *,
+ unsigned);
+int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
+ inode_set_fn, void *, unsigned);
+
+int bch2_setattr_nonsize(struct mnt_idmap *,
+ struct bch_inode_info *,
+ struct iattr *);
+int __bch2_unlink(struct inode *, struct dentry *, bool);
+
+void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);
+
+void bch2_vfs_exit(void);
+int bch2_vfs_init(void);
+
+#else
+
+#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); })
+
+static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
+ snapshot_id_list *s) {}
+static inline void bch2_vfs_exit(void) {}
+static inline int bch2_vfs_init(void) { return 0; }
+
+#endif /* NO_BCACHEFS_FS */
+
+#endif /* _BCACHEFS_FS_H */
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
new file mode 100644
index 000000000000..e0c5cd119acc
--- /dev/null
+++ b/fs/bcachefs/fsck.c
@@ -0,0 +1,2490 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_cache.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "darray.h"
+#include "dirent.h"
+#include "error.h"
+#include "fs-common.h"
+#include "fsck.h"
+#include "inode.h"
+#include "keylist.h"
+#include "recovery.h"
+#include "snapshot.h"
+#include "super.h"
+#include "xattr.h"
+
+#include <linux/bsearch.h>
+#include <linux/dcache.h> /* struct qstr */
+
+#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+
+/*
+ * XXX: this is handling transaction restarts without returning
+ * -BCH_ERR_transaction_restart_nested; this is not how we do things anymore:
+ */
+static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
+ u32 snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 sectors = 0;
+ int ret;
+
+ for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ SPOS(inum, 0, snapshot),
+ POS(inum, U64_MAX),
+ 0, k, ret)
+ if (bkey_extent_is_allocation(k.k))
+ sectors += k.k->size;
+
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret ?: sectors;
+}
+
+static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
+ u32 snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent d;
+ u64 subdirs = 0;
+ int ret;
+
+ for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
+ SPOS(inum, 0, snapshot),
+ POS(inum, U64_MAX),
+ 0, k, ret) {
+ if (k.k->type != KEY_TYPE_dirent)
+ continue;
+
+ d = bkey_s_c_to_dirent(k);
+ if (d.v->d_type == DT_DIR)
+ subdirs++;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret ?: subdirs;
+}
+
+static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
+ u32 *subvol)
+{
+ struct bch_snapshot s;
+ int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots,
+ POS(0, snapshot), 0,
+ snapshot, &s);
+ if (!ret)
+ *subvol = le32_to_cpu(s.subvol);
+ else if (bch2_err_matches(ret, ENOENT))
+ bch_err(trans->c, "snapshot %u not found", snapshot);
+ return ret;
+}
+
+static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
+ u32 *snapshot, u64 *inum)
+{
+ struct bch_subvolume s;
+ int ret;
+
+ ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
+
+ *snapshot = le32_to_cpu(s.snapshot);
+ *inum = le64_to_cpu(s.inode);
+ return ret;
+}
+
+static int subvol_lookup(struct btree_trans *trans, u32 subvol,
+ u32 *snapshot, u64 *inum)
+{
+ return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
+}
+
+static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+ POS(0, inode_nr),
+ BTREE_ITER_ALL_SNAPSHOTS);
+ k = bch2_btree_iter_peek(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) {
+ ret = -BCH_ERR_ENOENT_inode;
+ goto err;
+ }
+
+ ret = bch2_inode_unpack(k, inode);
+err:
+ bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
+ struct bch_inode_unpacked *inode,
+ u32 *snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, inode_nr, *snapshot), 0);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ ret = bkey_is_inode(k.k)
+ ? bch2_inode_unpack(k, inode)
+ : -BCH_ERR_ENOENT_inode;
+ if (!ret)
+ *snapshot = iter.pos.snapshot;
+err:
+ bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
+ struct bch_inode_unpacked *inode,
+ u32 *snapshot)
+{
+ return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
+}
+
+static int __lookup_dirent(struct btree_trans *trans,
+ struct bch_hash_info hash_info,
+ subvol_inum dir, struct qstr *name,
+ u64 *target, unsigned *type)
+{
+ struct btree_iter iter;
+ struct bkey_s_c_dirent d;
+ int ret;
+
+ ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
+ &hash_info, dir, name, 0);
+ if (ret)
+ return ret;
+
+ d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
+ *target = le64_to_cpu(d.v->d_inum);
+ *type = d.v->d_type;
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}
+
+static int __write_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
+{
+ struct bkey_inode_buf *inode_p =
+ bch2_trans_kmalloc(trans, sizeof(*inode_p));
+
+ if (IS_ERR(inode_p))
+ return PTR_ERR(inode_p);
+
+ bch2_inode_pack(inode_p, inode);
+ inode_p->inode.k.p.snapshot = snapshot;
+
+ return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
+ &inode_p->inode.k_i,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+}
+
+static int fsck_write_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
+{
+ int ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ __write_inode(trans, inode, snapshot));
+ if (ret)
+ bch_err_fn(trans->c, ret);
+ return ret;
+}
+
+static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bch_inode_unpacked dir_inode;
+ struct bch_hash_info dir_hash_info;
+ int ret;
+
+ ret = lookup_first_inode(trans, pos.inode, &dir_inode);
+ if (ret)
+ goto err;
+
+ dir_hash_info = bch2_hash_info_init(c, &dir_inode);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
+
+ ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ &dir_hash_info, &iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+/* Get lost+found, create if it doesn't exist: */
+static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
+ struct bch_inode_unpacked *lostfound)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked root;
+ struct bch_hash_info root_hash_info;
+ struct qstr lostfound_str = QSTR("lost+found");
+ subvol_inum root_inum = { .subvol = subvol };
+ u64 inum = 0;
+ unsigned d_type = 0;
+ u32 snapshot;
+ int ret;
+
+ ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
+ if (ret)
+ return ret;
+
+ ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot);
+ if (ret)
+ return ret;
+
+ root_hash_info = bch2_hash_info_init(c, &root);
+
+ ret = __lookup_dirent(trans, root_hash_info, root_inum,
+ &lostfound_str, &inum, &d_type);
+ if (bch2_err_matches(ret, ENOENT)) {
+ bch_notice(c, "creating lost+found");
+ goto create_lostfound;
+ }
+
+ bch_err_fn(c, ret);
+ if (ret)
+ return ret;
+
+ if (d_type != DT_DIR) {
+ bch_err(c, "error looking up lost+found: not a directory");
+ return -BCH_ERR_ENOENT_not_directory;
+ }
+
+ /*
+ * The bch2_check_dirents pass has already run, so dangling dirents
+ * shouldn't exist here:
+ */
+ return __lookup_inode(trans, inum, lostfound, &snapshot);
+
+create_lostfound:
+ bch2_inode_init_early(c, lostfound);
+
+ ret = bch2_create_trans(trans, root_inum, &root,
+ lostfound, &lostfound_str,
+ 0, 0, S_IFDIR|0700, 0, NULL, NULL,
+ (subvol_inum) { }, 0);
+ bch_err_msg(c, ret, "creating lost+found");
+ return ret;
+}
+
+static int __reattach_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 inode_snapshot)
+{
+ struct bch_hash_info dir_hash;
+ struct bch_inode_unpacked lostfound;
+ char name_buf[20];
+ struct qstr name;
+ u64 dir_offset = 0;
+ u32 subvol;
+ int ret;
+
+ ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
+ if (ret)
+ return ret;
+
+ ret = lookup_lostfound(trans, subvol, &lostfound);
+ if (ret)
+ return ret;
+
+ if (S_ISDIR(inode->bi_mode)) {
+ lostfound.bi_nlink++;
+
+ ret = __write_inode(trans, &lostfound, U32_MAX);
+ if (ret)
+ return ret;
+ }
+
+ dir_hash = bch2_hash_info_init(trans->c, &lostfound);
+
+ snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
+ name = (struct qstr) QSTR(name_buf);
+
+ ret = bch2_dirent_create(trans,
+ (subvol_inum) {
+ .subvol = subvol,
+ .inum = lostfound.bi_inum,
+ },
+ &dir_hash,
+ inode_d_type(inode),
+ &name, inode->bi_inum, &dir_offset,
+ BCH_HASH_SET_MUST_CREATE);
+ if (ret)
+ return ret;
+
+ inode->bi_dir = lostfound.bi_inum;
+ inode->bi_dir_offset = dir_offset;
+
+ return __write_inode(trans, inode, inode_snapshot);
+}
+
+static int reattach_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 inode_snapshot)
+{
+ int ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ __reattach_inode(trans, inode, inode_snapshot));
+ bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum);
+ return ret;
+}
+
+static int remove_backpointer(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ struct bkey_s_c_dirent d;
+ int ret;
+
+ d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
+ POS(inode->bi_dir, inode->bi_dir_offset), 0,
+ dirent);
+ ret = bkey_err(d) ?:
+ __remove_dirent(trans, d.k->p);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
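+/*
+ * While walking keys in btree order, fsck needs to know which snapshots it has
+ * already seen a key in at the current position; snapshots_seen records that
+ * list, keeping both the raw snapshot ID and its equivalence class.
+ */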
+struct snapshots_seen_entry {
+ u32 id;
+ u32 equiv;
+};
+
+struct snapshots_seen {
+ struct bpos pos;
+ DARRAY(struct snapshots_seen_entry) ids;
+};
+
+static inline void snapshots_seen_exit(struct snapshots_seen *s)
+{
+ darray_exit(&s->ids);
+}
+
+static inline void snapshots_seen_init(struct snapshots_seen *s)
+{
+ memset(s, 0, sizeof(*s));
+}
+
+static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
+{
+ struct snapshots_seen_entry *i, n = {
+ .id = id,
+ .equiv = bch2_snapshot_equiv(c, id),
+ };
+ int ret = 0;
+
+ darray_for_each(s->ids, i) {
+ if (i->id == id)
+ return 0;
+ if (i->id > id)
+ break;
+ }
+
+ ret = darray_insert_item(&s->ids, i - s->ids.data, n);
+ if (ret)
+ bch_err(c, "error reallocating snapshots_seen table (size %zu)",
+ s->ids.size);
+ return ret;
+}
+
+static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
+ enum btree_id btree_id, struct bpos pos)
+{
+ struct snapshots_seen_entry *i, n = {
+ .id = pos.snapshot,
+ .equiv = bch2_snapshot_equiv(c, pos.snapshot),
+ };
+ int ret = 0;
+
+ if (!bkey_eq(s->pos, pos))
+ s->ids.nr = 0;
+
+ s->pos = pos;
+ s->pos.snapshot = n.equiv;
+
+ darray_for_each(s->ids, i) {
+ if (i->id == n.id)
+ return 0;
+
+ /*
+ * We don't currently rigorously track whether snapshot cleanup needs
+ * to be run, so it shouldn't be a fsck error yet:
+ */
+ if (i->equiv == n.equiv) {
+ bch_err(c, "snapshot deletion did not finish:\n"
+ " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
+ bch2_btree_id_str(btree_id),
+ pos.inode, pos.offset,
+ i->id, n.id, n.equiv);
+ set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
+ return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
+ }
+ }
+
+ ret = darray_push(&s->ids, n);
+ if (ret)
+ bch_err(c, "error reallocating snapshots_seen table (size %zu)",
+ s->ids.size);
+ return ret;
+}
+
+/**
+ * key_visible_in_snapshot - returns true if @id is a descendant of @ancestor,
+ * and @ancestor hasn't been overwritten in @seen
+ *
+ * @c: filesystem handle
+ * @seen: list of snapshot ids already seen at current position
+ * @id: descendant snapshot id
+ * @ancestor: ancestor snapshot id
+ *
+ * Returns: whether key in @ancestor snapshot is visible in @id snapshot
+ */
+static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
+ u32 id, u32 ancestor)
+{
+ ssize_t i;
+
+ EBUG_ON(id > ancestor);
+ EBUG_ON(!bch2_snapshot_is_equiv(c, id));
+ EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor));
+
+ /* @ancestor should be the snapshot most recently added to @seen */
+ EBUG_ON(ancestor != seen->pos.snapshot);
+ EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv);
+
+ if (id == ancestor)
+ return true;
+
+ if (!bch2_snapshot_is_ancestor(c, id, ancestor))
+ return false;
+
+ /*
+ * We know that @id is a descendant of @ancestor; we're checking if
+ * we've seen a key that overwrote @ancestor - i.e. one that is also a
+ * descendant of @ancestor and has @id as a descendant.
+ *
+ * But we already know that we're scanning IDs between @id and @ancestor
+ * numerically, since snapshot ID lists are kept sorted, so if we find
+ * an id that's an ancestor of @id we're done:
+ */
+
+ for (i = seen->ids.nr - 2;
+ i >= 0 && seen->ids.data[i].equiv >= id;
+ --i)
+ if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv))
+ return false;
+
+ return true;
+}
+
+/**
+ * ref_visible - given a key with snapshot id @src that points to a key with
+ * snapshot id @dst, test whether there is some snapshot in which @dst is
+ * visible.
+ *
+ * @c: filesystem handle
+ * @s: list of snapshot IDs already seen at @src
+ * @src: snapshot ID of src key
+ * @dst: snapshot ID of dst key
+ * Returns: true if there is some snapshot in which @dst is visible
+ *
+ * Assumes we're visiting @src keys in natural key order
+ */
+static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
+ u32 src, u32 dst)
+{
+ return dst <= src
+ ? key_visible_in_snapshot(c, s, dst, src)
+ : bch2_snapshot_is_ancestor(c, src, dst);
+}
+
+static int ref_visible2(struct bch_fs *c,
+ u32 src, struct snapshots_seen *src_seen,
+ u32 dst, struct snapshots_seen *dst_seen)
+{
+ src = bch2_snapshot_equiv(c, src);
+ dst = bch2_snapshot_equiv(c, dst);
+
+ if (dst > src) {
+ swap(dst, src);
+ swap(dst_seen, src_seen);
+ }
+ return key_visible_in_snapshot(c, src_seen, dst, src);
+}
+
+#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \
+ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \
+ (_i)->snapshot <= (_snapshot); _i++) \
+ if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
+
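+/*
+ * inode_walker caches every snapshot version of the inode whose keys are
+ * currently being walked, so per-snapshot state (e.g. the count field) can be
+ * accumulated as extents or dirents are scanned.
+ */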
+struct inode_walker_entry {
+ struct bch_inode_unpacked inode;
+ u32 snapshot;
+ bool seen_this_pos;
+ u64 count;
+};
+
+struct inode_walker {
+ bool first_this_inode;
+ bool recalculate_sums;
+ struct bpos last_pos;
+
+ DARRAY(struct inode_walker_entry) inodes;
+};
+
+static void inode_walker_exit(struct inode_walker *w)
+{
+ darray_exit(&w->inodes);
+}
+
+static struct inode_walker inode_walker_init(void)
+{
+ return (struct inode_walker) { 0, };
+}
+
+static int add_inode(struct bch_fs *c, struct inode_walker *w,
+ struct bkey_s_c inode)
+{
+ struct bch_inode_unpacked u;
+
+ BUG_ON(bch2_inode_unpack(inode, &u));
+
+ return darray_push(&w->inodes, ((struct inode_walker_entry) {
+ .inode = u,
+ .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot),
+ }));
+}
+
+static int get_inodes_all_snapshots(struct btree_trans *trans,
+ struct inode_walker *w, u64 inum)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u32 restart_count = trans->restart_count;
+ int ret;
+
+ w->recalculate_sums = false;
+ w->inodes.nr = 0;
+
+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (k.k->p.offset != inum)
+ break;
+
+ if (bkey_is_inode(k.k))
+ add_inode(c, w, k);
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ return ret;
+
+ w->first_this_inode = true;
+
+ return trans_was_restarted(trans, restart_count);
+}
+
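+/*
+ * Find the inode_walker entry matching a key's snapshot: the match is the
+ * closest ancestor snapshot we have an inode for. If the key is in a more
+ * specific snapshot than the inode (and isn't a whiteout), a new entry is
+ * inserted for the key's snapshot.
+ */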
+static struct inode_walker_entry *
+lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w,
+ u32 snapshot, bool is_whiteout)
+{
+ struct inode_walker_entry *i;
+
+ snapshot = bch2_snapshot_equiv(c, snapshot);
+
+ darray_for_each(w->inodes, i)
+ if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
+ goto found;
+
+ return NULL;
+found:
+ BUG_ON(snapshot > i->snapshot);
+
+ if (snapshot != i->snapshot && !is_whiteout) {
+ struct inode_walker_entry new = *i;
+ size_t pos;
+ int ret;
+
+ new.snapshot = snapshot;
+ new.count = 0;
+
+ bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u",
+ w->last_pos.inode, snapshot, i->snapshot);
+
+ while (i > w->inodes.data && i[-1].snapshot > snapshot)
+ --i;
+
+ pos = i - w->inodes.data;
+ ret = darray_insert_item(&w->inodes, pos, new);
+ if (ret)
+ return ERR_PTR(ret);
+
+ i = w->inodes.data + pos;
+ }
+
+ return i;
+}
+
+static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
+ struct inode_walker *w, struct bpos pos,
+ bool is_whiteout)
+{
+ if (w->last_pos.inode != pos.inode) {
+ int ret = get_inodes_all_snapshots(trans, w, pos.inode);
+ if (ret)
+ return ERR_PTR(ret);
+ } else if (bkey_cmp(w->last_pos, pos)) {
+ struct inode_walker_entry *i;
+
+ darray_for_each(w->inodes, i)
+ i->seen_this_pos = false;
+ }
+
+ w->last_pos = pos;
+
+ return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout);
+}
+
+static int __get_visible_inodes(struct btree_trans *trans,
+ struct inode_walker *w,
+ struct snapshots_seen *s,
+ u64 inum)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ w->inodes.nr = 0;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+ if (k.k->p.offset != inum)
+ break;
+
+ if (!ref_visible(c, s, s->pos.snapshot, equiv))
+ continue;
+
+ if (bkey_is_inode(k.k))
+ add_inode(c, w, k);
+
+ if (equiv >= s->pos.snapshot)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+static int check_key_has_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
+ bkey_in_missing_snapshot,
+ "key in missing snapshot: %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ ret = bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
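+/*
+ * A hashed key (dirent/xattr) was found at an offset that doesn't match its
+ * hash: delete it in place and re-insert it through the normal hash table
+ * insert path so it lands in the correct slot.
+ */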
+static int hash_redo_key(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct btree_iter *k_iter, struct bkey_s_c k)
+{
+ struct bkey_i *delete;
+ struct bkey_i *tmp;
+
+ delete = bch2_trans_kmalloc(trans, sizeof(*delete));
+ if (IS_ERR(delete))
+ return PTR_ERR(delete);
+
+ tmp = bch2_bkey_make_mut_noupdate(trans, k);
+ if (IS_ERR(tmp))
+ return PTR_ERR(tmp);
+
+ bkey_init(&delete->k);
+ delete->k.p = k_iter->pos;
+ return bch2_btree_iter_traverse(k_iter) ?:
+ bch2_trans_update(trans, k_iter, delete, 0) ?:
+ bch2_hash_set_snapshot(trans, desc, hash_info,
+ (subvol_inum) { 0, k.k->p.inode },
+ k.k->p.snapshot, tmp,
+ BCH_HASH_SET_MUST_CREATE,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW);
+}
+
+static int hash_check_key(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct btree_iter *k_iter, struct bkey_s_c hash_k)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter = { NULL };
+ struct printbuf buf = PRINTBUF;
+ struct bkey_s_c k;
+ u64 hash;
+ int ret = 0;
+
+ if (hash_k.k->type != desc.key_type)
+ return 0;
+
+ hash = desc.hash_bkey(hash_info, hash_k);
+
+ if (likely(hash == hash_k.k->p.offset))
+ return 0;
+
+ if (hash_k.k->p.offset < hash)
+ goto bad_hash;
+
+ for_each_btree_key_norestart(trans, iter, desc.btree_id,
+ SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
+ BTREE_ITER_SLOTS, k, ret) {
+ if (bkey_eq(k.k->p, hash_k.k->p))
+ break;
+
+ if (fsck_err_on(k.k->type == desc.key_type &&
+ !desc.cmp_bkey(k, hash_k), c,
+ hash_table_key_duplicate,
+ "duplicate hash table keys:\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, hash_k),
+ buf.buf))) {
+ ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
+ break;
+ }
+
+ if (bkey_deleted(k.k)) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto bad_hash;
+ }
+ }
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+bad_hash:
+ if (fsck_err(c, hash_table_key_wrong_offset,
+ "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
+ bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
+ ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
+ bch_err_fn(c, ret);
+ if (ret)
+ return ret;
+ ret = -BCH_ERR_transaction_restart_nested;
+ }
+fsck_err:
+ goto out;
+}
+
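+/*
+ * Check an individual inode: verify its snapshot exists, check consistency
+ * with other snapshot versions of the same inode, and handle inodes that were
+ * left unlinked or with i_size/i_sectors dirty by an unclean shutdown:
+ */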
+static int check_inode(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bch_inode_unpacked *prev,
+ struct snapshots_seen *s,
+ bool full)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked u;
+ bool do_update = false;
+ int ret;
+
+ ret = check_key_has_snapshot(trans, iter, k);
+ if (ret < 0)
+ goto err;
+ if (ret)
+ return 0;
+
+ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
+ if (ret)
+ goto err;
+
+ if (!bkey_is_inode(k.k))
+ return 0;
+
+ BUG_ON(bch2_inode_unpack(k, &u));
+
+ if (!full &&
+ !(u.bi_flags & (BCH_INODE_i_size_dirty|
+ BCH_INODE_i_sectors_dirty|
+ BCH_INODE_unlinked)))
+ return 0;
+
+ if (prev->bi_inum != u.bi_inum)
+ *prev = u;
+
+ if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed ||
+ inode_d_type(prev) != inode_d_type(&u),
+ c, inode_snapshot_mismatch,
+ "inodes in different snapshots don't match")) {
+ bch_err(c, "repair not implemented yet");
+ return -EINVAL;
+ }
+
+ if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
+ bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
+ struct bpos new_min_pos;
+
+ ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
+ if (ret)
+ goto err;
+
+ u.bi_flags &= ~(BCH_INODE_i_size_dirty|BCH_INODE_unlinked);
+
+ ret = __write_inode(trans, &u, iter->pos.snapshot);
+ bch_err_msg(c, ret, "in fsck updating inode");
+ if (ret)
+ return ret;
+
+ if (!bpos_eq(new_min_pos, POS_MIN))
+ bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
+ return 0;
+ }
+
+ if (u.bi_flags & BCH_INODE_unlinked &&
+ (!c->sb.clean ||
+ fsck_err(c, inode_unlinked_but_clean,
+ "filesystem marked clean, but inode %llu unlinked",
+ u.bi_inum))) {
+ bch2_trans_unlock(trans);
+ bch2_fs_lazy_rw(c);
+
+ ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
+ bch_err_msg(c, ret, "in fsck deleting inode");
+ return ret;
+ }
+
+ if (u.bi_flags & BCH_INODE_i_size_dirty &&
+ (!c->sb.clean ||
+ fsck_err(c, inode_i_size_dirty_but_clean,
+ "filesystem marked clean, but inode %llu has i_size dirty",
+ u.bi_inum))) {
+ bch_verbose(c, "truncating inode %llu", u.bi_inum);
+
+ bch2_trans_unlock(trans);
+ bch2_fs_lazy_rw(c);
+
+ /*
+ * XXX: need to truncate partial blocks too here - or ideally
+ * just switch units to bytes and that issue goes away
+ */
+ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+ SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
+ iter->pos.snapshot),
+ POS(u.bi_inum, U64_MAX),
+ 0, NULL);
+ bch_err_msg(c, ret, "in fsck truncating inode");
+ if (ret)
+ return ret;
+
+ /*
+ * We truncated without our normal sector accounting hook, just
+ * make sure we recalculate it:
+ */
+ u.bi_flags |= BCH_INODE_i_sectors_dirty;
+
+ u.bi_flags &= ~BCH_INODE_i_size_dirty;
+ do_update = true;
+ }
+
+ if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
+ (!c->sb.clean ||
+ fsck_err(c, inode_i_sectors_dirty_but_clean,
+ "filesystem marked clean, but inode %llu has i_sectors dirty",
+ u.bi_inum))) {
+ s64 sectors;
+
+ bch_verbose(c, "recounting sectors for inode %llu",
+ u.bi_inum);
+
+ sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
+ if (sectors < 0) {
+ bch_err_msg(c, sectors, "in fsck recounting inode sectors");
+ return sectors;
+ }
+
+ u.bi_sectors = sectors;
+ u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
+ do_update = true;
+ }
+
+ if (u.bi_flags & BCH_INODE_backptr_untrusted) {
+ u.bi_dir = 0;
+ u.bi_dir_offset = 0;
+ u.bi_flags &= ~BCH_INODE_backptr_untrusted;
+ do_update = true;
+ }
+
+ if (do_update) {
+ ret = __write_inode(trans, &u, iter->pos.snapshot);
+ bch_err_msg(c, ret, "in fsck updating inode");
+ if (ret)
+ return ret;
+ }
+err:
+fsck_err:
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+noinline_for_stack
+int bch2_check_inodes(struct bch_fs *c)
+{
+ bool full = c->opts.fsck;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bch_inode_unpacked prev = { 0 };
+ struct snapshots_seen s;
+ struct bkey_s_c k;
+ int ret;
+
+ snapshots_seen_init(&s);
+
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+ POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_inode(trans, &iter, k, &prev, &s, full));
+
+ snapshots_seen_exit(&s);
+ bch2_trans_put(trans);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos pos)
+{
+ return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
+}
+
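+/*
+ * Inodes and dirents are linked in both directions: a dirent points at an
+ * inode (or subvolume), and the inode's bi_dir/bi_dir_offset fields point back
+ * at the dirent that references it. These helpers check each direction:
+ */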
+static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
+ struct bkey_s_c_dirent d)
+{
+ return inode->bi_dir == d.k->p.inode &&
+ inode->bi_dir_offset == d.k->p.offset;
+}
+
+static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *inode)
+{
+ return d.v->d_type == DT_SUBVOL
+ ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
+ : le64_to_cpu(d.v->d_inum) == inode->bi_inum;
+}
+
+static int inode_backpointer_exists(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c_dirent d;
+ int ret;
+
+ d = dirent_get_by_pos(trans, &iter,
+ SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
+ ret = bkey_err(d);
+ if (ret)
+ return bch2_err_matches(ret, ENOENT) ? 0 : ret;
+
+ ret = dirent_points_to_inode(d, inode);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+{
+ struct bch_fs *c = trans->c;
+ struct inode_walker_entry *i;
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
+ s64 count2;
+
+ darray_for_each(w->inodes, i) {
+ if (i->inode.bi_sectors == i->count)
+ continue;
+
+ count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
+
+ if (w->recalculate_sums)
+ i->count = count2;
+
+ if (i->count != count2) {
+ bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
+ w->last_pos.inode, i->snapshot, i->count, count2);
+ return -BCH_ERR_internal_fsck_err;
+ }
+
+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
+ c, inode_i_sectors_wrong,
+ "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
+ w->last_pos.inode, i->snapshot,
+ i->inode.bi_sectors, i->count)) {
+ i->inode.bi_sectors = i->count;
+ ret = fsck_write_inode(trans, &i->inode, i->snapshot);
+ if (ret)
+ break;
+ }
+ }
+fsck_err:
+ bch_err_fn(c, ret);
+ return ret ?: trans_was_restarted(trans, restart_count);
+}
+
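+/*
+ * For checking for overlapping extents: as we walk an inode's extents in
+ * order, we remember the end position of the previous extent seen in each
+ * snapshot, so the next extent can be checked against it:
+ */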
+struct extent_end {
+ u32 snapshot;
+ u64 offset;
+ struct snapshots_seen seen;
+};
+
+struct extent_ends {
+ struct bpos last_pos;
+ DARRAY(struct extent_end) e;
+};
+
+static void extent_ends_reset(struct extent_ends *extent_ends)
+{
+ struct extent_end *i;
+
+ darray_for_each(extent_ends->e, i)
+ snapshots_seen_exit(&i->seen);
+
+ extent_ends->e.nr = 0;
+}
+
+static void extent_ends_exit(struct extent_ends *extent_ends)
+{
+ extent_ends_reset(extent_ends);
+ darray_exit(&extent_ends->e);
+}
+
+static void extent_ends_init(struct extent_ends *extent_ends)
+{
+ memset(extent_ends, 0, sizeof(*extent_ends));
+}
+
+static int extent_ends_at(struct bch_fs *c,
+ struct extent_ends *extent_ends,
+ struct snapshots_seen *seen,
+ struct bkey_s_c k)
+{
+ struct extent_end *i, n = (struct extent_end) {
+ .offset = k.k->p.offset,
+ .snapshot = k.k->p.snapshot,
+ .seen = *seen,
+ };
+
+ n.seen.ids.data = kmemdup(seen->ids.data,
+ sizeof(seen->ids.data[0]) * seen->ids.size,
+ GFP_KERNEL);
+ if (!n.seen.ids.data)
+ return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
+
+ darray_for_each(extent_ends->e, i) {
+ if (i->snapshot == k.k->p.snapshot) {
+ snapshots_seen_exit(&i->seen);
+ *i = n;
+ return 0;
+ }
+
+ if (i->snapshot >= k.k->p.snapshot)
+ break;
+ }
+
+ return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
+}
+
+static int overlapping_extents_found(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bpos pos1, struct snapshots_seen *pos1_seen,
+ struct bkey pos2,
+ bool *fixed,
+ struct extent_end *extent_end)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ struct btree_iter iter1, iter2 = { NULL };
+ struct bkey_s_c k1, k2;
+ int ret;
+
+ BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
+
+ bch2_trans_iter_init(trans, &iter1, btree, pos1,
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_NOT_EXTENTS);
+ k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
+ ret = bkey_err(k1);
+ if (ret)
+ goto err;
+
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k1);
+
+ if (!bpos_eq(pos1, k1.k->p)) {
+ prt_str(&buf, "\n wanted\n ");
+ bch2_bpos_to_text(&buf, pos1);
+ prt_str(&buf, "\n ");
+ bch2_bkey_to_text(&buf, &pos2);
+
+ bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
+ __func__, buf.buf);
+ ret = -BCH_ERR_internal_fsck_err;
+ goto err;
+ }
+
+ bch2_trans_copy_iter(&iter2, &iter1);
+
+ while (1) {
+ bch2_btree_iter_advance(&iter2);
+
+ k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
+ ret = bkey_err(k2);
+ if (ret)
+ goto err;
+
+ if (bpos_ge(k2.k->p, pos2.p))
+ break;
+ }
+
+ prt_str(&buf, "\n ");
+ bch2_bkey_val_to_text(&buf, c, k2);
+
+ if (bpos_gt(k2.k->p, pos2.p) ||
+ pos2.size != k2.k->size) {
+ bch_err(c, "%s: error finding second overlapping extent when repairing%s",
+ __func__, buf.buf);
+ ret = -BCH_ERR_internal_fsck_err;
+ goto err;
+ }
+
+ prt_printf(&buf, "\n overwriting %s extent",
+ pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
+
+ if (fsck_err(c, extent_overlapping,
+ "overlapping extents%s", buf.buf)) {
+ struct btree_iter *old_iter = &iter1;
+ struct disk_reservation res = { 0 };
+
+ if (pos1.snapshot < pos2.p.snapshot) {
+ old_iter = &iter2;
+ swap(k1, k2);
+ }
+
+ trans->extra_journal_res += bch2_bkey_sectors_compressed(k2);
+
+ ret = bch2_trans_update_extent_overwrite(trans, old_iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
+ k1, k2) ?:
+ bch2_trans_commit(trans, &res, NULL,
+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL);
+ bch2_disk_reservation_put(c, &res);
+
+ if (ret)
+ goto err;
+
+ *fixed = true;
+
+ if (pos1.snapshot == pos2.p.snapshot) {
+ /*
+ * We overwrote the first extent, and did the overwrite
+ * in the same snapshot:
+ */
+ extent_end->offset = bkey_start_offset(&pos2);
+ } else if (pos1.snapshot > pos2.p.snapshot) {
+ /*
+ * We overwrote the first extent in pos2's snapshot:
+ */
+ ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
+ } else {
+ /*
+ * We overwrote the second extent - restart
+ * check_extent() from the top:
+ */
+ ret = -BCH_ERR_transaction_restart_nested;
+ }
+ }
+fsck_err:
+err:
+ bch2_trans_iter_exit(trans, &iter2);
+ bch2_trans_iter_exit(trans, &iter1);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int check_overlapping_extents(struct btree_trans *trans,
+ struct snapshots_seen *seen,
+ struct extent_ends *extent_ends,
+ struct bkey_s_c k,
+ u32 equiv,
+ struct btree_iter *iter,
+ bool *fixed)
+{
+ struct bch_fs *c = trans->c;
+ struct extent_end *i;
+ int ret = 0;
+
+ /* transaction restart, running again */
+ if (bpos_eq(extent_ends->last_pos, k.k->p))
+ return 0;
+
+ if (extent_ends->last_pos.inode != k.k->p.inode)
+ extent_ends_reset(extent_ends);
+
+ darray_for_each(extent_ends->e, i) {
+ if (i->offset <= bkey_start_offset(k.k))
+ continue;
+
+ if (!ref_visible2(c,
+ k.k->p.snapshot, seen,
+ i->snapshot, &i->seen))
+ continue;
+
+ ret = overlapping_extents_found(trans, iter->btree_id,
+ SPOS(iter->pos.inode,
+ i->offset,
+ i->snapshot),
+ &i->seen,
+ *k.k, fixed, i);
+ if (ret)
+ goto err;
+ }
+
+ ret = extent_ends_at(c, extent_ends, seen, k);
+ if (ret)
+ goto err;
+
+ extent_ends->last_pos = k.k->p;
+err:
+ return ret;
+}
+
+static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_extent_crc_unpacked crc;
+ const union bch_extent_entry *i;
+ unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
+
+ bkey_for_each_crc(k.k, ptrs, crc, i)
+ if (crc_is_encoded(crc) &&
+ crc.uncompressed_size > encoded_extent_max_sectors) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ return 0;
+}
+
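+/*
+ * Check a single extent against the inode(s) it belongs to: the inode must
+ * exist and be a regular file or symlink, the extent must not overlap other
+ * extents, and extents past i_size are deleted (unless i_size is marked
+ * dirty); we also accumulate i_sectors for each inode:
+ */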
+static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct inode_walker *inode,
+ struct snapshots_seen *s,
+ struct extent_ends *extent_ends)
+{
+ struct bch_fs *c = trans->c;
+ struct inode_walker_entry *i;
+ struct printbuf buf = PRINTBUF;
+ struct bpos equiv = k.k->p;
+ int ret = 0;
+
+ equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+ ret = check_key_has_snapshot(trans, iter, k);
+ if (ret) {
+ ret = ret < 0 ? ret : 0;
+ goto out;
+ }
+
+ if (inode->last_pos.inode != k.k->p.inode) {
+ ret = check_i_sectors(trans, inode);
+ if (ret)
+ goto err;
+ }
+
+ i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout);
+ ret = PTR_ERR_OR_ZERO(i);
+ if (ret)
+ goto err;
+
+ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_whiteout) {
+ if (fsck_err_on(!i, c, extent_in_missing_inode,
+ "extent in missing inode:\n %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ goto delete;
+
+ if (fsck_err_on(i &&
+ !S_ISREG(i->inode.bi_mode) &&
+ !S_ISLNK(i->inode.bi_mode),
+ c, extent_in_non_reg_inode,
+ "extent in non regular inode mode %o:\n %s",
+ i->inode.bi_mode,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ goto delete;
+
+ ret = check_overlapping_extents(trans, s, extent_ends, k,
+ equiv.snapshot, iter,
+ &inode->recalculate_sums);
+ if (ret)
+ goto err;
+ }
+
+ /*
+ * Check inodes in reverse order, from oldest snapshots to newest,
+ * starting from the inode that matches this extent's snapshot. If we
+ * didn't have one, iterate over all inodes:
+ */
+ if (!i)
+ i = inode->inodes.data + inode->inodes.nr - 1;
+
+ for (;
+ inode->inodes.data && i >= inode->inodes.data;
+ --i) {
+ if (i->snapshot > equiv.snapshot ||
+ !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot))
+ continue;
+
+ if (k.k->type != KEY_TYPE_whiteout) {
+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
+ k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
+ !bkey_extent_is_reservation(k),
+ c, extent_past_end_of_inode,
+ "extent type past end of inode %llu:%u, i_size %llu\n %s",
+ i->inode.bi_inum, i->snapshot, i->inode.bi_size,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ struct btree_iter iter2;
+
+ bch2_trans_copy_iter(&iter2, iter);
+ bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
+ ret = bch2_btree_iter_traverse(&iter2) ?:
+ bch2_btree_delete_at(trans, &iter2,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_trans_iter_exit(trans, &iter2);
+ if (ret)
+ goto err;
+
+ iter->k.type = KEY_TYPE_whiteout;
+ }
+
+ if (bkey_extent_is_allocation(k.k))
+ i->count += k.k->size;
+ }
+
+ i->seen_this_pos = true;
+ }
+out:
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+delete:
+ ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ goto out;
+}
+
+/*
+ * Walk extents: verify that extents have a corresponding S_ISREG inode, and
+ * that i_size and i_sectors are consistent
+ */
+int bch2_check_extents(struct bch_fs *c)
+{
+ struct inode_walker w = inode_walker_init();
+ struct snapshots_seen s;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct extent_ends extent_ends;
+ struct disk_reservation res = { 0 };
+ int ret = 0;
+
+ snapshots_seen_init(&s);
+ extent_ends_init(&extent_ends);
+
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ &res, NULL,
+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
+ bch2_disk_reservation_put(c, &res);
+ check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
+ check_extent_overbig(trans, &iter, k);
+ })) ?:
+ check_i_sectors(trans, &w);
+
+ bch2_disk_reservation_put(c, &res);
+ extent_ends_exit(&extent_ends);
+ inode_walker_exit(&w);
+ snapshots_seen_exit(&s);
+ bch2_trans_put(trans);
+
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_check_indirect_extents(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct disk_reservation res = { 0 };
+ int ret = 0;
+
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
+ POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ &res, NULL,
+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
+ bch2_disk_reservation_put(c, &res);
+ check_extent_overbig(trans, &iter, k);
+ }));
+
+ bch2_disk_reservation_put(c, &res);
+ bch2_trans_put(trans);
+
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+{
+ struct bch_fs *c = trans->c;
+ struct inode_walker_entry *i;
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
+ s64 count2;
+
+ darray_for_each(w->inodes, i) {
+ if (i->inode.bi_nlink == i->count)
+ continue;
+
+ count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
+ if (count2 < 0)
+ return count2;
+
+ if (i->count != count2) {
+ bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
+ i->count, count2);
+ i->count = count2;
+ if (i->inode.bi_nlink == i->count)
+ continue;
+ }
+
+ if (fsck_err_on(i->inode.bi_nlink != i->count,
+ c, inode_dir_wrong_nlink,
+ "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
+ w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
+ i->inode.bi_nlink = i->count;
+ ret = fsck_write_inode(trans, &i->inode, i->snapshot);
+ if (ret)
+ break;
+ }
+ }
+fsck_err:
+ bch_err_fn(c, ret);
+ return ret ?: trans_was_restarted(trans, restart_count);
+}
+
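+/*
+ * Check a dirent against the inode it points to: repair the inode's
+ * bi_dir/bi_dir_offset backpointer, remove extra links to directories, fix
+ * i_nlink when an inode has links but nlink 0, and repair the dirent's d_type
+ * and d_parent_subvol fields:
+ */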
+static int check_dirent_target(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target,
+ u32 target_snapshot)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_dirent *n;
+ bool backpointer_exists = true;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (!target->bi_dir &&
+ !target->bi_dir_offset) {
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+
+ ret = __write_inode(trans, target, target_snapshot);
+ if (ret)
+ goto err;
+ }
+
+ if (!inode_points_to_dirent(target, d)) {
+ ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
+ if (ret < 0)
+ goto err;
+
+ backpointer_exists = ret;
+ ret = 0;
+
+ if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists,
+ c, inode_dir_multiple_links,
+ "directory %llu with multiple links",
+ target->bi_inum)) {
+ ret = __remove_dirent(trans, d.k->p);
+ goto out;
+ }
+
+ if (fsck_err_on(backpointer_exists && !target->bi_nlink,
+ c, inode_multiple_links_but_nlink_0,
+ "inode %llu type %s has multiple links but i_nlink 0",
+ target->bi_inum, bch2_d_types[d.v->d_type])) {
+ target->bi_nlink++;
+ target->bi_flags &= ~BCH_INODE_unlinked;
+
+ ret = __write_inode(trans, target, target_snapshot);
+ if (ret)
+ goto err;
+ }
+
+ if (fsck_err_on(!backpointer_exists,
+ c, inode_wrong_backpointer,
+ "inode %llu:%u has wrong backpointer:\n"
+ "got %llu:%llu\n"
+ "should be %llu:%llu",
+ target->bi_inum, target_snapshot,
+ target->bi_dir,
+ target->bi_dir_offset,
+ d.k->p.inode,
+ d.k->p.offset)) {
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+
+ ret = __write_inode(trans, target, target_snapshot);
+ if (ret)
+ goto err;
+ }
+ }
+
+ if (fsck_err_on(d.v->d_type != inode_d_type(target),
+ c, dirent_d_type_wrong,
+ "incorrect d_type: got %s, should be %s:\n%s",
+ bch2_d_type_str(d.v->d_type),
+ bch2_d_type_str(inode_d_type(target)),
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+ n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&n->k_i, d.s_c);
+ n->v.d_type = inode_d_type(target);
+
+ ret = bch2_trans_update(trans, iter, &n->k_i, 0);
+ if (ret)
+ goto err;
+
+ d = dirent_i_to_s_c(n);
+ }
+
+ if (d.v->d_type == DT_SUBVOL &&
+ target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
+ (c->sb.version < bcachefs_metadata_version_subvol_dirent ||
+ fsck_err(c, dirent_d_parent_subvol_wrong,
+ "dirent has wrong d_parent_subvol field: got %u, should be %u",
+ le32_to_cpu(d.v->d_parent_subvol),
+ target->bi_parent_subvol))) {
+ n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&n->k_i, d.s_c);
+ n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+
+ ret = bch2_trans_update(trans, iter, &n->k_i, 0);
+ if (ret)
+ goto err;
+
+ d = dirent_i_to_s_c(n);
+ }
+out:
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
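+/*
+ * Check a single dirent: it must point to an existing inode (or subvolume),
+ * its hash must put it at the right offset, and its d_type and the target's
+ * backpointer must be consistent with the target inode; we also count
+ * subdirectory dirents so check_subdir_count() can verify i_nlink:
+ */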
+static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bch_hash_info *hash_info,
+ struct inode_walker *dir,
+ struct inode_walker *target,
+ struct snapshots_seen *s)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_dirent d;
+ struct inode_walker_entry *i;
+ struct printbuf buf = PRINTBUF;
+ struct bpos equiv;
+ int ret = 0;
+
+ ret = check_key_has_snapshot(trans, iter, k);
+ if (ret) {
+ ret = ret < 0 ? ret : 0;
+ goto out;
+ }
+
+ equiv = k.k->p;
+ equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
+ if (ret)
+ goto err;
+
+ if (k.k->type == KEY_TYPE_whiteout)
+ goto out;
+
+ if (dir->last_pos.inode != k.k->p.inode) {
+ ret = check_subdir_count(trans, dir);
+ if (ret)
+ goto err;
+ }
+
+ BUG_ON(!iter->path->should_be_locked);
+
+ i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout);
+ ret = PTR_ERR_OR_ZERO(i);
+ if (ret < 0)
+ goto err;
+
+ if (dir->first_this_inode && dir->inodes.nr)
+ *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
+ dir->first_this_inode = false;
+
+ if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
+ "dirent in nonexistent directory:\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ goto out;
+ }
+
+ if (!i)
+ goto out;
+
+ if (fsck_err_on(!S_ISDIR(i->inode.bi_mode),
+ c, dirent_in_non_dir_inode,
+ "dirent in non directory inode type %s:\n%s",
+ bch2_d_type_str(inode_d_type(&i->inode)),
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter, 0);
+ goto out;
+ }
+
+ ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ /* dirent has been deleted */
+ ret = 0;
+ goto out;
+ }
+
+ if (k.k->type != KEY_TYPE_dirent)
+ goto out;
+
+ d = bkey_s_c_to_dirent(k);
+
+ if (d.v->d_type == DT_SUBVOL) {
+ struct bch_inode_unpacked subvol_root;
+ u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
+ u32 target_snapshot;
+ u64 target_inum;
+
+ ret = __subvol_lookup(trans, target_subvol,
+ &target_snapshot, &target_inum);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ if (fsck_err_on(ret, c, dirent_to_missing_subvol,
+ "dirent points to missing subvolume %u",
+ le32_to_cpu(d.v->d_child_subvol))) {
+ ret = __remove_dirent(trans, d.k->p);
+ goto err;
+ }
+
+ ret = __lookup_inode(trans, target_inum,
+ &subvol_root, &target_snapshot);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ if (fsck_err_on(ret, c, subvol_to_missing_root,
+ "subvolume %u points to missing subvolume root %llu",
+ target_subvol,
+ target_inum)) {
+ bch_err(c, "repair not implemented yet");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (fsck_err_on(subvol_root.bi_subvol != target_subvol,
+ c, subvol_root_wrong_bi_subvol,
+ "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
+ target_inum,
+ subvol_root.bi_subvol, target_subvol)) {
+ subvol_root.bi_subvol = target_subvol;
+ ret = __write_inode(trans, &subvol_root, target_snapshot);
+ if (ret)
+ goto err;
+ }
+
+ ret = check_dirent_target(trans, iter, d, &subvol_root,
+ target_snapshot);
+ if (ret)
+ goto err;
+ } else {
+ ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(!target->inodes.nr,
+ c, dirent_to_missing_inode,
+ "dirent points to missing inode: (equiv %u)\n%s",
+ equiv.snapshot,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf))) {
+ ret = __remove_dirent(trans, d.k->p);
+ if (ret)
+ goto err;
+ }
+
+ darray_for_each(target->inodes, i) {
+ ret = check_dirent_target(trans, iter, d,
+ &i->inode, i->snapshot);
+ if (ret)
+ goto err;
+ }
+ }
+
+ if (d.v->d_type == DT_DIR)
+ for_each_visible_inode(c, s, dir, equiv.snapshot, i)
+ i->count++;
+
+out:
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+/*
+ * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
+ * validate d_type
+ */
+int bch2_check_dirents(struct bch_fs *c)
+{
+ struct inode_walker dir = inode_walker_init();
+ struct inode_walker target = inode_walker_init();
+ struct snapshots_seen s;
+ struct bch_hash_info hash_info;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ snapshots_seen_init(&s);
+
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ k,
+ NULL, NULL,
+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s));
+
+ bch2_trans_put(trans);
+ snapshots_seen_exit(&s);
+ inode_walker_exit(&dir);
+ inode_walker_exit(&target);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
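+/*
+ * Check a single xattr: it must belong to an existing inode and be at the
+ * offset its hash dictates:
+ */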
+static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bch_hash_info *hash_info,
+ struct inode_walker *inode)
+{
+ struct bch_fs *c = trans->c;
+ struct inode_walker_entry *i;
+ int ret;
+
+ ret = check_key_has_snapshot(trans, iter, k);
+ if (ret)
+ return ret;
+
+ i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout);
+ ret = PTR_ERR_OR_ZERO(i);
+ if (ret)
+ return ret;
+
+ if (inode->first_this_inode && inode->inodes.nr)
+ *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
+ inode->first_this_inode = false;
+
+ if (fsck_err_on(!i, c, xattr_in_missing_inode,
+ "xattr for missing inode %llu",
+ k.k->p.inode))
+ return bch2_btree_delete_at(trans, iter, 0);
+
+ if (!i)
+ return 0;
+
+ ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
+fsck_err:
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+/*
+ * Walk xattrs: verify that they all have a corresponding inode
+ */
+int bch2_check_xattrs(struct bch_fs *c)
+{
+ struct inode_walker inode = inode_walker_init();
+ struct bch_hash_info hash_info;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ k,
+ NULL, NULL,
+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_xattr(trans, &iter, k, &hash_info, &inode)));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int check_root_trans(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked root_inode;
+ u32 snapshot;
+ u64 inum;
+ int ret;
+
+ ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
+ "root subvol missing")) {
+ struct bkey_i_subvolume root_subvol;
+
+ snapshot = U32_MAX;
+ inum = BCACHEFS_ROOT_INO;
+
+ bkey_subvolume_init(&root_subvol.k_i);
+ root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_subvol.v.flags = 0;
+ root_subvol.v.snapshot = cpu_to_le32(snapshot);
+ root_subvol.v.inode = cpu_to_le64(inum);
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ bch2_btree_insert_trans(trans, BTREE_ID_subvolumes,
+ &root_subvol.k_i, 0));
+ bch_err_msg(c, ret, "writing root subvol");
+ if (ret)
+ goto err;
+
+ }
+
+ ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (mustfix_fsck_err_on(ret, c, root_dir_missing,
+ "root directory missing") ||
+ mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
+ c, root_inode_not_dir,
+ "root inode not a directory")) {
+ bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
+ 0, NULL);
+ root_inode.bi_inum = inum;
+
+ ret = __write_inode(trans, &root_inode, snapshot);
+ bch_err_msg(c, ret, "writing root inode");
+ }
+err:
+fsck_err:
+ return ret;
+}
+
+/* Get root directory, create if it doesn't exist: */
+int bch2_check_root(struct bch_fs *c)
+{
+ int ret;
+
+ ret = bch2_trans_do(c, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ check_root_trans(trans));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+struct pathbuf_entry {
+ u64 inum;
+ u32 snapshot;
+};
+
+typedef DARRAY(struct pathbuf_entry) pathbuf;
+
+static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
+{
+ struct pathbuf_entry *i;
+
+ darray_for_each(*p, i)
+ if (i->inum == inum &&
+ i->snapshot == snapshot)
+ return true;
+
+ return false;
+}
+
+static int path_down(struct bch_fs *c, pathbuf *p,
+ u64 inum, u32 snapshot)
+{
+ int ret = darray_push(p, ((struct pathbuf_entry) {
+ .inum = inum,
+ .snapshot = snapshot,
+ }));
+
+ if (ret)
+ bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
+ p->size);
+ return ret;
+}
+
+/*
+ * Check that a given inode is reachable from the root:
+ *
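+ * We do this by walking each inode's bi_dir/bi_dir_offset backpointer up
+ * towards the root, remembering the inodes we pass through so that directory
+ * structure loops can be detected:
+ *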
+ * XXX: we should also be verifying that inodes are in the right subvolumes
+ */
+static int check_path(struct btree_trans *trans,
+ pathbuf *p,
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
+{
+ struct bch_fs *c = trans->c;
+ int ret = 0;
+
+ snapshot = bch2_snapshot_equiv(c, snapshot);
+ p->nr = 0;
+
+ while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
+ inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
+ struct btree_iter dirent_iter;
+ struct bkey_s_c_dirent d;
+ u32 parent_snapshot = snapshot;
+
+ if (inode->bi_subvol) {
+ u64 inum;
+
+ ret = subvol_lookup(trans, inode->bi_parent_subvol,
+ &parent_snapshot, &inum);
+ if (ret)
+ break;
+ }
+
+ ret = lockrestart_do(trans,
+ PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter,
+ SPOS(inode->bi_dir, inode->bi_dir_offset,
+ parent_snapshot))).k));
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ break;
+
+ if (!ret && !dirent_points_to_inode(d, inode)) {
+ bch2_trans_iter_exit(trans, &dirent_iter);
+ ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+ }
+
+ if (bch2_err_matches(ret, ENOENT)) {
+ if (fsck_err(c, inode_unreachable,
+ "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
+ inode->bi_inum, snapshot,
+ bch2_d_type_str(inode_d_type(inode)),
+ inode->bi_nlink,
+ inode->bi_dir,
+ inode->bi_dir_offset))
+ ret = reattach_inode(trans, inode, snapshot);
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &dirent_iter);
+
+ if (!S_ISDIR(inode->bi_mode))
+ break;
+
+ ret = path_down(c, p, inode->bi_inum, snapshot);
+ if (ret) {
+ bch_err(c, "memory allocation failure");
+ return ret;
+ }
+
+ snapshot = parent_snapshot;
+
+ ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
+ if (ret) {
+ /* Should have been caught in dirents pass */
+ bch_err(c, "error looking up parent directory: %i", ret);
+ break;
+ }
+
+ if (path_is_dup(p, inode->bi_inum, snapshot)) {
+ struct pathbuf_entry *i;
+
+ /* XXX print path */
+ bch_err(c, "directory structure loop");
+
+ darray_for_each(*p, i)
+ pr_err("%llu:%u", i->inum, i->snapshot);
+ pr_err("%llu:%u", inode->bi_inum, snapshot);
+
+ if (!fsck_err(c, dir_loop,
+ "directory structure loop"))
+ return 0;
+
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ remove_backpointer(trans, inode));
+ if (ret) {
+ bch_err(c, "error removing dirent: %i", ret);
+ break;
+ }
+
+ ret = reattach_inode(trans, inode, snapshot);
+ }
+ }
+fsck_err:
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+/*
+ * Check for unreachable inodes, as well as loops in the directory structure:
+ * After bch2_check_dirents(), if an inode's backpointer doesn't exist, that
+ * means it's unreachable:
+ */
+int bch2_check_directory_structure(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_inode_unpacked u;
+ pathbuf path = { 0, };
+ int ret;
+
+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (!bkey_is_inode(k.k))
+ continue;
+
+ ret = bch2_inode_unpack(k, &u);
+ if (ret) {
+ /* Should have been caught earlier in fsck: */
+ bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
+ break;
+ }
+
+ if (u.bi_flags & BCH_INODE_unlinked)
+ continue;
+
+ ret = check_path(trans, &path, &u, iter.pos.snapshot);
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ darray_exit(&path);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
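+/*
+ * Expected link counts, indexed by (inode number, snapshot): populated with
+ * the inodes that can have hardlinks, then filled in by counting the dirents
+ * that point at each one:
+ */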
+struct nlink_table {
+ size_t nr;
+ size_t size;
+
+ struct nlink {
+ u64 inum;
+ u32 snapshot;
+ u32 count;
+ } *d;
+};
+
+static int add_nlink(struct bch_fs *c, struct nlink_table *t,
+ u64 inum, u32 snapshot)
+{
+ if (t->nr == t->size) {
+ size_t new_size = max_t(size_t, 128UL, t->size * 2);
+ void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
+
+ if (!d) {
+ bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
+ new_size);
+ return -BCH_ERR_ENOMEM_fsck_add_nlink;
+ }
+
+ if (t->d)
+ memcpy(d, t->d, t->size * sizeof(t->d[0]));
+ kvfree(t->d);
+
+ t->d = d;
+ t->size = new_size;
+ }
+
+ t->d[t->nr++] = (struct nlink) {
+ .inum = inum,
+ .snapshot = snapshot,
+ };
+
+ return 0;
+}
+
+static int nlink_cmp(const void *_l, const void *_r)
+{
+ const struct nlink *l = _l;
+ const struct nlink *r = _r;
+
+ return cmp_int(l->inum, r->inum);
+}
+
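+/*
+ * Bump the link count for the target of a dirent: binary search for the
+ * inode's entries in the table, rewind to the first entry for that inode
+ * number, then increment each snapshot version the dirent is visible in:
+ */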
+static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
+ struct nlink_table *links,
+ u64 range_start, u64 range_end, u64 inum, u32 snapshot)
+{
+ struct nlink *link, key = {
+ .inum = inum, .snapshot = U32_MAX,
+ };
+
+ if (inum < range_start || inum >= range_end)
+ return;
+
+ link = __inline_bsearch(&key, links->d, links->nr,
+ sizeof(links->d[0]), nlink_cmp);
+ if (!link)
+ return;
+
+ while (link > links->d && link[0].inum == link[-1].inum)
+ --link;
+
+ for (; link < links->d + links->nr && link->inum == inum; link++)
+ if (ref_visible(c, s, snapshot, link->snapshot)) {
+ link->count++;
+ if (link->snapshot >= snapshot)
+ break;
+ }
+}
+
+noinline_for_stack
+static int check_nlinks_find_hardlinks(struct bch_fs *c,
+ struct nlink_table *t,
+ u64 start, u64 *end)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_inode_unpacked u;
+ int ret = 0;
+
+ for_each_btree_key(trans, iter, BTREE_ID_inodes,
+ POS(0, start),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (!bkey_is_inode(k.k))
+ continue;
+
+ /* Should never fail, checked by bch2_inode_invalid: */
+ BUG_ON(bch2_inode_unpack(k, &u));
+
+ /*
+ * Backpointer and directory structure checks are sufficient for
+ * directories, since they can't have hardlinks:
+ */
+ if (S_ISDIR(u.bi_mode))
+ continue;
+
+ if (!u.bi_nlink)
+ continue;
+
+ ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
+ if (ret) {
+ *end = k.k->p.offset;
+ ret = 0;
+ break;
+ }
+
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+
+ if (ret)
+ bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
+
+ return ret;
+}
+
+noinline_for_stack
+static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
+ u64 range_start, u64 range_end)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct snapshots_seen s;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent d;
+ int ret;
+
+ snapshots_seen_init(&s);
+
+ for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
+ if (ret)
+ break;
+
+ switch (k.k->type) {
+ case KEY_TYPE_dirent:
+ d = bkey_s_c_to_dirent(k);
+
+ if (d.v->d_type != DT_DIR &&
+ d.v->d_type != DT_SUBVOL)
+ inc_link(c, &s, links, range_start, range_end,
+ le64_to_cpu(d.v->d_inum),
+ bch2_snapshot_equiv(c, d.k->p.snapshot));
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
+
+ bch2_trans_put(trans);
+ snapshots_seen_exit(&s);
+ return ret;
+}
+
+static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct nlink_table *links,
+ size_t *idx, u64 range_end)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked u;
+ struct nlink *link = &links->d[*idx];
+ int ret = 0;
+
+ if (k.k->p.offset >= range_end)
+ return 1;
+
+ if (!bkey_is_inode(k.k))
+ return 0;
+
+ BUG_ON(bch2_inode_unpack(k, &u));
+
+ if (S_ISDIR(u.bi_mode))
+ return 0;
+
+ if (!u.bi_nlink)
+ return 0;
+
+ while ((cmp_int(link->inum, k.k->p.offset) ?:
+ cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
+ BUG_ON(*idx == links->nr);
+ link = &links->d[++*idx];
+ }
+
+ if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
+ c, inode_wrong_nlink,
+ "inode %llu type %s has wrong i_nlink (%u, should be %u)",
+ u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
+ bch2_inode_nlink_get(&u), link->count)) {
+ bch2_inode_nlink_set(&u, link->count);
+ ret = __write_inode(trans, &u, k.k->p.snapshot);
+ }
+fsck_err:
+ return ret;
+}
+
+noinline_for_stack
+static int check_nlinks_update_hardlinks(struct bch_fs *c,
+ struct nlink_table *links,
+ u64 range_start, u64 range_end)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ size_t idx = 0;
+ int ret = 0;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+ POS(0, range_start),
+ BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
+ if (ret < 0) {
+ bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+int bch2_check_nlinks(struct bch_fs *c)
+{
+ struct nlink_table links = { 0 };
+ u64 this_iter_range_start, next_iter_range_start = 0;
+ int ret = 0;
+
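+ /*
+ * The nlink table may not fit in memory all at once: process the inode
+ * number space in ranges, with check_nlinks_find_hardlinks() telling us
+ * where to resume via next_iter_range_start:
+ */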
+ do {
+ this_iter_range_start = next_iter_range_start;
+ next_iter_range_start = U64_MAX;
+
+ ret = check_nlinks_find_hardlinks(c, &links,
+ this_iter_range_start,
+ &next_iter_range_start);
+
+ ret = check_nlinks_walk_dirents(c, &links,
+ this_iter_range_start,
+ next_iter_range_start);
+ if (ret)
+ break;
+
+ ret = check_nlinks_update_hardlinks(c, &links,
+ this_iter_range_start,
+ next_iter_range_start);
+ if (ret)
+ break;
+
+ links.nr = 0;
+ } while (next_iter_range_start != U64_MAX);
+
+ kvfree(links.d);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_reflink_p p;
+ struct bkey_i_reflink_p *u;
+ int ret;
+
+ if (k.k->type != KEY_TYPE_reflink_p)
+ return 0;
+
+ p = bkey_s_c_to_reflink_p(k);
+
+ if (!p.v->front_pad && !p.v->back_pad)
+ return 0;
+
+ u = bch2_trans_kmalloc(trans, sizeof(*u));
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(&u->k_i, k);
+ u->v.front_pad = 0;
+ u->v.back_pad = 0;
+
+ return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
+}
+
+int bch2_fix_reflink_p(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
+ return 0;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_extents, POS_MIN,
+ BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ fix_reflink_p_key(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return ret;
+}
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
new file mode 100644
index 000000000000..da991e8cf27e
--- /dev/null
+++ b/fs/bcachefs/fsck.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FSCK_H
+#define _BCACHEFS_FSCK_H
+
+int bch2_check_inodes(struct bch_fs *);
+int bch2_check_extents(struct bch_fs *);
+int bch2_check_indirect_extents(struct bch_fs *);
+int bch2_check_dirents(struct bch_fs *);
+int bch2_check_xattrs(struct bch_fs *);
+int bch2_check_root(struct bch_fs *);
+int bch2_check_directory_structure(struct bch_fs *);
+int bch2_check_nlinks(struct bch_fs *);
+int bch2_fix_reflink_p(struct bch_fs *);
+
+#endif /* _BCACHEFS_FSCK_H */
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
new file mode 100644
index 000000000000..9309cfeecd8d
--- /dev/null
+++ b/fs/bcachefs/inode.c
@@ -0,0 +1,1205 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_key_cache.h"
+#include "btree_write_buffer.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "compress.h"
+#include "dirent.h"
+#include "error.h"
+#include "extents.h"
+#include "extent_update.h"
+#include "inode.h"
+#include "str_hash.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "varint.h"
+
+#include <linux/random.h>
+
+#include <asm/unaligned.h>
+
+#define x(name, ...) #name,
+const char * const bch2_inode_opts[] = {
+ BCH_INODE_OPTS()
+ NULL,
+};
+
+static const char * const bch2_inode_flag_strs[] = {
+ BCH_INODE_FLAGS()
+ NULL
+};
+#undef x
+
+static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
+
+static int inode_decode_field(const u8 *in, const u8 *end,
+ u64 out[2], unsigned *out_bits)
+{
+ __be64 be[2] = { 0, 0 };
+ unsigned bytes, shift;
+ u8 *p;
+
+ if (in >= end)
+ return -1;
+
+ if (!*in)
+ return -1;
+
+ /*
+ * position of highest set bit indicates number of bytes:
+ * shift = number of bits to remove in high byte:
+ */
+ shift = 8 - __fls(*in); /* 1 <= shift <= 8 */
+ bytes = byte_table[shift - 1];
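+ /*
+ * e.g. a first byte with the top bit (0x80) set encodes a field one
+ * byte long; a first byte of 0x01 encodes the maximum field length of
+ * 13 bytes:
+ */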
+
+ if (in + bytes > end)
+ return -1;
+
+ p = (u8 *) be + 16 - bytes;
+ memcpy(p, in, bytes);
+ *p ^= (1 << 8) >> shift;
+
+ out[0] = be64_to_cpu(be[0]);
+ out[1] = be64_to_cpu(be[1]);
+ *out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);
+
+ return bytes;
+}
+
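+/*
+ * Pack an inode into its on disk (v3) format: the fixed fields, followed by
+ * the remaining fields encoded as varints, with trailing zero fields omitted:
+ */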
+static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
+{
+ struct bkey_i_inode_v3 *k = &packed->inode;
+ u8 *out = k->v.fields;
+ u8 *end = (void *) &packed[1];
+ u8 *last_nonzero_field = out;
+ unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
+ unsigned bytes;
+ int ret;
+
+ bkey_inode_v3_init(&packed->inode.k_i);
+ packed->inode.k.p.offset = inode->bi_inum;
+ packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq);
+ packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
+ packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags);
+ packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors);
+ packed->inode.v.bi_size = cpu_to_le64(inode->bi_size);
+ packed->inode.v.bi_version = cpu_to_le64(inode->bi_version);
+ SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode);
+ SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR);
+
+#define x(_name, _bits) \
+ nr_fields++; \
+ \
+ if (inode->_name) { \
+ ret = bch2_varint_encode_fast(out, inode->_name); \
+ out += ret; \
+ \
+ if (_bits > 64) \
+ *out++ = 0; \
+ \
+ last_nonzero_field = out; \
+ last_nonzero_fieldnr = nr_fields; \
+ } else { \
+ *out++ = 0; \
+ \
+ if (_bits > 64) \
+ *out++ = 0; \
+ }
+
+ BCH_INODE_FIELDS_v3()
+#undef x
+ BUG_ON(out > end);
+
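+ /*
+ * Trim trailing fields that packed to zero - the unpack code fills in
+ * missing fields as zeroes:
+ */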
+ out = last_nonzero_field;
+ nr_fields = last_nonzero_fieldnr;
+
+ bytes = out - (u8 *) &packed->inode.v;
+ set_bkey_val_bytes(&packed->inode.k, bytes);
+ memset_u64s_tail(&packed->inode.v, 0, bytes);
+
+ SET_INODEv3_NR_FIELDS(&k->v, nr_fields);
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
+ struct bch_inode_unpacked unpacked;
+
+ ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked);
+ BUG_ON(ret);
+ BUG_ON(unpacked.bi_inum != inode->bi_inum);
+ BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed);
+ BUG_ON(unpacked.bi_sectors != inode->bi_sectors);
+ BUG_ON(unpacked.bi_size != inode->bi_size);
+ BUG_ON(unpacked.bi_version != inode->bi_version);
+ BUG_ON(unpacked.bi_mode != inode->bi_mode);
+
+#define x(_name, _bits) if (unpacked._name != inode->_name) \
+ panic("unpacked %llu should be %llu", \
+ (u64) unpacked._name, (u64) inode->_name);
+ BCH_INODE_FIELDS_v3()
+#undef x
+ }
+}
+
+void bch2_inode_pack(struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
+{
+ bch2_inode_pack_inlined(packed, inode);
+}
+
+static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
+ struct bch_inode_unpacked *unpacked)
+{
+ const u8 *in = inode.v->fields;
+ const u8 *end = bkey_val_end(inode);
+ u64 field[2];
+ unsigned fieldnr = 0, field_bits;
+ int ret;
+
+#define x(_name, _bits) \
+ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \
+ unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
+ memset((void *) unpacked + offset, 0, \
+ sizeof(*unpacked) - offset); \
+ return 0; \
+ } \
+ \
+ ret = inode_decode_field(in, end, field, &field_bits); \
+ if (ret < 0) \
+ return ret; \
+ \
+ if (field_bits > sizeof(unpacked->_name) * 8) \
+ return -1; \
+ \
+ unpacked->_name = field[1]; \
+ in += ret;
+
+ BCH_INODE_FIELDS_v2()
+#undef x
+
+ /* XXX: signal if there were more fields than expected? */
+ return 0;
+}
+
+static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
+ const u8 *in, const u8 *end,
+ unsigned nr_fields)
+{
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v[2];
+
+#define x(_name, _bits) \
+ if (fieldnr < nr_fields) { \
+ ret = bch2_varint_decode_fast(in, end, &v[0]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ \
+ if (_bits > 64) { \
+ ret = bch2_varint_decode_fast(in, end, &v[1]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v[1] = 0; \
+ } \
+ } else { \
+ v[0] = v[1] = 0; \
+ } \
+ \
+ unpacked->_name = v[0]; \
+ if (v[1] || v[0] != unpacked->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_INODE_FIELDS_v2()
+#undef x
+
+ /* XXX: signal if there were more fields than expected? */
+ return 0;
+}
+
+static int bch2_inode_unpack_v3(struct bkey_s_c k,
+ struct bch_inode_unpacked *unpacked)
+{
+ struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
+ const u8 *in = inode.v->fields;
+ const u8 *end = bkey_val_end(inode);
+ unsigned nr_fields = INODEv3_NR_FIELDS(inode.v);
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v[2];
+
+ unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
+ unpacked->bi_hash_seed = inode.v->bi_hash_seed;
+ unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
+ unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors);
+ unpacked->bi_size = le64_to_cpu(inode.v->bi_size);
+ unpacked->bi_version = le64_to_cpu(inode.v->bi_version);
+ unpacked->bi_mode = INODEv3_MODE(inode.v);
+
+#define x(_name, _bits) \
+ if (fieldnr < nr_fields) { \
+ ret = bch2_varint_decode_fast(in, end, &v[0]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ \
+ if (_bits > 64) { \
+ ret = bch2_varint_decode_fast(in, end, &v[1]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v[1] = 0; \
+ } \
+ } else { \
+ v[0] = v[1] = 0; \
+ } \
+ \
+ unpacked->_name = v[0]; \
+ if (v[1] || v[0] != unpacked->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_INODE_FIELDS_v3()
+#undef x
+
+ /* XXX: signal if there were more fields than expected? */
+ return 0;
+}
+
+static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
+ struct bch_inode_unpacked *unpacked)
+{
+ memset(unpacked, 0, sizeof(*unpacked));
+
+ switch (k.k->type) {
+ case KEY_TYPE_inode: {
+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+
+ unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_journal_seq= 0;
+ unpacked->bi_hash_seed = inode.v->bi_hash_seed;
+ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
+ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
+
+ if (INODE_NEW_VARINT(inode.v)) {
+ return bch2_inode_unpack_v2(unpacked, inode.v->fields,
+ bkey_val_end(inode),
+ INODE_NR_FIELDS(inode.v));
+ } else {
+ return bch2_inode_unpack_v1(inode, unpacked);
+ }
+ break;
+ }
+ case KEY_TYPE_inode_v2: {
+ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
+
+ unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
+ unpacked->bi_hash_seed = inode.v->bi_hash_seed;
+ unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
+ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
+
+ return bch2_inode_unpack_v2(unpacked, inode.v->fields,
+ bkey_val_end(inode),
+ INODEv2_NR_FIELDS(inode.v));
+ }
+ default:
+ BUG();
+ }
+}
+
+int bch2_inode_unpack(struct bkey_s_c k,
+ struct bch_inode_unpacked *unpacked)
+{
+ if (likely(k.k->type == KEY_TYPE_inode_v3))
+ return bch2_inode_unpack_v3(k, unpacked);
+ return bch2_inode_unpack_slowpath(k, unpacked);
+}
+
+static int bch2_inode_peek_nowarn(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ subvol_inum inum, unsigned flags)
+{
+ struct bkey_s_c k;
+ u32 snapshot;
+ int ret;
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
+ SPOS(0, inum.inum, snapshot),
+ flags|BTREE_ITER_CACHED);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
+ if (ret)
+ goto err;
+
+ ret = bch2_inode_unpack(k, inode);
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
+}
+
+int bch2_inode_peek(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ subvol_inum inum, unsigned flags)
+{
+ int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
+ bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum);
+ return ret;
+}
+
+int bch2_inode_write_flags(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ enum btree_update_flags flags)
+{
+ struct bkey_inode_buf *inode_p;
+
+ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
+ if (IS_ERR(inode_p))
+ return PTR_ERR(inode_p);
+
+ bch2_inode_pack_inlined(inode_p, inode);
+ inode_p->inode.k.p.snapshot = iter->snapshot;
+ return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
+}
+
+struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
+{
+ struct bch_inode_unpacked u;
+ struct bkey_inode_buf *inode_p;
+ int ret;
+
+ if (!bkey_is_inode(&k->k))
+ return ERR_PTR(-ENOENT);
+
+ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
+ if (IS_ERR(inode_p))
+ return ERR_CAST(inode_p);
+
+ ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u);
+ if (ret)
+ return ERR_PTR(ret);
+
+ bch2_inode_pack(inode_p, &u);
+ return &inode_p->inode.k_i;
+}
+
+static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err)
+{
+ struct bch_inode_unpacked unpacked;
+ int ret = 0;
+
+ bkey_fsck_err_on(k.k->p.inode, c, err,
+ inode_pos_inode_nonzero,
+ "nonzero k.p.inode");
+
+ bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err,
+ inode_pos_blockdev_range,
+ "fs inode in blockdev range");
+
+ bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err,
+ inode_unpack_error,
+ "invalid variable length fields");
+
+ bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err,
+ inode_checksum_type_invalid,
+ "invalid data checksum type (%u >= %u)",
+ unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
+
+ bkey_fsck_err_on(unpacked.bi_compression &&
+ !bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err,
+ inode_compression_type_invalid,
+ "invalid compression opt %u", unpacked.bi_compression - 1);
+
+ bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) &&
+ unpacked.bi_nlink != 0, c, err,
+ inode_unlinked_but_nlink_nonzero,
+ "flagged as unlinked but bi_nlink != 0");
+
+ bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err,
+ inode_subvol_root_but_not_dir,
+ "subvolume root but not a directory");
+fsck_err:
+ return ret;
+}
+
+int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+ int ret = 0;
+
+ bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
+ inode_str_hash_invalid,
+ "invalid str hash type (%llu >= %u)",
+ INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
+
+ ret = __bch2_inode_invalid(c, k, err);
+fsck_err:
+ return ret;
+}
+
+int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
+ int ret = 0;
+
+ bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
+ inode_str_hash_invalid,
+ "invalid str hash type (%llu >= %u)",
+ INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
+
+ ret = __bch2_inode_invalid(c, k, err);
+fsck_err:
+ return ret;
+}
+
+int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
+ int ret = 0;
+
+ bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
+ INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err,
+ inode_v3_fields_start_bad,
+ "invalid fields_start (got %llu, min %u max %zu)",
+ INODEv3_FIELDS_START(inode.v),
+ INODEv3_FIELDS_START_INITIAL,
+ bkey_val_u64s(inode.k));
+
+ bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
+ inode_str_hash_invalid,
+ "invalid str hash type (%llu >= %u)",
+ INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
+
+ ret = __bch2_inode_invalid(c, k, err);
+fsck_err:
+ return ret;
+}
+
+static void __bch2_inode_unpacked_to_text(struct printbuf *out,
+ struct bch_inode_unpacked *inode)
+{
+ prt_printf(out, "mode=%o ", inode->bi_mode);
+
+ prt_str(out, "flags=");
+ prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
+ prt_printf(out, " (%x)", inode->bi_flags);
+
+ prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu",
+ inode->bi_journal_seq,
+ inode->bi_size,
+ inode->bi_sectors,
+ inode->bi_version);
+
+#define x(_name, _bits) \
+ prt_printf(out, " "#_name "=%llu", (u64) inode->_name);
+ BCH_INODE_FIELDS_v3()
+#undef x
+}
+
+void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
+{
+ prt_printf(out, "inum: %llu ", inode->bi_inum);
+ __bch2_inode_unpacked_to_text(out, inode);
+}
+
+void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bch_inode_unpacked inode;
+
+ if (bch2_inode_unpack(k, &inode)) {
+ prt_printf(out, "(unpack error)");
+ return;
+ }
+
+ __bch2_inode_unpacked_to_text(out, &inode);
+}
+
+static inline u64 bkey_inode_flags(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_inode:
+ return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
+ case KEY_TYPE_inode_v2:
+ return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
+ case KEY_TYPE_inode_v3:
+ return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
+ default:
+ return 0;
+ }
+}
+
+static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
+{
+ return bkey_inode_flags(k) & BCH_INODE_unlinked;
+}
+
+int bch2_trans_mark_inode(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
+{
+ int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k);
+ bool old_deleted = bkey_is_deleted_inode(old);
+ bool new_deleted = bkey_is_deleted_inode(bkey_i_to_s_c(new));
+
+ if (nr) {
+ int ret = bch2_replicas_deltas_realloc(trans, 0);
+ struct replicas_delta_list *d = trans->fs_usage_deltas;
+
+ if (ret)
+ return ret;
+
+ d->nr_inodes += nr;
+ }
+
+ if (old_deleted != new_deleted) {
+ int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new->k.p, new_deleted);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int bch2_mark_inode(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_fs_usage *fs_usage;
+ u64 journal_seq = trans->journal_res.seq;
+
+ if (flags & BTREE_TRIGGER_INSERT) {
+ struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v;
+
+ BUG_ON(!journal_seq);
+ BUG_ON(new.k->type != KEY_TYPE_inode_v3);
+
+ v->bi_journal_seq = cpu_to_le64(journal_seq);
+ }
+
+ if (flags & BTREE_TRIGGER_GC) {
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
+
+ fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
+ fs_usage->nr_inodes += bkey_is_inode(new.k);
+ fs_usage->nr_inodes -= bkey_is_inode(old.k);
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
+ }
+ return 0;
+}
+
+int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(k.k->p.inode, c, err,
+ inode_pos_inode_nonzero,
+ "nonzero k.p.inode");
+fsck_err:
+ return ret;
+}
+
+void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k);
+
+ prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
+}
+
+void bch2_inode_init_early(struct bch_fs *c,
+ struct bch_inode_unpacked *inode_u)
+{
+ enum bch_str_hash_type str_hash =
+ bch2_str_hash_opt_to_type(c, c->opts.str_hash);
+
+ memset(inode_u, 0, sizeof(*inode_u));
+
+ /* ick */
+ inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET;
+ get_random_bytes(&inode_u->bi_hash_seed,
+ sizeof(inode_u->bi_hash_seed));
+}
+
+void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+ struct bch_inode_unpacked *parent)
+{
+ inode_u->bi_mode = mode;
+ inode_u->bi_uid = uid;
+ inode_u->bi_gid = gid;
+ inode_u->bi_dev = rdev;
+ inode_u->bi_atime = now;
+ inode_u->bi_mtime = now;
+ inode_u->bi_ctime = now;
+ inode_u->bi_otime = now;
+
+ if (parent && parent->bi_mode & S_ISGID) {
+ inode_u->bi_gid = parent->bi_gid;
+ if (S_ISDIR(mode))
+ inode_u->bi_mode |= S_ISGID;
+ }
+
+ if (parent) {
+#define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name;
+ BCH_INODE_OPTS()
+#undef x
+ }
+}
+
+void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+ struct bch_inode_unpacked *parent)
+{
+ bch2_inode_init_early(c, inode_u);
+ bch2_inode_init_late(inode_u, bch2_current_time(c),
+ uid, gid, mode, rdev, parent);
+}
+
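+/*
+ * When an inode is deleted, a KEY_TYPE_inode_generation key is left behind at
+ * its position (see bch2_inode_rm()); if the inode number is later reused,
+ * bch2_inode_create() picks the bumped generation up from it via this helper:
+ */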
+static inline u32 bkey_generation(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_inode:
+ case KEY_TYPE_inode_v2:
+ BUG();
+ case KEY_TYPE_inode_generation:
+ return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
+ default:
+ return 0;
+ }
+}
+
+/*
+ * This just finds an empty slot:
+ */
+int bch2_inode_create(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode_u,
+ u32 snapshot, u64 cpu)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ u64 min, max, start, pos, *hint;
+ int ret = 0;
+ unsigned bits = (c->opts.inodes_32bit ? 31 : 63);
+
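+ /*
+ * With shard_inode_numbers, the top inode_shard_bits of the inode
+ * number encode the cpu it was allocated on; each shard then allocates
+ * sequentially within its own range, using its own allocation hint:
+ */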
+ if (c->opts.shard_inode_numbers) {
+ bits -= c->inode_shard_bits;
+
+ min = (cpu << bits);
+ max = (cpu << bits) | ~(ULLONG_MAX << bits);
+
+ min = max_t(u64, min, BLOCKDEV_INODE_MAX);
+ hint = c->unused_inode_hints + cpu;
+ } else {
+ min = BLOCKDEV_INODE_MAX;
+ max = ~(ULLONG_MAX << bits);
+ hint = c->unused_inode_hints;
+ }
+
+ start = READ_ONCE(*hint);
+
+ if (start >= max || start < min)
+ start = min;
+
+ pos = start;
+ bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_INTENT);
+again:
+ while ((k = bch2_btree_iter_peek(iter)).k &&
+ !(ret = bkey_err(k)) &&
+ bkey_lt(k.k->p, POS(0, max))) {
+ if (pos < iter->pos.offset)
+ goto found_slot;
+
+ /*
+ * We don't need to iterate over keys in every snapshot once
+ * we've found just one:
+ */
+ pos = iter->pos.offset + 1;
+ bch2_btree_iter_set_pos(iter, POS(0, pos));
+ }
+
+ if (!ret && pos < max)
+ goto found_slot;
+
+ if (!ret && start == min)
+ ret = -BCH_ERR_ENOSPC_inode_create;
+
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
+ }
+
+ /* Retry from start */
+ pos = start = min;
+ bch2_btree_iter_set_pos(iter, POS(0, pos));
+ goto again;
+found_slot:
+ bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
+ }
+
+ *hint = k.k->p.offset;
+ inode_u->bi_inum = k.k->p.offset;
+ inode_u->bi_generation = bkey_generation(k);
+ return 0;
+}
+
+static int bch2_inode_delete_keys(struct btree_trans *trans,
+ subvol_inum inum, enum btree_id id)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i delete;
+ struct bpos end = POS(inum.inum, U64_MAX);
+ u32 snapshot;
+ int ret = 0;
+
+ /*
+ * We're never going to be deleting partial extents, no need to use an
+ * extent iterator:
+ */
+ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
+ BTREE_ITER_INTENT);
+
+ while (1) {
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+
+ k = bch2_btree_iter_peek_upto(&iter, end);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k)
+ break;
+
+ bkey_init(&delete.k);
+ delete.k.p = iter.pos;
+
+ if (iter.flags & BTREE_ITER_IS_EXTENTS)
+ bch2_key_resize(&delete.k,
+ bpos_min(end, k.k->p).offset -
+ iter.pos.offset);
+
+ ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+err:
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter = { NULL };
+ struct bkey_i_inode_generation delete;
+ struct bch_inode_unpacked inode_u;
+ struct bkey_s_c k;
+ u32 snapshot;
+ int ret;
+
+ /*
+ * If this was a directory, there shouldn't be any real dirents left -
+ * but there could be whiteouts (from hash collisions) that we should
+ * delete:
+ *
+ * XXX: the dirent code could ideally delete whiteouts when they're no
+ * longer needed
+ */
+ ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
+ bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
+ bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
+ if (ret)
+ goto err;
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, inum.inum, snapshot),
+ BTREE_ITER_INTENT|BTREE_ITER_CACHED);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!bkey_is_inode(k.k)) {
+ bch2_fs_inconsistent(c,
+ "inode %llu:%u not found when deleting",
+ inum.inum, snapshot);
+ ret = -EIO;
+ goto err;
+ }
+
+ bch2_inode_unpack(k, &inode_u);
+
+ bkey_inode_generation_init(&delete.k_i);
+ delete.k.p = iter.pos;
+ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
+
+ ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_put(trans);
+ return ret;
+}
+
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
+ subvol_inum inum,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ int ret;
+
+ ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
+ if (!ret)
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
+ subvol_inum inum,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ int ret;
+
+ ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
+ if (!ret)
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
+ struct bch_inode_unpacked *inode)
+{
+ return bch2_trans_do(c, NULL, NULL, 0,
+ bch2_inode_find_by_inum_trans(trans, inum, inode));
+}
+
+int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
+{
+ if (bi->bi_flags & BCH_INODE_unlinked)
+ bi->bi_flags &= ~BCH_INODE_unlinked;
+ else {
+ if (bi->bi_nlink == U32_MAX)
+ return -EINVAL;
+
+ bi->bi_nlink++;
+ }
+
+ return 0;
+}
+
+void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
+{
+ if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) {
+ bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
+ bi->bi_inum);
+ return;
+ }
+
+ if (bi->bi_flags & BCH_INODE_unlinked) {
+ bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
+ return;
+ }
+
+ if (bi->bi_nlink)
+ bi->bi_nlink--;
+ else
+ bi->bi_flags |= BCH_INODE_unlinked;
+}
+
+struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
+{
+ struct bch_opts ret = { 0 };
+#define x(_name, _bits) \
+ if (inode->bi_##_name) \
+ opt_set(ret, _name, inode->bi_##_name - 1);
+ BCH_INODE_OPTS()
+#undef x
+ return ret;
+}
+
+void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
+ struct bch_inode_unpacked *inode)
+{
+#define x(_name, _bits) opts->_name = inode_opt_get(c, inode, _name);
+ BCH_INODE_OPTS()
+#undef x
+
+ if (opts->nocow)
+ opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
+}
+
+int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
+{
+ struct bch_inode_unpacked inode;
+ int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
+
+ if (ret)
+ return ret;
+
+ bch2_inode_opts_get(opts, trans->c, &inode);
+ return 0;
+}
+
+int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter = { NULL };
+ struct bkey_i_inode_generation delete;
+ struct bch_inode_unpacked inode_u;
+ struct bkey_s_c k;
+ int ret;
+
+ do {
+ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL) ?:
+ bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL) ?:
+ bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL);
+ } while (ret == -BCH_ERR_transaction_restart_nested);
+ if (ret)
+ goto err;
+retry:
+ bch2_trans_begin(trans);
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!bkey_is_inode(k.k)) {
+ bch2_fs_inconsistent(c,
+ "inode %llu:%u not found when deleting",
+ inum, snapshot);
+ ret = -EIO;
+ goto err;
+ }
+
+ bch2_inode_unpack(k, &inode_u);
+
+ /* Subvolume root? */
+ if (inode_u.bi_subvol)
+ bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);
+
+ bkey_inode_generation_init(&delete.k_i);
+ delete.k.p = iter.pos;
+ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
+
+ ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ return ret ?: -BCH_ERR_transaction_restart_nested;
+}
+
+static int may_delete_deleted_inode(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos pos,
+ bool *need_another_pass)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter inode_iter;
+ struct bkey_s_c k;
+ struct bch_inode_unpacked inode;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
+ if (fsck_err_on(!bkey_is_inode(k.k), c,
+ deleted_inode_missing,
+ "nonexistent inode %llu:%u in deleted_inodes btree",
+ pos.offset, pos.snapshot))
+ goto delete;
+
+ ret = bch2_inode_unpack(k, &inode);
+ if (ret)
+ goto out;
+
+ if (S_ISDIR(inode.bi_mode)) {
+ ret = bch2_empty_dir_snapshot(trans, pos.offset, pos.snapshot);
+ if (fsck_err_on(ret == -ENOTEMPTY, c, deleted_inode_is_dir,
+ "non empty directory %llu:%u in deleted_inodes btree",
+ pos.offset, pos.snapshot))
+ goto delete;
+ if (ret)
+ goto out;
+ }
+
+ if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c,
+ deleted_inode_not_unlinked,
+ "non-deleted inode %llu:%u in deleted_inodes btree",
+ pos.offset, pos.snapshot))
+ goto delete;
+
+ if (c->sb.clean &&
+ !fsck_err(c,
+ deleted_inode_but_clean,
+ "filesystem marked as clean but have deleted inode %llu:%u",
+ pos.offset, pos.snapshot)) {
+ ret = 0;
+ goto out;
+ }
+
+ if (bch2_snapshot_is_internal_node(c, pos.snapshot)) {
+ struct bpos new_min_pos;
+
+ ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos);
+ if (ret)
+ goto out;
+
+ inode.bi_flags &= ~BCH_INODE_unlinked;
+
+ ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch_err_msg(c, ret, "clearing inode unlinked flag");
+ if (ret)
+ goto out;
+
+ /*
+ * We'll need another write buffer flush to pick up the new
+ * unlinked inodes in the snapshot leaves:
+ */
+ *need_another_pass = true;
+ goto out;
+ }
+
+ ret = 1;
+out:
+fsck_err:
+ bch2_trans_iter_exit(trans, &inode_iter);
+ return ret;
+delete:
+ ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
+ goto out;
+}
+
+int bch2_delete_dead_inodes(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ bool need_another_pass;
+ int ret;
+again:
+ need_another_pass = false;
+
+ ret = bch2_btree_write_buffer_flush_sync(trans);
+ if (ret)
+ goto err;
+
+ /*
+ * Weird transaction restart handling here because on successful delete,
+ * bch2_inode_rm_snapshot() will return a nested transaction restart,
+ * but we can't retry because the btree write buffer won't have been
+ * flushed and we'd spin:
+ */
+ for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass));
+ if (ret < 0)
+ break;
+
+ if (ret) {
+ if (!test_bit(BCH_FS_RW, &c->flags)) {
+ bch2_trans_unlock(trans);
+ bch2_fs_lazy_rw(c);
+ }
+
+ bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot);
+
+ ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (!ret && need_another_pass)
+ goto again;
+err:
+ bch2_trans_put(trans);
+
+ return ret;
+}
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
new file mode 100644
index 000000000000..88818a332b1e
--- /dev/null
+++ b/fs/bcachefs/inode.h
@@ -0,0 +1,217 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_INODE_H
+#define _BCACHEFS_INODE_H
+
+#include "bkey.h"
+#include "bkey_methods.h"
+#include "opts.h"
+
+enum bkey_invalid_flags;
+extern const char * const bch2_inode_opts[];
+
+int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_mark_inode(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+
+#define bch2_bkey_ops_inode ((struct bkey_ops) { \
+ .key_invalid = bch2_inode_invalid, \
+ .val_to_text = bch2_inode_to_text, \
+ .trans_trigger = bch2_trans_mark_inode, \
+ .atomic_trigger = bch2_mark_inode, \
+ .min_val_size = 16, \
+})
+
+#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \
+ .key_invalid = bch2_inode_v2_invalid, \
+ .val_to_text = bch2_inode_to_text, \
+ .trans_trigger = bch2_trans_mark_inode, \
+ .atomic_trigger = bch2_mark_inode, \
+ .min_val_size = 32, \
+})
+
+#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \
+ .key_invalid = bch2_inode_v3_invalid, \
+ .val_to_text = bch2_inode_to_text, \
+ .trans_trigger = bch2_trans_mark_inode, \
+ .atomic_trigger = bch2_mark_inode, \
+ .min_val_size = 48, \
+})
+
+static inline bool bkey_is_inode(const struct bkey *k)
+{
+ return k->type == KEY_TYPE_inode ||
+ k->type == KEY_TYPE_inode_v2 ||
+ k->type == KEY_TYPE_inode_v3;
+}
+
+int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \
+ .key_invalid = bch2_inode_generation_invalid, \
+ .val_to_text = bch2_inode_generation_to_text, \
+ .min_val_size = 8, \
+})
+
+#if 0
+typedef struct {
+ u64 lo;
+ u32 hi;
+} __packed __aligned(4) u96;
+#endif
+typedef u64 u96;
+
+struct bch_inode_unpacked {
+ u64 bi_inum;
+ u64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ u64 bi_size;
+ u64 bi_sectors;
+ u64 bi_version;
+ u32 bi_flags;
+ u16 bi_mode;
+
+#define x(_name, _bits) u##_bits _name;
+ BCH_INODE_FIELDS_v3()
+#undef x
+};
+
+struct bkey_inode_buf {
+ struct bkey_i_inode_v3 inode;
+
+#define x(_name, _bits) + 8 + _bits / 8
+ u8 _pad[0 + BCH_INODE_FIELDS_v3()];
+#undef x
+} __packed __aligned(8);
+
+void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
+int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *);
+struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
+
+void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
+
+int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *, subvol_inum, unsigned);
+
+int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *, enum btree_update_flags);
+
+static inline int bch2_inode_write(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode)
+{
+ return bch2_inode_write_flags(trans, iter, inode, 0);
+}
+
+void bch2_inode_init_early(struct bch_fs *,
+ struct bch_inode_unpacked *);
+void bch2_inode_init_late(struct bch_inode_unpacked *, u64,
+ uid_t, gid_t, umode_t, dev_t,
+ struct bch_inode_unpacked *);
+void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
+ uid_t, gid_t, umode_t, dev_t,
+ struct bch_inode_unpacked *);
+
+int bch2_inode_create(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *, u32, u64);
+
+int bch2_inode_rm(struct bch_fs *, subvol_inum);
+
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *,
+ subvol_inum,
+ struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
+ struct bch_inode_unpacked *);
+
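+/*
+ * Per-inode IO options are stored biased by one: 0 means "unset, fall back to
+ * the filesystem-wide option", nonzero means the option value plus one:
+ */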
+#define inode_opt_get(_c, _inode, _name) \
+ ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name)
+
+static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
+ enum inode_opt_id id, u64 v)
+{
+ switch (id) {
+#define x(_name, ...) \
+ case Inode_opt_##_name: \
+ inode->bi_##_name = v; \
+ break;
+ BCH_INODE_OPTS()
+#undef x
+ default:
+ BUG();
+ }
+}
+
+static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode,
+ enum inode_opt_id id)
+{
+ switch (id) {
+#define x(_name, ...) \
+ case Inode_opt_##_name: \
+ return inode->bi_##_name;
+ BCH_INODE_OPTS()
+#undef x
+ default:
+ BUG();
+ }
+}
+
+static inline u8 mode_to_type(umode_t mode)
+{
+ return (mode >> 12) & 15;
+}
+
+static inline u8 inode_d_type(struct bch_inode_unpacked *inode)
+{
+ return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode);
+}
+
+/* i_nlink: */
+
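+/*
+ * bi_nlink stores the link count minus a bias (2 for directories, which
+ * conventionally count "." and the entry in the parent; 1 otherwise); an inode
+ * with no remaining links is instead flagged BCH_INODE_unlinked, with
+ * bi_nlink == 0:
+ */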
+static inline unsigned nlink_bias(umode_t mode)
+{
+ return S_ISDIR(mode) ? 2 : 1;
+}
+
+static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi)
+{
+ return bi->bi_flags & BCH_INODE_unlinked
+ ? 0
+ : bi->bi_nlink + nlink_bias(bi->bi_mode);
+}
+
+static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
+ unsigned nlink)
+{
+ if (nlink) {
+ bi->bi_nlink = nlink - nlink_bias(bi->bi_mode);
+ bi->bi_flags &= ~BCH_INODE_unlinked;
+ } else {
+ bi->bi_nlink = 0;
+ bi->bi_flags |= BCH_INODE_unlinked;
+ }
+}
+
+int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
+void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
+
+struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
+void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
+ struct bch_inode_unpacked *);
+int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *);
+
+int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
+int bch2_delete_dead_inodes(struct bch_fs *);
+
+#endif /* _BCACHEFS_INODE_H */
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
new file mode 100644
index 000000000000..bebc11444ef5
--- /dev/null
+++ b/fs/bcachefs/io_misc.c
@@ -0,0 +1,524 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * io_misc.c - fallocate, fpunch, truncate:
+ */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "clock.h"
+#include "error.h"
+#include "extents.h"
+#include "extent_update.h"
+#include "inode.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "logged_ops.h"
+#include "rebalance.h"
+#include "subvolume.h"
+
+/* Overwrites whatever was present with zeroes: */
+int bch2_extent_fallocate(struct btree_trans *trans,
+ subvol_inum inum,
+ struct btree_iter *iter,
+ u64 sectors,
+ struct bch_io_opts opts,
+ s64 *i_sectors_delta,
+ struct write_point_specifier write_point)
+{
+ struct bch_fs *c = trans->c;
+ struct disk_reservation disk_res = { 0 };
+ struct closure cl;
+ struct open_buckets open_buckets = { 0 };
+ struct bkey_s_c k;
+ struct bkey_buf old, new;
+ unsigned sectors_allocated = 0;
+ bool have_reservation = false;
+ bool unwritten = opts.nocow &&
+ c->sb.version >= bcachefs_metadata_version_unwritten_extents;
+ int ret;
+
+ bch2_bkey_buf_init(&old);
+ bch2_bkey_buf_init(&new);
+ closure_init_stack(&cl);
+
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
+
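+ /*
+ * Depending on the options and on-disk format we either insert a
+ * reservation key, or - for nocow with unwritten extent support -
+ * allocate real unwritten extents from the allocator below:
+ */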
+ if (!have_reservation) {
+ unsigned new_replicas =
+ max(0, (int) opts.data_replicas -
+ (int) bch2_bkey_nr_ptrs_fully_allocated(k));
+ /*
+ * Get a disk reservation before (in the nocow case) calling
+ * into the allocator:
+ */
+ ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
+ if (unlikely(ret))
+ goto err;
+
+ bch2_bkey_buf_reassemble(&old, c, k);
+ }
+
+ if (have_reservation) {
+ if (!bch2_extents_match(k, bkey_i_to_s_c(old.k)))
+ goto err;
+
+ bch2_key_resize(&new.k->k, sectors);
+ } else if (!unwritten) {
+ struct bkey_i_reservation *reservation;
+
+ bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
+ reservation = bkey_reservation_init(new.k);
+ reservation->k.p = iter->pos;
+ bch2_key_resize(&reservation->k, sectors);
+ reservation->v.nr_replicas = opts.data_replicas;
+ } else {
+ struct bkey_i_extent *e;
+ struct bch_devs_list devs_have;
+ struct write_point *wp;
+ struct bch_extent_ptr *ptr;
+
+ devs_have.nr = 0;
+
+ bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
+
+ e = bkey_extent_init(new.k);
+ e->k.p = iter->pos;
+
+ ret = bch2_alloc_sectors_start_trans(trans,
+ opts.foreground_target,
+ false,
+ write_point,
+ &devs_have,
+ opts.data_replicas,
+ opts.data_replicas,
+ BCH_WATERMARK_normal, 0, &cl, &wp);
+ if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+ ret = -BCH_ERR_transaction_restart_nested;
+ if (ret)
+ goto err;
+
+ sectors = min_t(u64, sectors, wp->sectors_free);
+ sectors_allocated = sectors;
+
+ bch2_key_resize(&e->k, sectors);
+
+ bch2_open_bucket_get(c, wp, &open_buckets);
+ bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
+ bch2_alloc_sectors_done(c, wp);
+
+ extent_for_each_ptr(extent_i_to_s(e), ptr)
+ ptr->unwritten = true;
+ }
+
+ have_reservation = true;
+
+ ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
+ 0, i_sectors_delta, true);
+err:
+ if (!ret && sectors_allocated)
+ bch2_increment_clock(c, sectors_allocated, WRITE);
+
+ bch2_open_buckets_put(c, &open_buckets);
+ bch2_disk_reservation_put(c, &disk_res);
+ bch2_bkey_buf_exit(&new, c);
+ bch2_bkey_buf_exit(&old, c);
+
+ if (closure_nr_remaining(&cl) != 1) {
+ bch2_trans_unlock(trans);
+ closure_sync(&cl);
+ }
+
+ return ret;
+}
+
+/*
+ * Returns -BCH_ERR_transaction_restart if we had to drop locks:
+ */
+int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
+ subvol_inum inum, u64 end,
+ s64 *i_sectors_delta)
+{
+ struct bch_fs *c = trans->c;
+ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
+ struct bpos end_pos = POS(inum.inum, end);
+ struct bkey_s_c k;
+ int ret = 0, ret2 = 0;
+ u32 snapshot;
+
+ while (!ret ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ struct disk_reservation disk_res =
+ bch2_disk_reservation_init(c, 0);
+ struct bkey_i delete;
+
+ if (ret)
+ ret2 = ret;
+
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(iter, snapshot);
+
+ /*
+ * peek_upto() doesn't have ideal semantics for extents:
+ */
+ k = bch2_btree_iter_peek_upto(iter, end_pos);
+ if (!k.k)
+ break;
+
+ ret = bkey_err(k);
+ if (ret)
+ continue;
+
+ bkey_init(&delete.k);
+ delete.k.p = iter->pos;
+
+ /* create the biggest key we can */
+ bch2_key_resize(&delete.k, max_sectors);
+ bch2_cut_back(end_pos, &delete);
+
+ ret = bch2_extent_update(trans, inum, iter, &delete,
+ &disk_res, 0, i_sectors_delta, false);
+ bch2_disk_reservation_put(c, &disk_res);
+ }
+
+ return ret ?: ret2;
+}
+
+int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
+ s64 *i_sectors_delta)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ POS(inum.inum, start),
+ BTREE_ITER_INTENT);
+
+ ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta);
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
+
+ return ret;
+}
+
+/* truncate: */
+
+void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k);
+
+ prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
+ prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
+ prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size));
+}
+
+static int truncate_set_isize(struct btree_trans *trans,
+ subvol_inum inum,
+ u64 new_i_size)
+{
+ struct btree_iter iter = { NULL };
+ struct bch_inode_unpacked inode_u;
+ int ret;
+
+ ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT) ?:
+ (inode_u.bi_size = new_i_size, 0) ?:
+ bch2_inode_write(trans, &iter, &inode_u);
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
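+/*
+ * Truncate is implemented as a resumable logged operation: the new i_size is
+ * committed along with the logged op key, then the tail is fpunched; if we
+ * crash partway through, resuming the logged op redoes the work:
+ */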
+static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
+ struct bkey_i *op_k,
+ u64 *i_sectors_delta)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter fpunch_iter;
+ struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k);
+ subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
+ u64 new_i_size = le64_to_cpu(op->v.new_i_size);
+ int ret;
+
+ ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ truncate_set_isize(trans, inum, new_i_size));
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents,
+ POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9),
+ BTREE_ITER_INTENT);
+ ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta);
+ bch2_trans_iter_exit(trans, &fpunch_iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
+err:
+ bch2_logged_op_finish(trans, op_k);
+ return ret;
+}
+
+int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k)
+{
+ return __bch2_resume_logged_op_truncate(trans, op_k, NULL);
+}
+
+int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta)
+{
+ struct bkey_i_logged_op_truncate op;
+
+ bkey_logged_op_truncate_init(&op.k_i);
+ op.v.subvol = cpu_to_le32(inum.subvol);
+ op.v.inum = cpu_to_le64(inum.inum);
+ op.v.new_i_size = cpu_to_le64(new_i_size);
+
+ /*
+ * Logged ops aren't atomic w.r.t. snapshot creation: creating a
+ * snapshot while they're in progress, then crashing, will result in the
+ * resume only proceeding in one of the snapshots
+ */
+ down_read(&c->snapshot_create_lock);
+ int ret = bch2_trans_run(c,
+ bch2_logged_op_start(trans, &op.k_i) ?:
+ __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta));
+ up_read(&c->snapshot_create_lock);
+
+ return ret;
+}
+
+/* finsert/fcollapse: */
+
+void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k);
+
+ prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
+ prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
+ prt_printf(out, " dst_offset=%lli", le64_to_cpu(op.v->dst_offset));
+ prt_printf(out, " src_offset=%llu", le64_to_cpu(op.v->src_offset));
+}
+
+static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len)
+{
+ struct btree_iter iter;
+ struct bch_inode_unpacked inode_u;
+ int ret;
+
+ offset <<= 9;
+ len <<= 9;
+
+ ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
+ if (ret)
+ return ret;
+
+ if (len > 0) {
+ if (MAX_LFS_FILESIZE - inode_u.bi_size < len) {
+ ret = -EFBIG;
+ goto err;
+ }
+
+ if (offset >= inode_u.bi_size) {
+ ret = -EINVAL;
+ goto err;
+ }
+ }
+
+ inode_u.bi_size += len;
+ inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c);
+
+ ret = bch2_inode_write(trans, &iter, &inode_u);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
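+/*
+ * finsert/fcollapse are likewise resumable logged operations: progress is
+ * persisted in the logged op key (a small state machine plus the current
+ * position), so after a crash the extent shift continues where it left off:
+ */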
+static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
+ struct bkey_i *op_k,
+ u64 *i_sectors_delta)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
+ subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
+ struct bch_io_opts opts;
+ u64 dst_offset = le64_to_cpu(op->v.dst_offset);
+ u64 src_offset = le64_to_cpu(op->v.src_offset);
+ s64 shift = dst_offset - src_offset;
+ u64 len = abs(shift);
+ u64 pos = le64_to_cpu(op->v.pos);
+ bool insert = shift > 0;
+ int ret = 0;
+
+ ret = bch2_inum_opts_get(trans, inum, &opts);
+ if (ret)
+ return ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ POS(inum.inum, 0),
+ BTREE_ITER_INTENT);
+
+ switch (op->v.state) {
+case LOGGED_OP_FINSERT_start:
+ op->v.state = LOGGED_OP_FINSERT_shift_extents;
+
+ if (insert) {
+ ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ adjust_i_size(trans, inum, src_offset, len) ?:
+ bch2_logged_op_update(trans, &op->k_i));
+ if (ret)
+ goto err;
+ } else {
+ bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset));
+
+ ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto err;
+
+ ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_logged_op_update(trans, &op->k_i));
+ }
+
+ fallthrough;
+case LOGGED_OP_FINSERT_shift_extents:
+ while (1) {
+ struct disk_reservation disk_res =
+ bch2_disk_reservation_init(c, 0);
+ struct bkey_i delete, *copy;
+ struct bkey_s_c k;
+ struct bpos src_pos = POS(inum.inum, src_offset);
+ u32 snapshot;
+
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto btree_err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+ bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));
+
+ k = insert
+ ? bch2_btree_iter_peek_prev(&iter)
+ : bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX));
+ if ((ret = bkey_err(k)))
+ goto btree_err;
+
+ if (!k.k ||
+ k.k->p.inode != inum.inum ||
+ bkey_le(k.k->p, POS(inum.inum, src_offset)))
+ break;
+
+ copy = bch2_bkey_make_mut_noupdate(trans, k);
+ if ((ret = PTR_ERR_OR_ZERO(copy)))
+ goto btree_err;
+
+ if (insert &&
+ bkey_lt(bkey_start_pos(k.k), src_pos)) {
+ bch2_cut_front(src_pos, copy);
+
+ /* Splitting compressed extent? */
+ bch2_disk_reservation_add(c, &disk_res,
+ copy->k.size *
+ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)),
+ BCH_DISK_RESERVATION_NOFAIL);
+ }
+
+ bkey_init(&delete.k);
+ delete.k.p = copy->k.p;
+ delete.k.p.snapshot = snapshot;
+ delete.k.size = copy->k.size;
+
+ copy->k.p.offset += shift;
+ copy->k.p.snapshot = snapshot;
+
+ op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
+
+ ret = bch2_bkey_set_needs_rebalance(c, copy,
+ opts.background_target,
+ opts.background_compression) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
+ bch2_logged_op_update(trans, &op->k_i) ?:
+ bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL);
+btree_err:
+ bch2_disk_reservation_put(c, &disk_res);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+
+ pos = le64_to_cpu(op->v.pos);
+ }
+
+ op->v.state = LOGGED_OP_FINSERT_finish;
+
+ if (!insert) {
+ ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ adjust_i_size(trans, inum, src_offset, shift) ?:
+ bch2_logged_op_update(trans, &op->k_i));
+ } else {
+ /* We need an inode update to update bi_journal_seq for fsync: */
+ ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ adjust_i_size(trans, inum, 0, 0) ?:
+ bch2_logged_op_update(trans, &op->k_i));
+ }
+
+ break;
+case LOGGED_OP_FINSERT_finish:
+ break;
+ }
+err:
+ bch2_logged_op_finish(trans, op_k);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k)
+{
+ return __bch2_resume_logged_op_finsert(trans, op_k, NULL);
+}
+
+int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum,
+ u64 offset, u64 len, bool insert,
+ s64 *i_sectors_delta)
+{
+ struct bkey_i_logged_op_finsert op;
+ s64 shift = insert ? len : -len;
+
+ bkey_logged_op_finsert_init(&op.k_i);
+ op.v.subvol = cpu_to_le32(inum.subvol);
+ op.v.inum = cpu_to_le64(inum.inum);
+ op.v.dst_offset = cpu_to_le64(offset + shift);
+ op.v.src_offset = cpu_to_le64(offset);
+ op.v.pos = cpu_to_le64(insert ? U64_MAX : offset);
+
+ /*
+ * Logged ops aren't atomic w.r.t. snapshot creation: creating a
+ * snapshot while they're in progress, then crashing, will result in the
+ * resume only proceeding in one of the snapshots
+ */
+ down_read(&c->snapshot_create_lock);
+ int ret = bch2_trans_run(c,
+ bch2_logged_op_start(trans, &op.k_i) ?:
+ __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta));
+ up_read(&c->snapshot_create_lock);
+
+ return ret;
+}
diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h
new file mode 100644
index 000000000000..9cb44a7c43c1
--- /dev/null
+++ b/fs/bcachefs/io_misc.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_MISC_H
+#define _BCACHEFS_IO_MISC_H
+
+int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
+ u64, struct bch_io_opts, s64 *,
+ struct write_point_specifier);
+int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
+ subvol_inum, u64, s64 *);
+int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
+
+void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_logged_op_truncate ((struct bkey_ops) { \
+ .val_to_text = bch2_logged_op_truncate_to_text, \
+ .min_val_size = 24, \
+})
+
+int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *);
+
+int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *);
+
+void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) { \
+ .val_to_text = bch2_logged_op_finsert_to_text, \
+ .min_val_size = 24, \
+})
+
+int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *);
+
+int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *);
+
+#endif /* _BCACHEFS_IO_MISC_H */
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
new file mode 100644
index 000000000000..36763865facd
--- /dev/null
+++ b/fs/bcachefs/io_read.c
@@ -0,0 +1,1210 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Some low level IO code, and hacks for various block layer limitations
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "clock.h"
+#include "compress.h"
+#include "data_update.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "io_read.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "subvolume.h"
+#include "trace.h"
+
+#include <linux/sched/mm.h>
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+ const struct bch_devs_mask *devs;
+ unsigned d, nr = 0, total = 0;
+ u64 now = local_clock(), last;
+ s64 congested;
+ struct bch_dev *ca;
+
+ if (!target)
+ return false;
+
+ rcu_read_lock();
+ devs = bch2_target_to_mask(c, target) ?:
+ &c->rw_devs[BCH_DATA_user];
+
+ for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
+ ca = rcu_dereference(c->devs[d]);
+ if (!ca)
+ continue;
+
+ congested = atomic_read(&ca->congested);
+ last = READ_ONCE(ca->congested_last);
+ if (time_after64(now, last))
+ congested -= (now - last) >> 12;
+
+ total += max(congested, 0LL);
+ nr++;
+ }
+ rcu_read_unlock();
+
+ return bch2_rand_range(nr * CONGESTED_MAX) < total;
+}
+
+#else
+
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+ return false;
+}
+
+#endif
+
+/* Cache promotion on read */
+
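+/*
+ * If a read's extent isn't already on the promote target (and the read allows
+ * promotion), we bounce the data and kick off a cached data_update in the
+ * background to copy the extent to the promote target:
+ */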
+struct promote_op {
+ struct rcu_head rcu;
+ u64 start_time;
+
+ struct rhash_head hash;
+ struct bpos pos;
+
+ struct data_update write;
+ struct bio_vec bi_inline_vecs[0]; /* must be last */
+};
+
+static const struct rhashtable_params bch_promote_params = {
+ .head_offset = offsetof(struct promote_op, hash),
+ .key_offset = offsetof(struct promote_op, pos),
+ .key_len = sizeof(struct bpos),
+};
+
+static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
+ struct bpos pos,
+ struct bch_io_opts opts,
+ unsigned flags)
+{
+ BUG_ON(!opts.promote_target);
+
+ if (!(flags & BCH_READ_MAY_PROMOTE))
+ return -BCH_ERR_nopromote_may_not;
+
+ if (bch2_bkey_has_target(c, k, opts.promote_target))
+ return -BCH_ERR_nopromote_already_promoted;
+
+ if (bkey_extent_is_unwritten(k))
+ return -BCH_ERR_nopromote_unwritten;
+
+ if (bch2_target_congested(c, opts.promote_target))
+ return -BCH_ERR_nopromote_congested;
+
+ if (rhashtable_lookup_fast(&c->promote_table, &pos,
+ bch_promote_params))
+ return -BCH_ERR_nopromote_in_flight;
+
+ return 0;
+}
+
+static void promote_free(struct bch_fs *c, struct promote_op *op)
+{
+ int ret;
+
+ bch2_data_update_exit(&op->write);
+
+ ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+ bch_promote_params);
+ BUG_ON(ret);
+ bch2_write_ref_put(c, BCH_WRITE_REF_promote);
+ kfree_rcu(op, rcu);
+}
+
+static void promote_done(struct bch_write_op *wop)
+{
+ struct promote_op *op =
+ container_of(wop, struct promote_op, write.op);
+ struct bch_fs *c = op->write.op.c;
+
+ bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
+ op->start_time);
+ promote_free(c, op);
+}
+
+static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
+{
+ struct bio *bio = &op->write.op.wbio.bio;
+
+ trace_and_count(op->write.op.c, read_promote, &rbio->bio);
+
+ /* we now own pages: */
+ BUG_ON(!rbio->bounce);
+ BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
+
+ memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
+ sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
+ swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
+
+ bch2_data_update_read_done(&op->write, rbio->pick.crc);
+}
+
+static struct promote_op *__promote_alloc(struct btree_trans *trans,
+ enum btree_id btree_id,
+ struct bkey_s_c k,
+ struct bpos pos,
+ struct extent_ptr_decoded *pick,
+ struct bch_io_opts opts,
+ unsigned sectors,
+ struct bch_read_bio **rbio)
+{
+ struct bch_fs *c = trans->c;
+ struct promote_op *op = NULL;
+ struct bio *bio;
+ unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+ int ret;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
+ return NULL;
+
+ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS);
+ if (!op)
+ goto err;
+
+ op->start_time = local_clock();
+ op->pos = pos;
+
+ /*
+ * We don't use the mempool here because extents that aren't
+ * checksummed or compressed can be too big for the mempool:
+ */
+ *rbio = kzalloc(sizeof(struct bch_read_bio) +
+ sizeof(struct bio_vec) * pages,
+ GFP_NOFS);
+ if (!*rbio)
+ goto err;
+
+ rbio_init(&(*rbio)->bio, opts);
+ bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
+
+ if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
+ GFP_NOFS))
+ goto err;
+
+ (*rbio)->bounce = true;
+ (*rbio)->split = true;
+ (*rbio)->kmalloc = true;
+
+ if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
+ bch_promote_params))
+ goto err;
+
+ bio = &op->write.op.wbio.bio;
+ bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
+
+ ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
+ writepoint_hashed((unsigned long) current),
+ opts,
+ (struct data_update_opts) {
+ .target = opts.promote_target,
+ .extra_replicas = 1,
+ .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
+ },
+ btree_id, k);
+ /*
+ * possible errors: -BCH_ERR_nocow_lock_blocked,
+ * -BCH_ERR_ENOSPC_disk_reservation:
+ */
+ if (ret) {
+ ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+ bch_promote_params);
+ BUG_ON(ret);
+ goto err;
+ }
+
+ op->write.op.end_io = promote_done;
+
+ return op;
+err:
+ if (*rbio)
+ bio_free_pages(&(*rbio)->bio);
+ kfree(*rbio);
+ *rbio = NULL;
+ kfree(op);
+ bch2_write_ref_put(c, BCH_WRITE_REF_promote);
+ return NULL;
+}
+
+noinline
+static struct promote_op *promote_alloc(struct btree_trans *trans,
+ struct bvec_iter iter,
+ struct bkey_s_c k,
+ struct extent_ptr_decoded *pick,
+ struct bch_io_opts opts,
+ unsigned flags,
+ struct bch_read_bio **rbio,
+ bool *bounce,
+ bool *read_full)
+{
+ struct bch_fs *c = trans->c;
+ bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
+ /* data might have to be decompressed in the write path: */
+ unsigned sectors = promote_full
+ ? max(pick->crc.compressed_size, pick->crc.live_size)
+ : bvec_iter_sectors(iter);
+ struct bpos pos = promote_full
+ ? bkey_start_pos(k.k)
+ : POS(k.k->p.inode, iter.bi_sector);
+ struct promote_op *promote;
+ int ret;
+
+ ret = should_promote(c, k, pos, opts, flags);
+ if (ret)
+ goto nopromote;
+
+ promote = __promote_alloc(trans,
+ k.k->type == KEY_TYPE_reflink_v
+ ? BTREE_ID_reflink
+ : BTREE_ID_extents,
+ k, pos, pick, opts, sectors, rbio);
+ if (!promote) {
+ ret = -BCH_ERR_nopromote_enomem;
+ goto nopromote;
+ }
+
+ *bounce = true;
+ *read_full = promote_full;
+ return promote;
+nopromote:
+ trace_read_nopromote(c, ret);
+ return NULL;
+}
+
+/* Read */
+
+#define READ_RETRY_AVOID 1
+#define READ_RETRY 2
+#define READ_ERR 3
+
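+/*
+ * Read completion work runs in the cheapest context it can: checksum-only
+ * completions are punted to the highpri workqueue, anything that needs
+ * decompression, decryption, crc narrowing or promotion goes to the unbound
+ * workqueue, and everything else completes inline in the endio path (see
+ * bch2_read_endio()):
+ */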
+enum rbio_context {
+ RBIO_CONTEXT_NULL,
+ RBIO_CONTEXT_HIGHPRI,
+ RBIO_CONTEXT_UNBOUND,
+};
+
+static inline struct bch_read_bio *
+bch2_rbio_parent(struct bch_read_bio *rbio)
+{
+ return rbio->split ? rbio->parent : rbio;
+}
+
+__always_inline
+static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
+ enum rbio_context context,
+ struct workqueue_struct *wq)
+{
+ if (context <= rbio->context) {
+ fn(&rbio->work);
+ } else {
+ rbio->work.func = fn;
+ rbio->context = context;
+ queue_work(wq, &rbio->work);
+ }
+}
+
+static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
+{
+ BUG_ON(rbio->bounce && !rbio->split);
+
+ if (rbio->promote)
+ promote_free(rbio->c, rbio->promote);
+ rbio->promote = NULL;
+
+ if (rbio->bounce)
+ bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
+
+ if (rbio->split) {
+ struct bch_read_bio *parent = rbio->parent;
+
+ if (rbio->kmalloc)
+ kfree(rbio);
+ else
+ bio_put(&rbio->bio);
+
+ rbio = parent;
+ }
+
+ return rbio;
+}
+
+/*
+ * Only called on a top level bch_read_bio to complete an entire read request,
+ * not a split:
+ */
+static void bch2_rbio_done(struct bch_read_bio *rbio)
+{
+ if (rbio->start_time)
+ bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
+ rbio->start_time);
+ bio_endio(&rbio->bio);
+}
+
+static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter,
+ struct bch_io_failures *failed,
+ unsigned flags)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_buf sk;
+ struct bkey_s_c k;
+ int ret;
+
+ flags &= ~BCH_READ_LAST_FRAGMENT;
+ flags |= BCH_READ_MUST_CLONE;
+
+ bch2_bkey_buf_init(&sk);
+
+ bch2_trans_iter_init(trans, &iter, rbio->data_btree,
+ rbio->read_pos, BTREE_ITER_SLOTS);
+retry:
+ rbio->bio.bi_status = 0;
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ if (bkey_err(k))
+ goto err;
+
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
+ bch2_trans_unlock(trans);
+
+ if (!bch2_bkey_matches_ptr(c, k,
+ rbio->pick.ptr,
+ rbio->data_pos.offset -
+ rbio->pick.crc.offset)) {
+ /* extent we wanted to read no longer exists: */
+ rbio->hole = true;
+ goto out;
+ }
+
+ ret = __bch2_read_extent(trans, rbio, bvec_iter,
+ rbio->read_pos,
+ rbio->data_btree,
+ k, 0, failed, flags);
+ if (ret == READ_RETRY)
+ goto retry;
+ if (ret)
+ goto err;
+out:
+ bch2_rbio_done(rbio);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&sk, c);
+ return;
+err:
+ rbio->bio.bi_status = BLK_STS_IOERR;
+ goto out;
+}
+
+static void bch2_rbio_retry(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bch_fs *c = rbio->c;
+ struct bvec_iter iter = rbio->bvec_iter;
+ unsigned flags = rbio->flags;
+ subvol_inum inum = {
+ .subvol = rbio->subvol,
+ .inum = rbio->read_pos.inode,
+ };
+ struct bch_io_failures failed = { .nr = 0 };
+
+ trace_and_count(c, read_retry, &rbio->bio);
+
+ if (rbio->retry == READ_RETRY_AVOID)
+ bch2_mark_io_failure(&failed, &rbio->pick);
+
+ rbio->bio.bi_status = 0;
+
+ rbio = bch2_rbio_free(rbio);
+
+ flags |= BCH_READ_IN_RETRY;
+ flags &= ~BCH_READ_MAY_PROMOTE;
+
+ if (flags & BCH_READ_NODECODE) {
+ bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
+ } else {
+ flags &= ~BCH_READ_LAST_FRAGMENT;
+ flags |= BCH_READ_MUST_CLONE;
+
+ __bch2_read(c, rbio, iter, inum, &failed, flags);
+ }
+}
+
+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
+ blk_status_t error)
+{
+ rbio->retry = retry;
+
+ if (rbio->flags & BCH_READ_IN_RETRY)
+ return;
+
+ if (retry == READ_ERR) {
+ rbio = bch2_rbio_free(rbio);
+
+ rbio->bio.bi_status = error;
+ bch2_rbio_done(rbio);
+ } else {
+ bch2_rbio_punt(rbio, bch2_rbio_retry,
+ RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+ }
+}
+
+static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
+ struct bch_read_bio *rbio)
+{
+ struct bch_fs *c = rbio->c;
+ u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
+ struct bch_extent_crc_unpacked new_crc;
+ struct btree_iter iter;
+ struct bkey_i *new;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ if (crc_is_compressed(rbio->pick.crc))
+ return 0;
+
+ k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ if ((ret = bkey_err(k)))
+ goto out;
+
+ if (bversion_cmp(k.k->version, rbio->version) ||
+ !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
+ goto out;
+
+ /* Extent was merged? */
+ if (bkey_start_offset(k.k) < data_offset ||
+ k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
+ goto out;
+
+ if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
+ rbio->pick.crc, NULL, &new_crc,
+ bkey_start_offset(k.k) - data_offset, k.k->size,
+ rbio->pick.crc.csum_type)) {
+ bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * going to be temporarily appending another checksum entry:
+ */
+ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
+ sizeof(struct bch_extent_crc128));
+ if ((ret = PTR_ERR_OR_ZERO(new)))
+ goto out;
+
+ bkey_reassemble(new, k);
+
+ if (!bch2_bkey_narrow_crcs(new, new_crc))
+ goto out;
+
+ ret = bch2_trans_update(trans, &iter, new,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
+{
+ bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
+ __bch2_rbio_narrow_crcs(trans, rbio));
+}
+
+/* Inner part that may run in process context */
+static void __bch2_read_endio(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bch_fs *c = rbio->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+ struct bio *src = &rbio->bio;
+ struct bio *dst = &bch2_rbio_parent(rbio)->bio;
+ struct bvec_iter dst_iter = rbio->bvec_iter;
+ struct bch_extent_crc_unpacked crc = rbio->pick.crc;
+ struct nonce nonce = extent_nonce(rbio->version, crc);
+ unsigned nofs_flags;
+ struct bch_csum csum;
+ int ret;
+
+ nofs_flags = memalloc_nofs_save();
+
+ /* Reset iterator for checksumming and copying bounced data: */
+ if (rbio->bounce) {
+ src->bi_iter.bi_size = crc.compressed_size << 9;
+ src->bi_iter.bi_idx = 0;
+ src->bi_iter.bi_bvec_done = 0;
+ } else {
+ src->bi_iter = rbio->bvec_iter;
+ }
+
+ csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
+ if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
+ goto csum_err;
+
+ /*
+ * XXX
+ * We need to rework the narrow_crcs path to deliver the read completion
+ * first, and then punt to a different workqueue, otherwise we're
+ * holding up reads while doing btree updates which is bad for memory
+ * reclaim.
+ */
+ if (unlikely(rbio->narrow_crcs))
+ bch2_rbio_narrow_crcs(rbio);
+
+ if (rbio->flags & BCH_READ_NODECODE)
+ goto nodecode;
+
+ /* Adjust crc to point to subset of data we want: */
+ crc.offset += rbio->offset_into_extent;
+ crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
+
+ if (crc_is_compressed(crc)) {
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
+
+ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
+ !c->opts.no_data_io)
+ goto decompression_err;
+ } else {
+ /* don't need to decrypt the entire bio: */
+ nonce = nonce_add(nonce, crc.offset << 9);
+ bio_advance(src, crc.offset << 9);
+
+ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
+ src->bi_iter.bi_size = dst_iter.bi_size;
+
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
+
+ if (rbio->bounce) {
+ struct bvec_iter src_iter = src->bi_iter;
+
+ bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+ }
+ }
+
+ if (rbio->promote) {
+ /*
+ * Re-encrypt data we decrypted, so it's consistent with
+ * rbio->crc:
+ */
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
+
+ promote_start(rbio->promote, rbio);
+ rbio->promote = NULL;
+ }
+nodecode:
+ if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
+ rbio = bch2_rbio_free(rbio);
+ bch2_rbio_done(rbio);
+ }
+out:
+ memalloc_nofs_restore(nofs_flags);
+ return;
+csum_err:
+ /*
+ * Checksum error: if the bio wasn't bounced, we may have been
+ * reading into buffers owned by userspace (that userspace can
+ * scribble over) - retry the read, bouncing it this time:
+ */
+ if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
+ rbio->flags |= BCH_READ_MUST_BOUNCE;
+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
+ goto out;
+ }
+
+ bch_err_inum_offset_ratelimited(ca,
+ rbio->read_pos.inode,
+ rbio->read_pos.offset << 9,
+ "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
+ rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
+ csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ goto out;
+decompression_err:
+ bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
+ rbio->read_pos.offset << 9,
+ "decompression error");
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ goto out;
+decrypt_err:
+ bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
+ rbio->read_pos.offset << 9,
+ "decrypt error");
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ goto out;
+}
+
+static void bch2_read_endio(struct bio *bio)
+{
+ struct bch_read_bio *rbio =
+ container_of(bio, struct bch_read_bio, bio);
+ struct bch_fs *c = rbio->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+ struct workqueue_struct *wq = NULL;
+ enum rbio_context context = RBIO_CONTEXT_NULL;
+
+ if (rbio->have_ioref) {
+ bch2_latency_acct(ca, rbio->submit_time, READ);
+ percpu_ref_put(&ca->io_ref);
+ }
+
+ if (!rbio->split)
+ rbio->bio.bi_end_io = rbio->end_io;
+
+ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
+ rbio->read_pos.inode,
+ rbio->read_pos.offset,
+ "data read error: %s",
+ bch2_blk_status_to_str(bio->bi_status))) {
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
+ return;
+ }
+
+ if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
+ ptr_stale(ca, &rbio->pick.ptr)) {
+ trace_and_count(c, read_reuse_race, &rbio->bio);
+
+ if (rbio->flags & BCH_READ_RETRY_IF_STALE)
+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
+ else
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
+ return;
+ }
+
+ if (rbio->narrow_crcs ||
+ rbio->promote ||
+ crc_is_compressed(rbio->pick.crc) ||
+ bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
+ context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
+ else if (rbio->pick.crc.csum_type)
+ context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
+
+ bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
+}
+
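+/*
+ * Resolve a reflink pointer: look up the indirect extent it points to in the
+ * reflink btree and substitute it for the original key, adjusting
+ * offset_into_extent:
+ */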
+int __bch2_read_indirect_extent(struct btree_trans *trans,
+ unsigned *offset_into_extent,
+ struct bkey_buf *orig_k)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 reflink_offset;
+ int ret;
+
+ reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
+ *offset_into_extent;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
+ POS(0, reflink_offset), 0);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_reflink_v &&
+ k.k->type != KEY_TYPE_indirect_inline_data) {
+ bch_err_inum_offset_ratelimited(trans->c,
+ orig_k->k->k.p.inode,
+ orig_k->k->k.p.offset << 9,
+ "%llu len %u points to nonexistent indirect extent %llu",
+ orig_k->k->k.p.offset,
+ orig_k->k->k.size,
+ reflink_offset);
+ bch2_inconsistent_error(trans->c);
+ ret = -EIO;
+ goto err;
+ }
+
+ *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
+ bch2_bkey_buf_reassemble(orig_k, trans->c, k);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
+ struct bkey_s_c k,
+ struct bch_extent_ptr ptr)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
+ struct btree_iter iter;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ PTR_BUCKET_POS(c, &ptr),
+ BTREE_ITER_CACHED);
+
+ prt_printf(&buf, "Attempting to read from stale dirty pointer:");
+ printbuf_indent_add(&buf, 2);
+ prt_newline(&buf);
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
+
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ if (!ret) {
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ }
+
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+}
+
+int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
+ struct bvec_iter iter, struct bpos read_pos,
+ enum btree_id data_btree, struct bkey_s_c k,
+ unsigned offset_into_extent,
+ struct bch_io_failures *failed, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct extent_ptr_decoded pick;
+ struct bch_read_bio *rbio = NULL;
+ struct bch_dev *ca = NULL;
+ struct promote_op *promote = NULL;
+ bool bounce = false, read_full = false, narrow_crcs = false;
+ struct bpos data_pos = bkey_start_pos(k.k);
+ int pick_ret;
+
+ if (bkey_extent_is_inline_data(k.k)) {
+ unsigned bytes = min_t(unsigned, iter.bi_size,
+ bkey_inline_data_bytes(k.k));
+
+ swap(iter.bi_size, bytes);
+ memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
+ swap(iter.bi_size, bytes);
+ bio_advance_iter(&orig->bio, &iter, bytes);
+ zero_fill_bio_iter(&orig->bio, iter);
+ goto out_read_done;
+ }
+retry_pick:
+ pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
+
+ /* hole or reservation - just zero fill: */
+ if (!pick_ret)
+ goto hole;
+
+ if (pick_ret < 0) {
+ bch_err_inum_offset_ratelimited(c,
+ read_pos.inode, read_pos.offset << 9,
+ "no device to read from");
+ goto err;
+ }
+
+ ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+ /*
+ * Stale dirty pointers are treated as IO errors, but @failed isn't
+ * allocated unless we're in the retry path - so if we're not in the
+ * retry path, don't check here; it'll be caught in bch2_read_endio()
+ * and we'll end up in the retry path:
+ */
+ if ((flags & BCH_READ_IN_RETRY) &&
+ !pick.ptr.cached &&
+ unlikely(ptr_stale(ca, &pick.ptr))) {
+ read_from_stale_dirty_pointer(trans, k, pick.ptr);
+ bch2_mark_io_failure(failed, &pick);
+ goto retry_pick;
+ }
+
+ /*
+ * Unlock the iterator while the btree node's lock is still in
+ * cache, before doing the IO:
+ */
+ bch2_trans_unlock(trans);
+
+ if (flags & BCH_READ_NODECODE) {
+ /*
+ * can happen if we retry, and the extent we were going to read
+ * has been merged in the meantime:
+ */
+ if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
+ goto hole;
+
+ iter.bi_size = pick.crc.compressed_size << 9;
+ goto get_bio;
+ }
+
+ if (!(flags & BCH_READ_LAST_FRAGMENT) ||
+ bio_flagged(&orig->bio, BIO_CHAIN))
+ flags |= BCH_READ_MUST_CLONE;
+
+ narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
+ bch2_can_narrow_extent_crcs(k, pick.crc);
+
+ if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
+ flags |= BCH_READ_MUST_BOUNCE;
+
+ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
+
+ if (crc_is_compressed(pick.crc) ||
+ (pick.crc.csum_type != BCH_CSUM_none &&
+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+ (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
+ (flags & BCH_READ_USER_MAPPED)) ||
+ (flags & BCH_READ_MUST_BOUNCE)))) {
+ read_full = true;
+ bounce = true;
+ }
+
+ if (orig->opts.promote_target)
+ promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
+ &rbio, &bounce, &read_full);
+
+ if (!read_full) {
+ EBUG_ON(crc_is_compressed(pick.crc));
+ EBUG_ON(pick.crc.csum_type &&
+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+ bvec_iter_sectors(iter) != pick.crc.live_size ||
+ pick.crc.offset ||
+ offset_into_extent));
+
+ data_pos.offset += offset_into_extent;
+ pick.ptr.offset += pick.crc.offset +
+ offset_into_extent;
+ offset_into_extent = 0;
+ pick.crc.compressed_size = bvec_iter_sectors(iter);
+ pick.crc.uncompressed_size = bvec_iter_sectors(iter);
+ pick.crc.offset = 0;
+ pick.crc.live_size = bvec_iter_sectors(iter);
+ }
+get_bio:
+ if (rbio) {
+ /*
+ * promote already allocated bounce rbio:
+ * promote needs to allocate a bio big enough for uncompressing
+ * data in the write path, but we're not going to use it all
+ * here:
+ */
+ EBUG_ON(rbio->bio.bi_iter.bi_size <
+ pick.crc.compressed_size << 9);
+ rbio->bio.bi_iter.bi_size =
+ pick.crc.compressed_size << 9;
+ } else if (bounce) {
+ unsigned sectors = pick.crc.compressed_size;
+
+ rbio = rbio_init(bio_alloc_bioset(NULL,
+ DIV_ROUND_UP(sectors, PAGE_SECTORS),
+ 0,
+ GFP_NOFS,
+ &c->bio_read_split),
+ orig->opts);
+
+ bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
+ rbio->bounce = true;
+ rbio->split = true;
+ } else if (flags & BCH_READ_MUST_CLONE) {
+ /*
+ * Have to clone if there were any splits, due to error
+ * reporting issues (if a split errored, and retrying didn't
+ * work, when it reports the error to its parent (us) we don't
+ * know if the error was from our bio, and we should retry, or
+ * from the whole bio, in which case we don't want to retry and
+ * lose the error)
+ */
+ rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
+ &c->bio_read_split),
+ orig->opts);
+ rbio->bio.bi_iter = iter;
+ rbio->split = true;
+ } else {
+ rbio = orig;
+ rbio->bio.bi_iter = iter;
+ EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
+ }
+
+ EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
+
+ rbio->c = c;
+ rbio->submit_time = local_clock();
+ if (rbio->split)
+ rbio->parent = orig;
+ else
+ rbio->end_io = orig->bio.bi_end_io;
+ rbio->bvec_iter = iter;
+ rbio->offset_into_extent = offset_into_extent;
+ rbio->flags = flags;
+ rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
+ rbio->narrow_crcs = narrow_crcs;
+ rbio->hole = 0;
+ rbio->retry = 0;
+ rbio->context = 0;
+ /* XXX: only initialize this if needed */
+ rbio->devs_have = bch2_bkey_devs(k);
+ rbio->pick = pick;
+ rbio->subvol = orig->subvol;
+ rbio->read_pos = read_pos;
+ rbio->data_btree = data_btree;
+ rbio->data_pos = data_pos;
+ rbio->version = k.k->version;
+ rbio->promote = promote;
+ INIT_WORK(&rbio->work, NULL);
+
+ rbio->bio.bi_opf = orig->bio.bi_opf;
+ rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
+ rbio->bio.bi_end_io = bch2_read_endio;
+
+ if (rbio->bounce)
+ trace_and_count(c, read_bounce, &rbio->bio);
+
+ this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
+ bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
+
+ /*
+ * If it's being moved internally, we don't want to flag it as a cache
+ * hit:
+ */
+ if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
+ bch2_bucket_io_time_reset(trans, pick.ptr.dev,
+ PTR_BUCKET_NR(ca, &pick.ptr), READ);
+
+ if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
+ bio_inc_remaining(&orig->bio);
+ trace_and_count(c, read_split, &orig->bio);
+ }
+
+ if (!rbio->pick.idx) {
+ if (!rbio->have_ioref) {
+ bch_err_inum_offset_ratelimited(c,
+ read_pos.inode,
+ read_pos.offset << 9,
+ "no device to read from");
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ goto out;
+ }
+
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
+ bio_sectors(&rbio->bio));
+ bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
+
+ if (unlikely(c->opts.no_data_io)) {
+ if (likely(!(flags & BCH_READ_IN_RETRY)))
+ bio_endio(&rbio->bio);
+ } else {
+ if (likely(!(flags & BCH_READ_IN_RETRY)))
+ submit_bio(&rbio->bio);
+ else
+ submit_bio_wait(&rbio->bio);
+ }
+
+ /*
+ * We just submitted IO which may block; we expect relock fail
+ * events and shouldn't count them:
+ */
+ trans->notrace_relock_fail = true;
+ } else {
+ /* Attempting reconstruct read: */
+ if (bch2_ec_read_extent(trans, rbio)) {
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ goto out;
+ }
+
+ if (likely(!(flags & BCH_READ_IN_RETRY)))
+ bio_endio(&rbio->bio);
+ }
+out:
+ if (likely(!(flags & BCH_READ_IN_RETRY))) {
+ return 0;
+ } else {
+ int ret;
+
+ rbio->context = RBIO_CONTEXT_UNBOUND;
+ bch2_read_endio(&rbio->bio);
+
+ ret = rbio->retry;
+ rbio = bch2_rbio_free(rbio);
+
+ if (ret == READ_RETRY_AVOID) {
+ bch2_mark_io_failure(failed, &pick);
+ ret = READ_RETRY;
+ }
+
+ if (!ret)
+ goto out_read_done;
+
+ return ret;
+ }
+
+err:
+ if (flags & BCH_READ_IN_RETRY)
+ return READ_ERR;
+
+ orig->bio.bi_status = BLK_STS_IOERR;
+ goto out_read_done;
+
+hole:
+ /*
+ * won't normally happen in the BCH_READ_NODECODE
+ * (bch2_move_extent()) path, but if we retry and the extent we wanted
+ * to read no longer exists we have to signal that:
+ */
+ if (flags & BCH_READ_NODECODE)
+ orig->hole = true;
+
+ zero_fill_bio_iter(&orig->bio, iter);
+out_read_done:
+ if (flags & BCH_READ_LAST_FRAGMENT)
+ bch2_rbio_done(orig);
+ return 0;
+}
+
+void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter, subvol_inum inum,
+ struct bch_io_failures *failed, unsigned flags)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_buf sk;
+ struct bkey_s_c k;
+ u32 snapshot;
+ int ret;
+
+ BUG_ON(flags & BCH_READ_NODECODE);
+
+ bch2_bkey_buf_init(&sk);
+retry:
+ bch2_trans_begin(trans);
+ iter = (struct btree_iter) { NULL };
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
+ BTREE_ITER_SLOTS);
+ while (1) {
+ unsigned bytes, sectors, offset_into_extent;
+ enum btree_id data_btree = BTREE_ID_extents;
+
+ /*
+ * read_extent -> io_time_reset may cause a transaction restart
+ * without returning an error, so we need to check for that here:
+ */
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ break;
+
+ bch2_btree_iter_set_pos(&iter,
+ POS(inum.inum, bvec_iter.bi_sector));
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ break;
+
+ offset_into_extent = iter.pos.offset -
+ bkey_start_offset(k.k);
+ sectors = k.k->size - offset_into_extent;
+
+ bch2_bkey_buf_reassemble(&sk, c, k);
+
+ ret = bch2_read_indirect_extent(trans, &data_btree,
+ &offset_into_extent, &sk);
+ if (ret)
+ break;
+
+ k = bkey_i_to_s_c(sk.k);
+
+ /*
+ * With indirect extents, the amount of data to read is the min
+ * of the original extent and the indirect extent:
+ */
+ sectors = min(sectors, k.k->size - offset_into_extent);
+
+ bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
+ swap(bvec_iter.bi_size, bytes);
+
+ if (bvec_iter.bi_size == bytes)
+ flags |= BCH_READ_LAST_FRAGMENT;
+
+ ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
+ data_btree, k,
+ offset_into_extent, failed, flags);
+ if (ret)
+ break;
+
+ if (flags & BCH_READ_LAST_FRAGMENT)
+ break;
+
+ swap(bvec_iter.bi_size, bytes);
+ bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
+
+ ret = btree_trans_too_many_iters(trans);
+ if (ret)
+ break;
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ ret == READ_RETRY ||
+ ret == READ_RETRY_AVOID)
+ goto retry;
+
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&sk, c);
+
+ if (ret) {
+ bch_err_inum_offset_ratelimited(c, inum.inum,
+ bvec_iter.bi_sector << 9,
+ "read error %i from btree lookup", ret);
+ rbio->bio.bi_status = BLK_STS_IOERR;
+ bch2_rbio_done(rbio);
+ }
+}
+
+void bch2_fs_io_read_exit(struct bch_fs *c)
+{
+ if (c->promote_table.tbl)
+ rhashtable_destroy(&c->promote_table);
+ bioset_exit(&c->bio_read_split);
+ bioset_exit(&c->bio_read);
+}
+
+int bch2_fs_io_read_init(struct bch_fs *c)
+{
+ if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_bio_read_init;
+
+ if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_bio_read_split_init;
+
+ if (rhashtable_init(&c->promote_table, &bch_promote_params))
+ return -BCH_ERR_ENOMEM_promote_table_init;
+
+ return 0;
+}
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
new file mode 100644
index 000000000000..d9c18bb7d403
--- /dev/null
+++ b/fs/bcachefs/io_read.h
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_READ_H
+#define _BCACHEFS_IO_READ_H
+
+#include "bkey_buf.h"
+
+struct bch_read_bio {
+ struct bch_fs *c;
+ u64 start_time;
+ u64 submit_time;
+
+ /*
+ * Reads will often have to be split, and if the extent being read from
+ * was checksummed or compressed we'll also have to allocate bounce
+ * buffers and copy the data back into the original bio.
+ *
+ * If we didn't have to split, we have to save and restore the original
+ * bi_end_io - @split below indicates which:
+ */
+ union {
+ struct bch_read_bio *parent;
+ bio_end_io_t *end_io;
+ };
+
+ /*
+ * Saved copy of bio->bi_iter, from submission time - allows us to
+ * resubmit on IO error, and also to copy data back to the original bio
+ * when we're bouncing:
+ */
+ struct bvec_iter bvec_iter;
+
+ unsigned offset_into_extent;
+
+ u16 flags;
+ union {
+ struct {
+ u16 bounce:1,
+ split:1,
+ kmalloc:1,
+ have_ioref:1,
+ narrow_crcs:1,
+ hole:1,
+ retry:2,
+ context:2;
+ };
+ u16 _state;
+ };
+
+ struct bch_devs_list devs_have;
+
+ struct extent_ptr_decoded pick;
+
+ /*
+ * pos we read from - different from data_pos for indirect extents:
+ */
+ u32 subvol;
+ struct bpos read_pos;
+
+ /*
+ * start pos of data we read (may not be pos of data we want) - for
+ * promote, narrow extents paths:
+ */
+ enum btree_id data_btree;
+ struct bpos data_pos;
+ struct bversion version;
+
+ struct promote_op *promote;
+
+ struct bch_io_opts opts;
+
+ struct work_struct work;
+
+ struct bio bio;
+};
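+
+/*
+ * The bitfields above share storage with _state so that rbio_init() can
+ * clear every per-read flag with a single store to _state. The same
+ * pattern in miniature - field names here are illustrative only:
+ */
+struct rbio_state_example {
+	union {
+	struct {
+		u16		flag_a:1,
+				flag_b:1,
+				two_bit_counter:2;
+	};
+	u16			_state;
+	};
+};
+
+static inline void rbio_state_example_reset(struct rbio_state_example *s)
+{
+	/* zeroes flag_a, flag_b and two_bit_counter in one store */
+	s->_state = 0;
+}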
+
+#define to_rbio(_bio) container_of((_bio), struct bch_read_bio, bio)
+
+struct bch_devs_mask;
+struct cache_promote_op;
+struct extent_ptr_decoded;
+
+int __bch2_read_indirect_extent(struct btree_trans *, unsigned *,
+ struct bkey_buf *);
+
+static inline int bch2_read_indirect_extent(struct btree_trans *trans,
+ enum btree_id *data_btree,
+ unsigned *offset_into_extent,
+ struct bkey_buf *k)
+{
+ if (k->k->k.type != KEY_TYPE_reflink_p)
+ return 0;
+
+ *data_btree = BTREE_ID_reflink;
+ return __bch2_read_indirect_extent(trans, offset_into_extent, k);
+}
+
+enum bch_read_flags {
+ BCH_READ_RETRY_IF_STALE = 1 << 0,
+ BCH_READ_MAY_PROMOTE = 1 << 1,
+ BCH_READ_USER_MAPPED = 1 << 2,
+ BCH_READ_NODECODE = 1 << 3,
+ BCH_READ_LAST_FRAGMENT = 1 << 4,
+
+ /* internal: */
+ BCH_READ_MUST_BOUNCE = 1 << 5,
+ BCH_READ_MUST_CLONE = 1 << 6,
+ BCH_READ_IN_RETRY = 1 << 7,
+};
+
+int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
+ struct bvec_iter, struct bpos, enum btree_id,
+ struct bkey_s_c, unsigned,
+ struct bch_io_failures *, unsigned);
+
+static inline void bch2_read_extent(struct btree_trans *trans,
+ struct bch_read_bio *rbio, struct bpos read_pos,
+ enum btree_id data_btree, struct bkey_s_c k,
+ unsigned offset_into_extent, unsigned flags)
+{
+ __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
+ data_btree, k, offset_into_extent, NULL, flags);
+}
+
+void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
+ subvol_inum, struct bch_io_failures *, unsigned flags);
+
+static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+ subvol_inum inum)
+{
+ struct bch_io_failures failed = { .nr = 0 };
+
+ BUG_ON(rbio->_state);
+
+ rbio->c = c;
+ rbio->start_time = local_clock();
+ rbio->subvol = inum.subvol;
+
+ __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
+ BCH_READ_RETRY_IF_STALE|
+ BCH_READ_MAY_PROMOTE|
+ BCH_READ_USER_MAPPED);
+}
+
+static inline struct bch_read_bio *rbio_init(struct bio *bio,
+ struct bch_io_opts opts)
+{
+ struct bch_read_bio *rbio = to_rbio(bio);
+
+ rbio->_state = 0;
+ rbio->promote = NULL;
+ rbio->opts = opts;
+ return rbio;
+}
+
+void bch2_fs_io_read_exit(struct bch_fs *);
+int bch2_fs_io_read_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_IO_READ_H */
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
new file mode 100644
index 000000000000..8c8cb1541ac9
--- /dev/null
+++ b/fs/bcachefs/io_write.c
@@ -0,0 +1,1675 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "bset.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "clock.h"
+#include "compress.h"
+#include "debug.h"
+#include "ec.h"
+#include "error.h"
+#include "extent_update.h"
+#include "inode.h"
+#include "io_write.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "nocow_locking.h"
+#include "rebalance.h"
+#include "subvolume.h"
+#include "super.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/blkdev.h>
+#include <linux/prefetch.h>
+#include <linux/random.h>
+#include <linux/sched/mm.h>
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+
+static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
+ u64 now, int rw)
+{
+ u64 latency_capable =
+ ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
+ /* ideally we'd be taking into account the device's variance here: */
+ u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
+ s64 latency_over = io_latency - latency_threshold;
+
+ if (latency_threshold && latency_over > 0) {
+ /*
+ * bump up congested by approximately latency_over * 4 /
+ * latency_threshold - we don't need much accuracy here so don't
+ * bother with the divide:
+ */
+ if (atomic_read(&ca->congested) < CONGESTED_MAX)
+ atomic_add(latency_over >>
+ max_t(int, ilog2(latency_threshold) - 2, 0),
+ &ca->congested);
+
+ ca->congested_last = now;
+ } else if (atomic_read(&ca->congested) > 0) {
+ atomic_dec(&ca->congested);
+ }
+}
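+
+/*
+ * Worked example of the shift above: it approximates
+ * latency_over * 4 / latency_threshold without a divide. With
+ * latency_threshold = 1024 (ilog2() = 10, so shift = 8), an overage of
+ * 2560 bumps congested by 2560 >> 8 = 10, exactly 2560 * 4 / 1024; for
+ * non-power-of-two thresholds the result is within a factor of two of
+ * the exact value, which is all the accuracy this heuristic needs.
+ * Minimal sketch (helper name is illustrative):
+ */
+static inline u64 congested_bump_amount(u64 latency_over, u64 latency_threshold)
+{
+	return latency_over >> max_t(int, ilog2(latency_threshold) - 2, 0);
+}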
+
+void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
+{
+ atomic64_t *latency = &ca->cur_latency[rw];
+ u64 now = local_clock();
+ u64 io_latency = time_after64(now, submit_time)
+ ? now - submit_time
+ : 0;
+ u64 old, new, v = atomic64_read(latency);
+
+ do {
+ old = v;
+
+ /*
+ * If the io latency was reasonably close to the current
+ * latency, skip doing the update and atomic operation - most of
+ * the time:
+ */
+ if (abs((int) (old - io_latency)) < (old >> 1) &&
+ now & ~(~0U << 5))
+ break;
+
+ new = ewma_add(old, io_latency, 5);
+ } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
+
+ bch2_congested_acct(ca, io_latency, now, rw);
+
+ __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
+}
+
+#endif
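+
+/*
+ * The latency estimate above is a shift-weighted moving average: with
+ * weight 5, each new sample contributes roughly 1/32 of the estimate.
+ * Assuming ewma_add() follows the usual shift-based form, the update is
+ * equivalent to this self-contained sketch (helper name is illustrative):
+ */
+static inline u64 ewma_shift_update(u64 old, u64 sample, unsigned weight)
+{
+	/* new is approximately old + (sample - old) / 2^weight */
+	return ((old << weight) - old + sample) >> weight;
+}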
+
+/* Allocate, free from mempool: */
+
+void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
+{
+ struct bvec_iter_all iter;
+ struct bio_vec *bv;
+
+ bio_for_each_segment_all(bv, bio, iter)
+ if (bv->bv_page != ZERO_PAGE(0))
+ mempool_free(bv->bv_page, &c->bio_bounce_pages);
+ bio->bi_vcnt = 0;
+}
+
+static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
+{
+ struct page *page;
+
+ if (likely(!*using_mempool)) {
+ page = alloc_page(GFP_NOFS);
+ if (unlikely(!page)) {
+ mutex_lock(&c->bio_bounce_pages_lock);
+ *using_mempool = true;
+ goto pool_alloc;
+
+ }
+ } else {
+pool_alloc:
+ page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
+ }
+
+ return page;
+}
+
+void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
+ size_t size)
+{
+ bool using_mempool = false;
+
+ while (size) {
+ struct page *page = __bio_alloc_page_pool(c, &using_mempool);
+ unsigned len = min_t(size_t, PAGE_SIZE, size);
+
+ BUG_ON(!bio_add_page(bio, page, len, 0));
+ size -= len;
+ }
+
+ if (using_mempool)
+ mutex_unlock(&c->bio_bounce_pages_lock);
+}
+
+/* Extent update path: */
+
+int bch2_sum_sector_overwrites(struct btree_trans *trans,
+ struct btree_iter *extent_iter,
+ struct bkey_i *new,
+ bool *usage_increasing,
+ s64 *i_sectors_delta,
+ s64 *disk_sectors_delta)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c old;
+ unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
+ bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
+ int ret = 0;
+
+ *usage_increasing = false;
+ *i_sectors_delta = 0;
+ *disk_sectors_delta = 0;
+
+ bch2_trans_copy_iter(&iter, extent_iter);
+
+ for_each_btree_key_upto_continue_norestart(iter,
+ new->k.p, BTREE_ITER_SLOTS, old, ret) {
+ s64 sectors = min(new->k.p.offset, old.k->p.offset) -
+ max(bkey_start_offset(&new->k),
+ bkey_start_offset(old.k));
+
+ *i_sectors_delta += sectors *
+ (bkey_extent_is_allocation(&new->k) -
+ bkey_extent_is_allocation(old.k));
+
+ *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
+ *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
+ ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
+ : 0;
+
+ if (!*usage_increasing &&
+ (new->k.p.snapshot != old.k->p.snapshot ||
+ new_replicas > bch2_bkey_replicas(c, old) ||
+ (!new_compressed && bch2_bkey_sectors_compressed(old))))
+ *usage_increasing = true;
+
+ if (bkey_ge(old.k->p, new->k.p))
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
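+
+/*
+ * Worked example of the per-overlap accounting above, with illustrative
+ * numbers: a new 2-replica extent covering sectors [0, 8) overwrites an
+ * old 1-replica allocated extent covering [4, 12) in the same snapshot.
+ * The overlap is min(8, 12) - max(0, 4) = 4 sectors, so i_sectors_delta
+ * changes by 4 * (1 - 1) = 0 (the data was already allocated) and
+ * disk_sectors_delta changes by 4 * 2 - 4 * 1 = 4 (one extra replica on
+ * disk). Minimal sketch of the disk-sectors step for a single overlap,
+ * treating replica counts loosely (helper name is hypothetical):
+ */
+static inline s64 overlap_disk_sectors_delta(u64 new_start, u64 new_end,
+					      u64 old_start, u64 old_end,
+					      unsigned new_replicas,
+					      unsigned old_replicas,
+					      bool same_snapshot)
+{
+	s64 sectors = min(new_end, old_end) - max(new_start, old_start);
+
+	return sectors * new_replicas -
+		(same_snapshot ? sectors * old_replicas : 0);
+}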
+
+static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
+ struct btree_iter *extent_iter,
+ u64 new_i_size,
+ s64 i_sectors_delta)
+{
+ struct btree_iter iter;
+ struct bkey_i *k;
+ struct bkey_i_inode_v3 *inode;
+ /*
+ * Crazy performance optimization:
+ * Every extent update needs to also update the inode: the inode trigger
+ * will set bi->journal_seq to the journal sequence number of this
+ * transaction - for fsync.
+ *
+ * But if that's the only reason we're updating the inode (we're not
+ * updating bi_size or bi_sectors), then we don't need the inode update
+ * to be journalled - if we crash, the bi_journal_seq update will be
+ * lost, but that's fine.
+ */
+ unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
+ int ret;
+
+ k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes,
+ SPOS(0,
+ extent_iter->pos.inode,
+ extent_iter->snapshot),
+ BTREE_ITER_CACHED);
+ ret = PTR_ERR_OR_ZERO(k);
+ if (unlikely(ret))
+ return ret;
+
+ if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
+ k = bch2_inode_to_v3(trans, k);
+ ret = PTR_ERR_OR_ZERO(k);
+ if (unlikely(ret))
+ goto err;
+ }
+
+ inode = bkey_i_to_inode_v3(k);
+
+ if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
+ new_i_size > le64_to_cpu(inode->v.bi_size)) {
+ inode->v.bi_size = cpu_to_le64(new_i_size);
+ inode_update_flags = 0;
+ }
+
+ if (i_sectors_delta) {
+ le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
+ inode_update_flags = 0;
+ }
+
+ if (inode->k.p.snapshot != iter.snapshot) {
+ inode->k.p.snapshot = iter.snapshot;
+ inode_update_flags = 0;
+ }
+
+ ret = bch2_trans_update(trans, &iter, &inode->k_i,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ inode_update_flags);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_extent_update(struct btree_trans *trans,
+ subvol_inum inum,
+ struct btree_iter *iter,
+ struct bkey_i *k,
+ struct disk_reservation *disk_res,
+ u64 new_i_size,
+ s64 *i_sectors_delta_total,
+ bool check_enospc)
+{
+ struct bpos next_pos;
+ bool usage_increasing;
+ s64 i_sectors_delta = 0, disk_sectors_delta = 0;
+ int ret;
+
+ /*
+ * This traverses the iterator without changing iter->path->pos to
+ * search_key() (which is pos + 1 for extents): we want there to be a
+ * path already traversed at iter->pos because
+ * bch2_trans_extent_update() will use it to attempt extent merging
+ */
+ ret = __bch2_btree_iter_traverse(iter);
+ if (ret)
+ return ret;
+
+ ret = bch2_extent_trim_atomic(trans, iter, k);
+ if (ret)
+ return ret;
+
+ next_pos = k->k.p;
+
+ ret = bch2_sum_sector_overwrites(trans, iter, k,
+ &usage_increasing,
+ &i_sectors_delta,
+ &disk_sectors_delta);
+ if (ret)
+ return ret;
+
+ if (disk_res &&
+ disk_sectors_delta > (s64) disk_res->sectors) {
+ ret = bch2_disk_reservation_add(trans->c, disk_res,
+ disk_sectors_delta - disk_res->sectors,
+ !check_enospc || !usage_increasing
+ ? BCH_DISK_RESERVATION_NOFAIL : 0);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Note:
+ * We always have to do an inode update - even when i_size/i_sectors
+ * aren't changing - for fsync to work properly; fsync relies on
+ * inode->bi_journal_seq which is updated by the trigger code:
+ */
+ ret = bch2_extent_update_i_size_sectors(trans, iter,
+ min(k->k.p.offset << 9, new_i_size),
+ i_sectors_delta) ?:
+ bch2_trans_update(trans, iter, k, 0) ?:
+ bch2_trans_commit(trans, disk_res, NULL,
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL);
+ if (unlikely(ret))
+ return ret;
+
+ if (i_sectors_delta_total)
+ *i_sectors_delta_total += i_sectors_delta;
+ bch2_btree_iter_set_pos(iter, next_pos);
+ return 0;
+}
+
+static int bch2_write_index_default(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct bkey_buf sk;
+ struct keylist *keys = &op->insert_keys;
+ struct bkey_i *k = bch2_keylist_front(keys);
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ subvol_inum inum = {
+ .subvol = op->subvol,
+ .inum = k->k.p.inode,
+ };
+ int ret;
+
+ BUG_ON(!inum.subvol);
+
+ bch2_bkey_buf_init(&sk);
+
+ do {
+ bch2_trans_begin(trans);
+
+ k = bch2_keylist_front(keys);
+ bch2_bkey_buf_copy(&sk, c, k);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol,
+ &sk.k->k.p.snapshot);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ bkey_start_pos(&sk.k->k),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+ ret = bch2_bkey_set_needs_rebalance(c, sk.k,
+ op->opts.background_target,
+ op->opts.background_compression) ?:
+ bch2_extent_update(trans, inum, &iter, sk.k,
+ &op->res,
+ op->new_i_size, &op->i_sectors_delta,
+ op->flags & BCH_WRITE_CHECK_ENOSPC);
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+
+ if (bkey_ge(iter.pos, k->k.p))
+ bch2_keylist_pop_front(&op->insert_keys);
+ else
+ bch2_cut_front(iter.pos, k);
+ } while (!bch2_keylist_empty(keys));
+
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&sk, c);
+
+ return ret;
+}
+
+/* Writes */
+
+void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
+ enum bch_data_type type,
+ const struct bkey_i *k,
+ bool nocow)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
+ const struct bch_extent_ptr *ptr;
+ struct bch_write_bio *n;
+ struct bch_dev *ca;
+
+ BUG_ON(c->opts.nochanges);
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
+ !c->devs[ptr->dev]);
+
+ ca = bch_dev_bkey_exists(c, ptr->dev);
+
+ if (to_entry(ptr + 1) < ptrs.end) {
+ n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
+ GFP_NOFS, &ca->replica_set));
+
+ n->bio.bi_end_io = wbio->bio.bi_end_io;
+ n->bio.bi_private = wbio->bio.bi_private;
+ n->parent = wbio;
+ n->split = true;
+ n->bounce = false;
+ n->put_bio = true;
+ n->bio.bi_opf = wbio->bio.bi_opf;
+ bio_inc_remaining(&wbio->bio);
+ } else {
+ n = wbio;
+ n->split = false;
+ }
+
+ n->c = c;
+ n->dev = ptr->dev;
+ n->have_ioref = nocow || bch2_dev_get_ioref(ca,
+ type == BCH_DATA_btree ? READ : WRITE);
+ n->nocow = nocow;
+ n->submit_time = local_clock();
+ n->inode_offset = bkey_start_offset(&k->k);
+ n->bio.bi_iter.bi_sector = ptr->offset;
+
+ if (likely(n->have_ioref)) {
+ this_cpu_add(ca->io_done->sectors[WRITE][type],
+ bio_sectors(&n->bio));
+
+ bio_set_dev(&n->bio, ca->disk_sb.bdev);
+
+ if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) {
+ bio_endio(&n->bio);
+ continue;
+ }
+
+ submit_bio(&n->bio);
+ } else {
+ n->bio.bi_status = BLK_STS_REMOVED;
+ bio_endio(&n->bio);
+ }
+ }
+}
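+
+/*
+ * The replica fan-out above clones the bio once per pointer except the
+ * last, which reuses the original; bio_inc_remaining() keeps the parent's
+ * completion pending until every clone's endio has run. The same shape in
+ * miniature - submit_one() is a stand-in for the per-device submission:
+ */
+static inline void replica_fan_out_sketch(struct bio *orig, unsigned nr_targets,
+					  struct bio_set *bs,
+					  void (*submit_one)(struct bio *, unsigned))
+{
+	unsigned i;
+
+	for (i = 0; i < nr_targets; i++) {
+		struct bio *b = orig;
+
+		if (i + 1 < nr_targets) {
+			b = bio_alloc_clone(NULL, orig, GFP_NOFS, bs);
+			b->bi_end_io	= orig->bi_end_io;
+			b->bi_private	= orig->bi_private;
+			bio_inc_remaining(orig);
+		}
+
+		submit_one(b, i);
+	}
+}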
+
+static void __bch2_write(struct bch_write_op *);
+
+static void bch2_write_done(struct closure *cl)
+{
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct bch_fs *c = op->c;
+
+ EBUG_ON(op->open_buckets.nr);
+
+ bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
+ bch2_disk_reservation_put(c, &op->res);
+
+ if (!(op->flags & BCH_WRITE_MOVE))
+ bch2_write_ref_put(c, BCH_WRITE_REF_write);
+ bch2_keylist_free(&op->insert_keys, op->inline_keys);
+
+ EBUG_ON(cl->parent);
+ closure_debug_destroy(cl);
+ if (op->end_io)
+ op->end_io(op);
+}
+
+static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
+{
+ struct keylist *keys = &op->insert_keys;
+ struct bch_extent_ptr *ptr;
+ struct bkey_i *src, *dst = keys->keys, *n;
+
+ for (src = keys->keys; src != keys->top; src = n) {
+ n = bkey_next(src);
+
+ if (bkey_extent_is_direct_data(&src->k)) {
+ bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
+ test_bit(ptr->dev, op->failed.d));
+
+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
+ return -EIO;
+ }
+
+ if (dst != src)
+ memmove_u64s_down(dst, src, src->k.u64s);
+ dst = bkey_next(dst);
+ }
+
+ keys->top = dst;
+ return 0;
+}
+
+/**
+ * __bch2_write_index - after a write, update index to point to new data
+ * @op: bch_write_op to process
+ */
+static void __bch2_write_index(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct keylist *keys = &op->insert_keys;
+ unsigned dev;
+ int ret = 0;
+
+ if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
+ ret = bch2_write_drop_io_error_ptrs(op);
+ if (ret)
+ goto err;
+ }
+
+ if (!bch2_keylist_empty(keys)) {
+ u64 sectors_start = keylist_sectors(keys);
+
+ ret = !(op->flags & BCH_WRITE_MOVE)
+ ? bch2_write_index_default(op)
+ : bch2_data_update_index_update(op);
+
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
+ BUG_ON(keylist_sectors(keys) && !ret);
+
+ op->written += sectors_start - keylist_sectors(keys);
+
+ if (ret && !bch2_err_matches(ret, EROFS)) {
+ struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
+
+ bch_err_inum_offset_ratelimited(c,
+ insert->k.p.inode, insert->k.p.offset << 9,
+ "write error while doing btree update: %s",
+ bch2_err_str(ret));
+ }
+
+ if (ret)
+ goto err;
+ }
+out:
+ /* If a bucket wasn't written, we can't erasure code it: */
+ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
+ bch2_open_bucket_write_error(c, &op->open_buckets, dev);
+
+ bch2_open_buckets_put(c, &op->open_buckets);
+ return;
+err:
+ keys->top = keys->keys;
+ op->error = ret;
+ op->flags |= BCH_WRITE_DONE;
+ goto out;
+}
+
+static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
+{
+ if (state != wp->state) {
+ u64 now = ktime_get_ns();
+
+ if (wp->last_state_change &&
+ time_after64(now, wp->last_state_change))
+ wp->time[wp->state] += now - wp->last_state_change;
+ wp->state = state;
+ wp->last_state_change = now;
+ }
+}
+
+static inline void wp_update_state(struct write_point *wp, bool running)
+{
+ enum write_point_state state;
+
+ state = running ? WRITE_POINT_running :
+ !list_empty(&wp->writes) ? WRITE_POINT_waiting_io
+ : WRITE_POINT_stopped;
+
+ __wp_update_state(wp, state);
+}
+
+static CLOSURE_CALLBACK(bch2_write_index)
+{
+ closure_type(op, struct bch_write_op, cl);
+ struct write_point *wp = op->wp;
+ struct workqueue_struct *wq = index_update_wq(op);
+ unsigned long flags;
+
+ if ((op->flags & BCH_WRITE_DONE) &&
+ (op->flags & BCH_WRITE_MOVE))
+ bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
+
+ spin_lock_irqsave(&wp->writes_lock, flags);
+ if (wp->state == WRITE_POINT_waiting_io)
+ __wp_update_state(wp, WRITE_POINT_waiting_work);
+ list_add_tail(&op->wp_list, &wp->writes);
+ spin_unlock_irqrestore(&wp->writes_lock, flags);
+
+ queue_work(wq, &wp->index_update_work);
+}
+
+static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
+{
+ op->wp = wp;
+
+ if (wp->state == WRITE_POINT_stopped) {
+ spin_lock_irq(&wp->writes_lock);
+ __wp_update_state(wp, WRITE_POINT_waiting_io);
+ spin_unlock_irq(&wp->writes_lock);
+ }
+}
+
+void bch2_write_point_do_index_updates(struct work_struct *work)
+{
+ struct write_point *wp =
+ container_of(work, struct write_point, index_update_work);
+ struct bch_write_op *op;
+
+ while (1) {
+ spin_lock_irq(&wp->writes_lock);
+ op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
+ if (op)
+ list_del(&op->wp_list);
+ wp_update_state(wp, op != NULL);
+ spin_unlock_irq(&wp->writes_lock);
+
+ if (!op)
+ break;
+
+ op->flags |= BCH_WRITE_IN_WORKER;
+
+ __bch2_write_index(op);
+
+ if (!(op->flags & BCH_WRITE_DONE))
+ __bch2_write(op);
+ else
+ bch2_write_done(&op->cl);
+ }
+}
+
+static void bch2_write_endio(struct bio *bio)
+{
+ struct closure *cl = bio->bi_private;
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct bch_write_bio *wbio = to_wbio(bio);
+ struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
+ struct bch_fs *c = wbio->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
+
+ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ op->pos.inode,
+ wbio->inode_offset << 9,
+ "data write error: %s",
+ bch2_blk_status_to_str(bio->bi_status))) {
+ set_bit(wbio->dev, op->failed.d);
+ op->flags |= BCH_WRITE_IO_ERROR;
+ }
+
+ if (wbio->nocow)
+ set_bit(wbio->dev, op->devs_need_flush->d);
+
+ if (wbio->have_ioref) {
+ bch2_latency_acct(ca, wbio->submit_time, WRITE);
+ percpu_ref_put(&ca->io_ref);
+ }
+
+ if (wbio->bounce)
+ bch2_bio_free_pages_pool(c, bio);
+
+ if (wbio->put_bio)
+ bio_put(bio);
+
+ if (parent)
+ bio_endio(&parent->bio);
+ else
+ closure_put(cl);
+}
+
+static void init_append_extent(struct bch_write_op *op,
+ struct write_point *wp,
+ struct bversion version,
+ struct bch_extent_crc_unpacked crc)
+{
+ struct bkey_i_extent *e;
+
+ op->pos.offset += crc.uncompressed_size;
+
+ e = bkey_extent_init(op->insert_keys.top);
+ e->k.p = op->pos;
+ e->k.size = crc.uncompressed_size;
+ e->k.version = version;
+
+ if (crc.csum_type ||
+ crc.compression_type ||
+ crc.nonce)
+ bch2_extent_crc_append(&e->k_i, crc);
+
+ bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
+ op->flags & BCH_WRITE_CACHED);
+
+ bch2_keylist_push(&op->insert_keys);
+}
+
+static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
+ struct write_point *wp,
+ struct bio *src,
+ bool *page_alloc_failed,
+ void *buf)
+{
+ struct bch_write_bio *wbio;
+ struct bio *bio;
+ unsigned output_available =
+ min(wp->sectors_free << 9, src->bi_iter.bi_size);
+ unsigned pages = DIV_ROUND_UP(output_available +
+ (buf
+ ? ((unsigned long) buf & (PAGE_SIZE - 1))
+ : 0), PAGE_SIZE);
+
+ pages = min(pages, BIO_MAX_VECS);
+
+ bio = bio_alloc_bioset(NULL, pages, 0,
+ GFP_NOFS, &c->bio_write);
+ wbio = wbio_init(bio);
+ wbio->put_bio = true;
+ /* copy WRITE_SYNC flag */
+ wbio->bio.bi_opf = src->bi_opf;
+
+ if (buf) {
+ bch2_bio_map(bio, buf, output_available);
+ return bio;
+ }
+
+ wbio->bounce = true;
+
+ /*
+ * We can't use mempool for more than c->sb.encoded_extent_max
+ * worth of pages, but we'd like to allocate more if we can:
+ */
+ bch2_bio_alloc_pages_pool(c, bio,
+ min_t(unsigned, output_available,
+ c->opts.encoded_extent_max));
+
+ if (bio->bi_iter.bi_size < output_available)
+ *page_alloc_failed =
+ bch2_bio_alloc_pages(bio,
+ output_available -
+ bio->bi_iter.bi_size,
+ GFP_NOFS) != 0;
+
+ return bio;
+}
+
+static int bch2_write_rechecksum(struct bch_fs *c,
+ struct bch_write_op *op,
+ unsigned new_csum_type)
+{
+ struct bio *bio = &op->wbio.bio;
+ struct bch_extent_crc_unpacked new_crc;
+ int ret;
+
+ /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
+
+ if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
+ bch2_csum_type_is_encryption(new_csum_type))
+ new_csum_type = op->crc.csum_type;
+
+ ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
+ NULL, &new_crc,
+ op->crc.offset, op->crc.live_size,
+ new_csum_type);
+ if (ret)
+ return ret;
+
+ bio_advance(bio, op->crc.offset << 9);
+ bio->bi_iter.bi_size = op->crc.live_size << 9;
+ op->crc = new_crc;
+ return 0;
+}
+
+static int bch2_write_decrypt(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct nonce nonce = extent_nonce(op->version, op->crc);
+ struct bch_csum csum;
+ int ret;
+
+ if (!bch2_csum_type_is_encryption(op->crc.csum_type))
+ return 0;
+
+ /*
+ * If we need to decrypt data in the write path, we'll no longer be able
+ * to verify the existing checksum (poly1305 mac, in this case) after
+ * it's decrypted - this is the last point we'll be able to reverify the
+ * checksum:
+ */
+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+ if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
+ return -EIO;
+
+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+ op->crc.csum_type = 0;
+ op->crc.csum = (struct bch_csum) { 0, 0 };
+ return ret;
+}
+
+static enum prep_encoded_ret {
+ PREP_ENCODED_OK,
+ PREP_ENCODED_ERR,
+ PREP_ENCODED_CHECKSUM_ERR,
+ PREP_ENCODED_DO_WRITE,
+} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
+{
+ struct bch_fs *c = op->c;
+ struct bio *bio = &op->wbio.bio;
+
+ if (!(op->flags & BCH_WRITE_DATA_ENCODED))
+ return PREP_ENCODED_OK;
+
+ BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
+
+ /* Can we just write the entire extent as is? */
+ if (op->crc.uncompressed_size == op->crc.live_size &&
+ op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 &&
+ op->crc.compressed_size <= wp->sectors_free &&
+ (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
+ op->incompressible)) {
+ if (!crc_is_compressed(op->crc) &&
+ op->csum_type != op->crc.csum_type &&
+ bch2_write_rechecksum(c, op, op->csum_type) &&
+ !c->opts.no_data_io)
+ return PREP_ENCODED_CHECKSUM_ERR;
+
+ return PREP_ENCODED_DO_WRITE;
+ }
+
+ /*
+ * If the data is compressed and we couldn't write the entire extent as
+ * is, we have to decompress it:
+ */
+ if (crc_is_compressed(op->crc)) {
+ struct bch_csum csum;
+
+ if (bch2_write_decrypt(op))
+ return PREP_ENCODED_CHECKSUM_ERR;
+
+ /* Last point we can still verify checksum: */
+ csum = bch2_checksum_bio(c, op->crc.csum_type,
+ extent_nonce(op->version, op->crc),
+ bio);
+ if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
+ return PREP_ENCODED_CHECKSUM_ERR;
+
+ if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
+ return PREP_ENCODED_ERR;
+ }
+
+ /*
+ * No longer have compressed data after this point - data might be
+ * encrypted:
+ */
+
+ /*
+ * If the data is checksummed and we're only writing a subset,
+ * rechecksum and adjust bio to point to currently live data:
+ */
+ if ((op->crc.live_size != op->crc.uncompressed_size ||
+ op->crc.csum_type != op->csum_type) &&
+ bch2_write_rechecksum(c, op, op->csum_type) &&
+ !c->opts.no_data_io)
+ return PREP_ENCODED_CHECKSUM_ERR;
+
+ /*
+ * If we want to compress the data, it has to be decrypted:
+ */
+ if ((op->compression_opt ||
+ bch2_csum_type_is_encryption(op->crc.csum_type) !=
+ bch2_csum_type_is_encryption(op->csum_type)) &&
+ bch2_write_decrypt(op))
+ return PREP_ENCODED_CHECKSUM_ERR;
+
+ return PREP_ENCODED_OK;
+}
+
+static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
+ struct bio **_dst)
+{
+ struct bch_fs *c = op->c;
+ struct bio *src = &op->wbio.bio, *dst = src;
+ struct bvec_iter saved_iter;
+ void *ec_buf;
+ unsigned total_output = 0, total_input = 0;
+ bool bounce = false;
+ bool page_alloc_failed = false;
+ int ret, more = 0;
+
+ BUG_ON(!bio_sectors(src));
+
+ ec_buf = bch2_writepoint_ec_buf(c, wp);
+
+ switch (bch2_write_prep_encoded_data(op, wp)) {
+ case PREP_ENCODED_OK:
+ break;
+ case PREP_ENCODED_ERR:
+ ret = -EIO;
+ goto err;
+ case PREP_ENCODED_CHECKSUM_ERR:
+ goto csum_err;
+ case PREP_ENCODED_DO_WRITE:
+ /* XXX look for bug here */
+ if (ec_buf) {
+ dst = bch2_write_bio_alloc(c, wp, src,
+ &page_alloc_failed,
+ ec_buf);
+ bio_copy_data(dst, src);
+ bounce = true;
+ }
+ init_append_extent(op, wp, op->version, op->crc);
+ goto do_write;
+ }
+
+ if (ec_buf ||
+ op->compression_opt ||
+ (op->csum_type &&
+ !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
+ (bch2_csum_type_is_encryption(op->csum_type) &&
+ !(op->flags & BCH_WRITE_PAGES_OWNED))) {
+ dst = bch2_write_bio_alloc(c, wp, src,
+ &page_alloc_failed,
+ ec_buf);
+ bounce = true;
+ }
+
+ saved_iter = dst->bi_iter;
+
+ do {
+ struct bch_extent_crc_unpacked crc = { 0 };
+ struct bversion version = op->version;
+ size_t dst_len = 0, src_len = 0;
+
+ if (page_alloc_failed &&
+ dst->bi_iter.bi_size < (wp->sectors_free << 9) &&
+ dst->bi_iter.bi_size < c->opts.encoded_extent_max)
+ break;
+
+ BUG_ON(op->compression_opt &&
+ (op->flags & BCH_WRITE_DATA_ENCODED) &&
+ bch2_csum_type_is_encryption(op->crc.csum_type));
+ BUG_ON(op->compression_opt && !bounce);
+
+ crc.compression_type = op->incompressible
+ ? BCH_COMPRESSION_TYPE_incompressible
+ : op->compression_opt
+ ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
+ op->compression_opt)
+ : 0;
+ if (!crc_is_compressed(crc)) {
+ dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+ dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
+
+ if (op->csum_type)
+ dst_len = min_t(unsigned, dst_len,
+ c->opts.encoded_extent_max);
+
+ if (bounce) {
+ swap(dst->bi_iter.bi_size, dst_len);
+ bio_copy_data(dst, src);
+ swap(dst->bi_iter.bi_size, dst_len);
+ }
+
+ src_len = dst_len;
+ }
+
+ BUG_ON(!src_len || !dst_len);
+
+ if (bch2_csum_type_is_encryption(op->csum_type)) {
+ if (bversion_zero(version)) {
+ version.lo = atomic64_inc_return(&c->key_version);
+ } else {
+ crc.nonce = op->nonce;
+ op->nonce += src_len >> 9;
+ }
+ }
+
+ if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+ !crc_is_compressed(crc) &&
+ bch2_csum_type_is_encryption(op->crc.csum_type) ==
+ bch2_csum_type_is_encryption(op->csum_type)) {
+ u8 compression_type = crc.compression_type;
+ u16 nonce = crc.nonce;
+ /*
+ * Note: when we're using rechecksum(), we need to be
+ * checksumming @src because it has all the data our
+ * existing checksum covers - if we bounced (because we
+ * were trying to compress), @dst will only have the
+ * part of the data the new checksum will cover.
+ *
+ * But normally we want to be checksumming post bounce,
+ * because part of the reason for bouncing is so the
+ * data can't be modified (by userspace) while it's in
+ * flight.
+ */
+ if (bch2_rechecksum_bio(c, src, version, op->crc,
+ &crc, &op->crc,
+ src_len >> 9,
+ bio_sectors(src) - (src_len >> 9),
+ op->csum_type))
+ goto csum_err;
+ /*
+ * bch2_rechecksum_bio() sets compression_type on crc from op->crc;
+ * this isn't always correct, as sometimes we're changing an extent
+ * from uncompressed to incompressible.
+ */
+ crc.compression_type = compression_type;
+ crc.nonce = nonce;
+ } else {
+ if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+ bch2_rechecksum_bio(c, src, version, op->crc,
+ NULL, &op->crc,
+ src_len >> 9,
+ bio_sectors(src) - (src_len >> 9),
+ op->crc.csum_type))
+ goto csum_err;
+
+ crc.compressed_size = dst_len >> 9;
+ crc.uncompressed_size = src_len >> 9;
+ crc.live_size = src_len >> 9;
+
+ swap(dst->bi_iter.bi_size, dst_len);
+ ret = bch2_encrypt_bio(c, op->csum_type,
+ extent_nonce(version, crc), dst);
+ if (ret)
+ goto err;
+
+ crc.csum = bch2_checksum_bio(c, op->csum_type,
+ extent_nonce(version, crc), dst);
+ crc.csum_type = op->csum_type;
+ swap(dst->bi_iter.bi_size, dst_len);
+ }
+
+ init_append_extent(op, wp, version, crc);
+
+ if (dst != src)
+ bio_advance(dst, dst_len);
+ bio_advance(src, src_len);
+ total_output += dst_len;
+ total_input += src_len;
+ } while (dst->bi_iter.bi_size &&
+ src->bi_iter.bi_size &&
+ wp->sectors_free &&
+ !bch2_keylist_realloc(&op->insert_keys,
+ op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ BKEY_EXTENT_U64s_MAX));
+
+ more = src->bi_iter.bi_size != 0;
+
+ dst->bi_iter = saved_iter;
+
+ if (dst == src && more) {
+ BUG_ON(total_output != total_input);
+
+ dst = bio_split(src, total_input >> 9,
+ GFP_NOFS, &c->bio_write);
+ wbio_init(dst)->put_bio = true;
+ /* copy WRITE_SYNC flag */
+ dst->bi_opf = src->bi_opf;
+ }
+
+ dst->bi_iter.bi_size = total_output;
+do_write:
+ *_dst = dst;
+ return more;
+csum_err:
+ bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
+ ret = -EIO;
+err:
+ if (to_wbio(dst)->bounce)
+ bch2_bio_free_pages_pool(c, dst);
+ if (to_wbio(dst)->put_bio)
+ bio_put(dst);
+
+ return ret;
+}
+
+static bool bch2_extent_is_writeable(struct bch_write_op *op,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = op->c;
+ struct bkey_s_c_extent e;
+ struct extent_ptr_decoded p;
+ const union bch_extent_entry *entry;
+ unsigned replicas = 0;
+
+ if (k.k->type != KEY_TYPE_extent)
+ return false;
+
+ e = bkey_s_c_to_extent(k);
+ extent_for_each_ptr_decode(e, p, entry) {
+ if (crc_is_encoded(p.crc) || p.has_ec)
+ return false;
+
+ replicas += bch2_extent_ptr_durability(c, &p);
+ }
+
+ return replicas >= op->opts.data_replicas;
+}
+
+static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ const struct bch_extent_ptr *ptr;
+ struct bkey_i *k;
+
+ for_each_keylist_key(&op->insert_keys, k) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
+
+ bkey_for_each_ptr(ptrs, ptr)
+ bch2_bucket_nocow_unlock(&c->nocow_locks,
+ PTR_BUCKET_POS(c, ptr),
+ BUCKET_NOCOW_LOCK_UPDATE);
+ }
+}
+
+static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *orig,
+ struct bkey_s_c k,
+ u64 new_i_size)
+{
+ struct bkey_i *new;
+ struct bkey_ptrs ptrs;
+ struct bch_extent_ptr *ptr;
+ int ret;
+
+ if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
+ /* trace this */
+ return 0;
+ }
+
+ new = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ bch2_cut_front(bkey_start_pos(&orig->k), new);
+ bch2_cut_back(orig->k.p, new);
+
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_for_each_ptr(ptrs, ptr)
+ ptr->unwritten = 0;
+
+ /*
+ * Note that we're not calling bch2_subvol_get_snapshot() in this path -
+ * that was done when we kicked off the write, and here it's important
+ * that we update the extent that we wrote to - even if a snapshot has
+ * since been created. The write is still outstanding, so we're ok
+ * w.r.t. snapshot atomicity:
+ */
+ return bch2_extent_update_i_size_sectors(trans, iter,
+ min(new->k.p.offset << 9, new_i_size), 0) ?:
+ bch2_trans_update(trans, iter, new,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+}
+
+static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_i *orig;
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_keylist_key(&op->insert_keys, orig) {
+ ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
+ bkey_start_pos(&orig->k), orig->k.p,
+ BTREE_ITER_INTENT, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL, ({
+ bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
+ }));
+
+ if (ret && !bch2_err_matches(ret, EROFS)) {
+ struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
+
+ bch_err_inum_offset_ratelimited(c,
+ insert->k.p.inode, insert->k.p.offset << 9,
+ "write error while doing btree update: %s",
+ bch2_err_str(ret));
+ }
+
+ if (ret) {
+ op->error = ret;
+ break;
+ }
+ }
+
+ bch2_trans_put(trans);
+}
+
+static void __bch2_nocow_write_done(struct bch_write_op *op)
+{
+ bch2_nocow_write_unlock(op);
+
+ if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
+ op->error = -EIO;
+ } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
+ bch2_nocow_write_convert_unwritten(op);
+}
+
+static CLOSURE_CALLBACK(bch2_nocow_write_done)
+{
+ closure_type(op, struct bch_write_op, cl);
+
+ __bch2_nocow_write_done(op);
+ bch2_write_done(cl);
+}
+
+struct bucket_to_lock {
+ struct bpos b;
+ unsigned gen;
+ struct nocow_lock_bucket *l;
+};
+
+static void bch2_nocow_write(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_ptrs_c ptrs;
+ const struct bch_extent_ptr *ptr;
+ DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
+ struct bucket_to_lock *i;
+ u32 snapshot;
+ struct bucket_to_lock *stale_at;
+ int ret;
+
+ if (op->flags & BCH_WRITE_MOVE)
+ return;
+
+ darray_init(&buckets);
+ trans = bch2_trans_get(c);
+retry:
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot);
+ if (unlikely(ret))
+ goto err;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ SPOS(op->pos.inode, op->pos.offset, snapshot),
+ BTREE_ITER_SLOTS);
+ while (1) {
+ struct bio *bio = &op->wbio.bio;
+
+ buckets.nr = 0;
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ break;
+
+ /* fall back to normal cow write path? */
+ if (unlikely(k.k->p.snapshot != snapshot ||
+ !bch2_extent_is_writeable(op, k)))
+ break;
+
+ if (bch2_keylist_realloc(&op->insert_keys,
+ op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ k.k->u64s))
+ break;
+
+ /* Get iorefs before dropping btree locks: */
+ ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bpos b = PTR_BUCKET_POS(c, ptr);
+ struct nocow_lock_bucket *l =
+ bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b));
+ prefetch(l);
+
+ if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
+ goto err_get_ioref;
+
+ /* XXX allocating memory with btree locks held - rare */
+ darray_push_gfp(&buckets, ((struct bucket_to_lock) {
+ .b = b, .gen = ptr->gen, .l = l,
+ }), GFP_KERNEL|__GFP_NOFAIL);
+
+ if (ptr->unwritten)
+ op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
+ }
+
+ /* Unlock before taking nocow locks, doing IO: */
+ bkey_reassemble(op->insert_keys.top, k);
+ bch2_trans_unlock(trans);
+
+ bch2_cut_front(op->pos, op->insert_keys.top);
+ if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
+ bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
+
+ darray_for_each(buckets, i) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, i->b.inode);
+
+ __bch2_bucket_nocow_lock(&c->nocow_locks, i->l,
+ bucket_to_u64(i->b),
+ BUCKET_NOCOW_LOCK_UPDATE);
+
+ rcu_read_lock();
+ bool stale = gen_after(*bucket_gen(ca, i->b.offset), i->gen);
+ rcu_read_unlock();
+
+ if (unlikely(stale)) {
+ stale_at = i;
+ goto err_bucket_stale;
+ }
+ }
+
+ bio = &op->wbio.bio;
+ if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
+ bio = bio_split(bio, k.k->p.offset - op->pos.offset,
+ GFP_KERNEL, &c->bio_write);
+ wbio_init(bio)->put_bio = true;
+ bio->bi_opf = op->wbio.bio.bi_opf;
+ } else {
+ op->flags |= BCH_WRITE_DONE;
+ }
+
+ op->pos.offset += bio_sectors(bio);
+ op->written += bio_sectors(bio);
+
+ bio->bi_end_io = bch2_write_endio;
+ bio->bi_private = &op->cl;
+ bio->bi_opf |= REQ_OP_WRITE;
+ closure_get(&op->cl);
+ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
+ op->insert_keys.top, true);
+
+ bch2_keylist_push(&op->insert_keys);
+ if (op->flags & BCH_WRITE_DONE)
+ break;
+ bch2_btree_iter_advance(&iter);
+ }
+out:
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ if (ret) {
+ bch_err_inum_offset_ratelimited(c,
+ op->pos.inode, op->pos.offset << 9,
+ "%s: btree lookup error %s", __func__, bch2_err_str(ret));
+ op->error = ret;
+ op->flags |= BCH_WRITE_DONE;
+ }
+
+ bch2_trans_put(trans);
+ darray_exit(&buckets);
+
+ /* fall back to cow write path? */
+ if (!(op->flags & BCH_WRITE_DONE)) {
+ closure_sync(&op->cl);
+ __bch2_nocow_write_done(op);
+ op->insert_keys.top = op->insert_keys.keys;
+ } else if (op->flags & BCH_WRITE_SYNC) {
+ closure_sync(&op->cl);
+ bch2_nocow_write_done(&op->cl.work);
+ } else {
+ /*
+ * XXX
+ * needs to run out of process context because ei_quota_lock is
+ * a mutex
+ */
+ continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
+ }
+ return;
+err_get_ioref:
+ darray_for_each(buckets, i)
+ percpu_ref_put(&bch_dev_bkey_exists(c, i->b.inode)->io_ref);
+
+ /* Fall back to COW path: */
+ goto out;
+err_bucket_stale:
+ darray_for_each(buckets, i) {
+ bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE);
+ if (i == stale_at)
+ break;
+ }
+
+ /* We can retry this: */
+ ret = -BCH_ERR_transaction_restart;
+ goto err_get_ioref;
+}
+
+static void __bch2_write(struct bch_write_op *op)
+{
+ struct bch_fs *c = op->c;
+ struct write_point *wp = NULL;
+ struct bio *bio = NULL;
+ unsigned nofs_flags;
+ int ret;
+
+ nofs_flags = memalloc_nofs_save();
+
+ if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
+ bch2_nocow_write(op);
+ if (op->flags & BCH_WRITE_DONE)
+ goto out_nofs_restore;
+ }
+again:
+ memset(&op->failed, 0, sizeof(op->failed));
+
+ do {
+ struct bkey_i *key_to_write;
+ unsigned key_to_write_offset = op->insert_keys.top_p -
+ op->insert_keys.keys_p;
+
+ /* +1 for possible cache device: */
+ if (op->open_buckets.nr + op->nr_replicas + 1 >
+ ARRAY_SIZE(op->open_buckets.v))
+ break;
+
+ if (bch2_keylist_realloc(&op->insert_keys,
+ op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ BKEY_EXTENT_U64s_MAX))
+ break;
+
+ /*
+ * The copygc thread is now global, so it's no longer freeing up
+ * space on specific disks, and allocations for specific disks may
+ * hang arbitrarily long:
+ */
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_alloc_sectors_start_trans(trans,
+ op->target,
+ op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
+ op->write_point,
+ &op->devs_have,
+ op->nr_replicas,
+ op->nr_replicas_required,
+ op->watermark,
+ op->flags,
+ (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
+ BCH_WRITE_ONLY_SPECIFIED_DEVS))
+ ? NULL : &op->cl, &wp));
+ if (unlikely(ret)) {
+ if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+ break;
+
+ goto err;
+ }
+
+ EBUG_ON(!wp);
+
+ bch2_open_bucket_get(c, wp, &op->open_buckets);
+ ret = bch2_write_extent(op, wp, &bio);
+
+ bch2_alloc_sectors_done_inlined(c, wp);
+err:
+ if (ret <= 0) {
+ op->flags |= BCH_WRITE_DONE;
+
+ if (ret < 0) {
+ op->error = ret;
+ break;
+ }
+ }
+
+ bio->bi_end_io = bch2_write_endio;
+ bio->bi_private = &op->cl;
+ bio->bi_opf |= REQ_OP_WRITE;
+
+ closure_get(bio->bi_private);
+
+ key_to_write = (void *) (op->insert_keys.keys_p +
+ key_to_write_offset);
+
+ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
+ key_to_write, false);
+ } while (ret);
+
+ /*
+ * Sync or no?
+ *
+ * If we're running asynchronously, we may still want to block
+ * synchronously here if we weren't able to submit all of the IO at
+ * once, as that signals backpressure to the caller.
+ */
+ if ((op->flags & BCH_WRITE_SYNC) ||
+ (!(op->flags & BCH_WRITE_DONE) &&
+ !(op->flags & BCH_WRITE_IN_WORKER))) {
+ closure_sync(&op->cl);
+ __bch2_write_index(op);
+
+ if (!(op->flags & BCH_WRITE_DONE))
+ goto again;
+ bch2_write_done(&op->cl);
+ } else {
+ bch2_write_queue(op, wp);
+ continue_at(&op->cl, bch2_write_index, NULL);
+ }
+out_nofs_restore:
+ memalloc_nofs_restore(nofs_flags);
+}
+
+static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
+{
+ struct bio *bio = &op->wbio.bio;
+ struct bvec_iter iter;
+ struct bkey_i_inline_data *id;
+ unsigned sectors;
+ int ret;
+
+ op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
+ op->flags |= BCH_WRITE_DONE;
+
+ bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
+
+ ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
+ ARRAY_SIZE(op->inline_keys),
+ BKEY_U64s + DIV_ROUND_UP(data_len, 8));
+ if (ret) {
+ op->error = ret;
+ goto err;
+ }
+
+ sectors = bio_sectors(bio);
+ op->pos.offset += sectors;
+
+ id = bkey_inline_data_init(op->insert_keys.top);
+ id->k.p = op->pos;
+ id->k.version = op->version;
+ id->k.size = sectors;
+
+ iter = bio->bi_iter;
+ iter.bi_size = data_len;
+ memcpy_from_bio(id->v.data, bio, iter);
+
+ while (data_len & 7)
+ id->v.data[data_len++] = '\0';
+ set_bkey_val_bytes(&id->k, data_len);
+ bch2_keylist_push(&op->insert_keys);
+
+ __bch2_write_index(op);
+err:
+ bch2_write_done(&op->cl);
+}
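+
+/*
+ * The padding loop above rounds the inline value up to a whole number of
+ * u64s, zero-filling the tail: e.g. 13 bytes of data become a 16 byte
+ * value (two u64s) with bytes 13..15 set to 0. Equivalent sketch of the
+ * resulting value size (helper name is illustrative):
+ */
+static inline unsigned inline_data_val_bytes(unsigned data_len)
+{
+	return round_up(data_len, sizeof(u64));
+}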
+
+/**
+ * bch2_write() - handle a write to a cache device or flash only volume
+ * @cl: &bch_write_op->cl
+ *
+ * This is the starting point for any data to end up in a cache device; it could
+ * be from a normal write, or a writeback write, or a write to a flash only
+ * volume - it's also used by the moving garbage collector to compact data in
+ * mostly empty buckets.
+ *
+ * It first writes the data to the cache, creating a list of keys to be inserted
+ * (if the data won't fit in a single open bucket, there will be multiple keys);
+ * after the data is written it calls bch_journal, and after the keys have been
+ * added to the next journal write they're inserted into the btree.
+ *
+ * If op->discard is true, instead of inserting the data it invalidates the
+ * region of the cache represented by op->bio and op->inode.
+ */
+CLOSURE_CALLBACK(bch2_write)
+{
+ closure_type(op, struct bch_write_op, cl);
+ struct bio *bio = &op->wbio.bio;
+ struct bch_fs *c = op->c;
+ unsigned data_len;
+
+ EBUG_ON(op->cl.parent);
+ BUG_ON(!op->nr_replicas);
+ BUG_ON(!op->write_point.v);
+ BUG_ON(bkey_eq(op->pos, POS_MAX));
+
+ op->start_time = local_clock();
+ bch2_keylist_init(&op->insert_keys, op->inline_keys);
+ wbio_init(bio)->put_bio = false;
+
+ if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) {
+ bch_err_inum_offset_ratelimited(c,
+ op->pos.inode,
+ op->pos.offset << 9,
+ "misaligned write");
+ op->error = -EIO;
+ goto err;
+ }
+
+ if (c->opts.nochanges) {
+ op->error = -BCH_ERR_erofs_no_writes;
+ goto err;
+ }
+
+ if (!(op->flags & BCH_WRITE_MOVE) &&
+ !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
+ op->error = -BCH_ERR_erofs_no_writes;
+ goto err;
+ }
+
+ this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
+ bch2_increment_clock(c, bio_sectors(bio), WRITE);
+
+ data_len = min_t(u64, bio->bi_iter.bi_size,
+ op->new_i_size - (op->pos.offset << 9));
+
+ if (c->opts.inline_data &&
+ data_len <= min(block_bytes(c) / 2, 1024U)) {
+ bch2_write_data_inline(op, data_len);
+ return;
+ }
+
+ __bch2_write(op);
+ return;
+err:
+ bch2_disk_reservation_put(c, &op->res);
+
+ closure_debug_destroy(&op->cl);
+ if (op->end_io)
+ op->end_io(op);
+}
+
+static const char * const bch2_write_flags[] = {
+#define x(f) #f,
+ BCH_WRITE_FLAGS()
+#undef x
+ NULL
+};
+
+void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
+{
+ prt_str(out, "pos: ");
+ bch2_bpos_to_text(out, op->pos);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_str(out, "started: ");
+ bch2_pr_time_units(out, local_clock() - op->start_time);
+ prt_newline(out);
+
+ prt_str(out, "flags: ");
+ prt_bitflags(out, bch2_write_flags, op->flags);
+ prt_newline(out);
+
+ prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl));
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+}
+
+void bch2_fs_io_write_exit(struct bch_fs *c)
+{
+ mempool_exit(&c->bio_bounce_pages);
+ bioset_exit(&c->bio_write);
+}
+
+int bch2_fs_io_write_init(struct bch_fs *c)
+{
+ if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
+ BIOSET_NEED_BVECS))
+ return -BCH_ERR_ENOMEM_bio_write_init;
+
+ if (mempool_init_page_pool(&c->bio_bounce_pages,
+ max_t(unsigned,
+ c->opts.btree_node_size,
+ c->opts.encoded_extent_max) /
+ PAGE_SIZE, 0))
+ return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
+
+ return 0;
+}
diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h
new file mode 100644
index 000000000000..6c276a48f95d
--- /dev/null
+++ b/fs/bcachefs/io_write.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_WRITE_H
+#define _BCACHEFS_IO_WRITE_H
+
+#include "checksum.h"
+#include "io_write_types.h"
+
+#define to_wbio(_bio) \
+ container_of((_bio), struct bch_write_bio, bio)
+
+void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
+void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+void bch2_latency_acct(struct bch_dev *, u64, int);
+#else
+static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
+#endif
+
+void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
+ enum bch_data_type, const struct bkey_i *, bool);
+
+#define BCH_WRITE_FLAGS() \
+ x(ALLOC_NOWAIT) \
+ x(CACHED) \
+ x(DATA_ENCODED) \
+ x(PAGES_STABLE) \
+ x(PAGES_OWNED) \
+ x(ONLY_SPECIFIED_DEVS) \
+ x(WROTE_DATA_INLINE) \
+ x(FROM_INTERNAL) \
+ x(CHECK_ENOSPC) \
+ x(SYNC) \
+ x(MOVE) \
+ x(IN_WORKER) \
+ x(DONE) \
+ x(IO_ERROR) \
+ x(CONVERT_UNWRITTEN)
+
+enum __bch_write_flags {
+#define x(f) __BCH_WRITE_##f,
+ BCH_WRITE_FLAGS()
+#undef x
+};
+
+enum bch_write_flags {
+#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f),
+ BCH_WRITE_FLAGS()
+#undef x
+};
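
The BCH_WRITE_FLAGS() x-macro above is expanded twice here, once into bit indices and once into the corresponding masks (and a third time in io_write.c into the flag-name strings used by prt_bitflags()). As a rough illustration of the expansion (editor's sketch, not part of the patch):

/* Illustrative expansion of the first few x() entries only. */
enum __bch_write_flags_expanded {
	__BCH_WRITE_ALLOC_NOWAIT,	/* 0 */
	__BCH_WRITE_CACHED,		/* 1 */
	__BCH_WRITE_DATA_ENCODED,	/* 2 */
	/* ...one enumerator per x() line above... */
};

enum bch_write_flags_expanded {
	BCH_WRITE_ALLOC_NOWAIT	= BIT(__BCH_WRITE_ALLOC_NOWAIT),	/* 1 << 0 */
	BCH_WRITE_CACHED	= BIT(__BCH_WRITE_CACHED),		/* 1 << 1 */
	BCH_WRITE_DATA_ENCODED	= BIT(__BCH_WRITE_DATA_ENCODED),	/* 1 << 2 */
};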
+
+static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
+{
+ return op->watermark == BCH_WATERMARK_copygc
+ ? op->c->copygc_wq
+ : op->c->btree_update_wq;
+}
+
+int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, bool *, s64 *, s64 *);
+int bch2_extent_update(struct btree_trans *, subvol_inum,
+ struct btree_iter *, struct bkey_i *,
+ struct disk_reservation *, u64, s64 *, bool);
+
+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
+ struct bch_io_opts opts)
+{
+ op->c = c;
+ op->end_io = NULL;
+ op->flags = 0;
+ op->written = 0;
+ op->error = 0;
+ op->csum_type = bch2_data_checksum_type(c, opts);
+ op->compression_opt = opts.compression;
+ op->nr_replicas = 0;
+ op->nr_replicas_required = c->opts.data_replicas_required;
+ op->watermark = BCH_WATERMARK_normal;
+ op->incompressible = 0;
+ op->open_buckets.nr = 0;
+ op->devs_have.nr = 0;
+ op->target = 0;
+ op->opts = opts;
+ op->subvol = 0;
+ op->pos = POS_MAX;
+ op->version = ZERO_VERSION;
+ op->write_point = (struct write_point_specifier) { 0 };
+ op->res = (struct disk_reservation) { 0 };
+ op->new_i_size = U64_MAX;
+ op->i_sectors_delta = 0;
+ op->devs_need_flush = NULL;
+}
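
bch2_write_op_init() only fills in defaults; callers still have to supply the fields that the assertions in bch2_write() check (nr_replicas, write_point, pos). A heavily hedged caller sketch, not part of the patch -- writepoint_hashed() and closure_call() are assumptions from elsewhere in the tree, and real callers (the fs-io and data move paths) do considerably more than this:

/* Hypothetical caller sketch -- illustration only. */
static void example_submit_write(struct bch_write_op *op, struct bch_fs *c,
				 subvol_inum inum, u64 sector_offset,
				 struct bch_io_opts io_opts)
{
	/* op->wbio.bio is assumed to already carry the pages to be written */
	bch2_write_op_init(op, c, io_opts);
	op->nr_replicas	= c->opts.data_replicas;
	op->subvol	= inum.subvol;
	op->pos		= POS(inum.inum, sector_offset);
	op->write_point	= writepoint_hashed((unsigned long) current);	/* assumed helper */
	/* a real caller would also set op->end_io for completion notification */

	/* bch2_write() is a CLOSURE_CALLBACK; it must not have a parent closure */
	closure_call(&op->cl, bch2_write, NULL, NULL);
}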
+
+CLOSURE_CALLBACK(bch2_write);
+void bch2_write_point_do_index_updates(struct work_struct *);
+
+static inline struct bch_write_bio *wbio_init(struct bio *bio)
+{
+ struct bch_write_bio *wbio = to_wbio(bio);
+
+ memset(&wbio->wbio, 0, sizeof(wbio->wbio));
+ return wbio;
+}
+
+void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *);
+
+void bch2_fs_io_write_exit(struct bch_fs *);
+int bch2_fs_io_write_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_IO_WRITE_H */
diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h
new file mode 100644
index 000000000000..c7f97c2c4805
--- /dev/null
+++ b/fs/bcachefs/io_write_types.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_WRITE_TYPES_H
+#define _BCACHEFS_IO_WRITE_TYPES_H
+
+#include "alloc_types.h"
+#include "btree_types.h"
+#include "buckets_types.h"
+#include "extents_types.h"
+#include "keylist_types.h"
+#include "opts.h"
+#include "super_types.h"
+
+#include <linux/llist.h>
+#include <linux/workqueue.h>
+
+struct bch_write_bio {
+ struct_group(wbio,
+ struct bch_fs *c;
+ struct bch_write_bio *parent;
+
+ u64 submit_time;
+ u64 inode_offset;
+
+ struct bch_devs_list failed;
+ u8 dev;
+
+ unsigned split:1,
+ bounce:1,
+ put_bio:1,
+ have_ioref:1,
+ nocow:1,
+ used_mempool:1,
+ first_btree_write:1;
+ );
+
+ struct bio bio;
+};
+
+struct bch_write_op {
+ struct closure cl;
+ struct bch_fs *c;
+ void (*end_io)(struct bch_write_op *);
+ u64 start_time;
+
+ unsigned written; /* sectors */
+ u16 flags;
+ s16 error; /* dio write path expects it to hold -ERESTARTSYS... */
+
+ unsigned compression_opt:8;
+ unsigned csum_type:4;
+ unsigned nr_replicas:4;
+ unsigned nr_replicas_required:4;
+ unsigned watermark:3;
+ unsigned incompressible:1;
+ unsigned stripe_waited:1;
+
+ struct bch_devs_list devs_have;
+ u16 target;
+ u16 nonce;
+ struct bch_io_opts opts;
+
+ u32 subvol;
+ struct bpos pos;
+ struct bversion version;
+
+ /* For BCH_WRITE_DATA_ENCODED: */
+ struct bch_extent_crc_unpacked crc;
+
+ struct write_point_specifier write_point;
+
+ struct write_point *wp;
+ struct list_head wp_list;
+
+ struct disk_reservation res;
+
+ struct open_buckets open_buckets;
+
+ u64 new_i_size;
+ s64 i_sectors_delta;
+
+ struct bch_devs_mask failed;
+
+ struct keylist insert_keys;
+ u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2];
+
+ /*
+ * Bitmask of devices that have had nocow writes issued to them since
+ * last flush:
+ */
+ struct bch_devs_mask *devs_need_flush;
+
+ /* Must be last: */
+ struct bch_write_bio wbio;
+};
+
+#endif /* _BCACHEFS_IO_WRITE_TYPES_H */
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
new file mode 100644
index 000000000000..8cf238be6213
--- /dev/null
+++ b/fs/bcachefs/journal.c
@@ -0,0 +1,1439 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * bcachefs journalling code, for btree insertions
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_methods.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "journal_sb.h"
+#include "journal_seq_blacklist.h"
+#include "trace.h"
+
+static const char * const bch2_journal_errors[] = {
+#define x(n) #n,
+ JOURNAL_ERRORS()
+#undef x
+ NULL
+};
+
+static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
+{
+ return seq > j->seq_ondisk;
+}
+
+static bool __journal_entry_is_open(union journal_res_state state)
+{
+ return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
+}
+
+static inline unsigned nr_unwritten_journal_entries(struct journal *j)
+{
+ return atomic64_read(&j->seq) - j->seq_ondisk;
+}
+
+static bool journal_entry_is_open(struct journal *j)
+{
+ return __journal_entry_is_open(j->reservations);
+}
+
+static inline struct journal_buf *
+journal_seq_to_buf(struct journal *j, u64 seq)
+{
+ struct journal_buf *buf = NULL;
+
+ EBUG_ON(seq > journal_cur_seq(j));
+
+ if (journal_seq_unwritten(j, seq)) {
+ buf = j->buf + (seq & JOURNAL_BUF_MASK);
+ EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
+ }
+ return buf;
+}
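
journal_seq_to_buf() maps a sequence number onto the small ring of in-memory journal buffers by masking its low bits; only sequence numbers that are still unwritten may be mapped, since older buffers have already been reused. The reservation state (see journal.h) tracks four buffer refcounts, so JOURNAL_BUF_MASK is presumably 3 -- an assumption used in this illustration:

/*
 * Illustration only (assumes JOURNAL_BUF_NR == 4, JOURNAL_BUF_MASK == 3,
 * matching the four bufN_count fields in union journal_res_state):
 *
 *   seq 100 -> j->buf[100 & 3] == j->buf[0]
 *   seq 101 -> j->buf[1]
 *   seq 102 -> j->buf[2]
 *   seq 103 -> j->buf[3]
 *   seq 104 -> j->buf[0] again -- which is why journal_seq_unwritten()
 *              is checked before dereferencing the buffer.
 */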
+
+static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(p->list); i++)
+ INIT_LIST_HEAD(&p->list[i]);
+ INIT_LIST_HEAD(&p->flushed);
+ atomic_set(&p->count, count);
+ p->devs.nr = 0;
+}
+
+/*
+ * Detect stuck journal conditions and trigger shutdown. Technically the journal
+ * can end up stuck for a variety of reasons, such as a blocked I/O, journal
+ * reservation lockup, etc. Since this is a fatal error with potentially
+ * unpredictable characteristics, we want to be fairly conservative before we
+ * decide to shut things down.
+ *
+ * Consider the journal stuck when it appears full with no ability to commit
+ * btree transactions, discard journal buckets, or acquire a priority
+ * (reserved watermark) reservation.
+ */
+static inline bool
+journal_error_check_stuck(struct journal *j, int error, unsigned flags)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ bool stuck = false;
+ struct printbuf buf = PRINTBUF;
+
+ if (!(error == JOURNAL_ERR_journal_full ||
+ error == JOURNAL_ERR_journal_pin_full) ||
+ nr_unwritten_journal_entries(j) ||
+ (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim)
+ return stuck;
+
+ spin_lock(&j->lock);
+
+ if (j->can_discard) {
+ spin_unlock(&j->lock);
+ return stuck;
+ }
+
+ stuck = true;
+
+ /*
+ * The journal shutdown path will set ->err_seq, but do it here first to
+ * serialize against concurrent failures and avoid duplicate error
+ * reports.
+ */
+ if (j->err_seq) {
+ spin_unlock(&j->lock);
+ return stuck;
+ }
+ j->err_seq = journal_cur_seq(j);
+ spin_unlock(&j->lock);
+
+ bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)",
+ bch2_journal_errors[error]);
+ bch2_journal_debug_to_text(&buf, j);
+ bch_err(c, "%s", buf.buf);
+
+ printbuf_reset(&buf);
+ bch2_journal_pins_to_text(&buf, j);
+ bch_err(c, "Journal pins:\n%s", buf.buf);
+ printbuf_exit(&buf);
+
+ bch2_fatal_error(c);
+ dump_stack();
+
+ return stuck;
+}
+
+/*
+ * Final processing when the last reference of a journal buffer has been
+ * dropped. Drop the pin list reference acquired at journal entry open and write
+ * the buffer, if requested.
+ */
+void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
+ lockdep_assert_held(&j->lock);
+
+ if (__bch2_journal_pin_put(j, seq))
+ bch2_journal_reclaim_fast(j);
+ if (write)
+ closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+}
+
+/*
+ * Close the currently open journal entry, if there is one:
+ *
+ * We don't close a journal_buf until the next journal_buf is finished writing,
+ * and can be opened again - this also initializes the next journal_buf:
+ */
+static void __journal_entry_close(struct journal *j, unsigned closed_val)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_buf *buf = journal_cur_buf(j);
+ union journal_res_state old, new;
+ u64 v = atomic64_read(&j->reservations.counter);
+ unsigned sectors;
+
+ BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL &&
+ closed_val != JOURNAL_ENTRY_ERROR_VAL);
+
+ lockdep_assert_held(&j->lock);
+
+ do {
+ old.v = new.v = v;
+ new.cur_entry_offset = closed_val;
+
+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ||
+ old.cur_entry_offset == new.cur_entry_offset)
+ return;
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
+
+ if (!__journal_entry_is_open(old))
+ return;
+
+ /* Close out old buffer: */
+ buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
+
+ sectors = vstruct_blocks_plus(buf->data, c->block_bits,
+ buf->u64s_reserved) << c->block_bits;
+ BUG_ON(sectors > buf->sectors);
+ buf->sectors = sectors;
+
+ /*
+ * We have to set last_seq here, _before_ opening a new journal entry:
+ *
+	 * A thread may replace an old pin with a new pin on its current
+ * journal reservation - the expectation being that the journal will
+ * contain either what the old pin protected or what the new pin
+ * protects.
+ *
+ * After the old pin is dropped journal_last_seq() won't include the old
+ * pin, so we can only write the updated last_seq on the entry that
+ * contains whatever the new pin protects.
+ *
+ * Restated, we can _not_ update last_seq for a given entry if there
+ * could be a newer entry open with reservations/pins that have been
+ * taken against it.
+ *
+	 * Hence, we want to update/set last_seq on the current journal entry right
+ * before we open a new one:
+ */
+ buf->last_seq = journal_last_seq(j);
+ buf->data->last_seq = cpu_to_le64(buf->last_seq);
+ BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq));
+
+ cancel_delayed_work(&j->write_work);
+
+ bch2_journal_space_available(j);
+
+ __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq));
+}
+
+void bch2_journal_halt(struct journal *j)
+{
+ spin_lock(&j->lock);
+ __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL);
+ if (!j->err_seq)
+ j->err_seq = journal_cur_seq(j);
+ journal_wake(j);
+ spin_unlock(&j->lock);
+}
+
+static bool journal_entry_want_write(struct journal *j)
+{
+ bool ret = !journal_entry_is_open(j) ||
+ journal_cur_seq(j) == journal_last_unwritten_seq(j);
+
+ /* Don't close it yet if we already have a write in flight: */
+ if (ret)
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ else if (nr_unwritten_journal_entries(j)) {
+ struct journal_buf *buf = journal_cur_buf(j);
+
+ if (!buf->flush_time) {
+ buf->flush_time = local_clock() ?: 1;
+ buf->expires = jiffies;
+ }
+ }
+
+ return ret;
+}
+
+bool bch2_journal_entry_close(struct journal *j)
+{
+ bool ret;
+
+ spin_lock(&j->lock);
+ ret = journal_entry_want_write(j);
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+/*
+ * should _only_ be called from journal_res_get() - when we actually want a
+ * journal reservation - an open journal entry means the journal is dirty:
+ */
+static int journal_entry_open(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_buf *buf = j->buf +
+ ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK);
+ union journal_res_state old, new;
+ int u64s;
+ u64 v;
+
+ lockdep_assert_held(&j->lock);
+ BUG_ON(journal_entry_is_open(j));
+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+
+ if (j->blocked)
+ return JOURNAL_ERR_blocked;
+
+ if (j->cur_entry_error)
+ return j->cur_entry_error;
+
+ if (bch2_journal_error(j))
+ return JOURNAL_ERR_insufficient_devices; /* -EROFS */
+
+ if (!fifo_free(&j->pin))
+ return JOURNAL_ERR_journal_pin_full;
+
+ if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
+ return JOURNAL_ERR_max_in_flight;
+
+ BUG_ON(!j->cur_entry_sectors);
+
+ buf->expires =
+ (journal_cur_seq(j) == j->flushed_seq_ondisk
+ ? jiffies
+ : j->last_flush_write) +
+ msecs_to_jiffies(c->opts.journal_flush_delay);
+
+ buf->u64s_reserved = j->entry_u64s_reserved;
+ buf->disk_sectors = j->cur_entry_sectors;
+ buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9);
+
+ u64s = (int) (buf->sectors << 9) / sizeof(u64) -
+ journal_entry_overhead(j);
+ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
+
+ if (u64s <= (ssize_t) j->early_journal_entries.nr)
+ return JOURNAL_ERR_journal_full;
+
+ if (fifo_empty(&j->pin) && j->reclaim_thread)
+ wake_up_process(j->reclaim_thread);
+
+ /*
+ * The fifo_push() needs to happen at the same time as j->seq is
+ * incremented for journal_last_seq() to be calculated correctly
+ */
+ atomic64_inc(&j->seq);
+ journal_pin_list_init(fifo_push_ref(&j->pin), 1);
+
+ BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
+
+ BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
+
+ bkey_extent_init(&buf->key);
+ buf->noflush = false;
+ buf->must_flush = false;
+ buf->separate_flush = false;
+ buf->flush_time = 0;
+
+ memset(buf->data, 0, sizeof(*buf->data));
+ buf->data->seq = cpu_to_le64(journal_cur_seq(j));
+ buf->data->u64s = 0;
+
+ if (j->early_journal_entries.nr) {
+ memcpy(buf->data->_data, j->early_journal_entries.data,
+ j->early_journal_entries.nr * sizeof(u64));
+ le32_add_cpu(&buf->data->u64s, j->early_journal_entries.nr);
+ }
+
+ /*
+ * Must be set before marking the journal entry as open:
+ */
+ j->cur_entry_u64s = u64s;
+
+ v = atomic64_read(&j->reservations.counter);
+ do {
+ old.v = new.v = v;
+
+ BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL);
+
+ new.idx++;
+ BUG_ON(journal_state_count(new, new.idx));
+ BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK));
+
+ journal_state_inc(&new);
+
+ /* Handle any already added entries */
+ new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
+
+ if (j->res_get_blocked_start)
+ bch2_time_stats_update(j->blocked_time,
+ j->res_get_blocked_start);
+ j->res_get_blocked_start = 0;
+
+ mod_delayed_work(c->io_complete_wq,
+ &j->write_work,
+ msecs_to_jiffies(c->opts.journal_flush_delay));
+ journal_wake(j);
+
+ if (j->early_journal_entries.nr)
+ darray_exit(&j->early_journal_entries);
+ return 0;
+}
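
journal_entry_open() sizes the new entry from the space the allocator reported: buf->sectors worth of bytes converted to u64s, minus the jset header and any standing per-entry reservations. A rough worked example (editor's illustration; the overhead value is an assumption):

/*
 * Illustration only: suppose buf->sectors = 8 (a 4 KiB entry) and
 * journal_entry_overhead(j) works out to 7 u64s (sizeof(struct jset)/8
 * plus j->entry_u64s_reserved -- assumed values):
 *
 *   (8 << 9) / sizeof(u64) = 4096 / 8 = 512 u64s in the buffer
 *   512 - 7                = 505 u64s available for reservations,
 *   clamped to at most JOURNAL_ENTRY_CLOSED_VAL - 1.
 */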
+
+static bool journal_quiesced(struct journal *j)
+{
+ bool ret = atomic64_read(&j->seq) == j->seq_ondisk;
+
+ if (!ret)
+ bch2_journal_entry_close(j);
+ return ret;
+}
+
+static void journal_quiesce(struct journal *j)
+{
+ wait_event(j->wait, journal_quiesced(j));
+}
+
+static void journal_write_work(struct work_struct *work)
+{
+ struct journal *j = container_of(work, struct journal, write_work.work);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ long delta;
+
+ spin_lock(&j->lock);
+ if (!__journal_entry_is_open(j->reservations))
+ goto unlock;
+
+ delta = journal_cur_buf(j)->expires - jiffies;
+
+ if (delta > 0)
+ mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
+ else
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+unlock:
+ spin_unlock(&j->lock);
+}
+
+static int __journal_res_get(struct journal *j, struct journal_res *res,
+ unsigned flags)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_buf *buf;
+ bool can_discard;
+ int ret;
+retry:
+ if (journal_res_get_fast(j, res, flags))
+ return 0;
+
+ if (bch2_journal_error(j))
+ return -BCH_ERR_erofs_journal_err;
+
+ spin_lock(&j->lock);
+
+ /* check once more in case somebody else shut things down... */
+ if (bch2_journal_error(j)) {
+ spin_unlock(&j->lock);
+ return -BCH_ERR_erofs_journal_err;
+ }
+
+ /*
+ * Recheck after taking the lock, so we don't race with another thread
+	 * that just did journal_entry_open() and end up calling
+	 * bch2_journal_entry_close() unnecessarily
+ */
+ if (journal_res_get_fast(j, res, flags)) {
+ spin_unlock(&j->lock);
+ return 0;
+ }
+
+ if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
+ /*
+ * Don't want to close current journal entry, just need to
+ * invoke reclaim:
+ */
+ ret = JOURNAL_ERR_journal_full;
+ goto unlock;
+ }
+
+ /*
+ * If we couldn't get a reservation because the current buf filled up,
+ * and we had room for a bigger entry on disk, signal that we want to
+ * realloc the journal bufs:
+ */
+ buf = journal_cur_buf(j);
+ if (journal_entry_is_open(j) &&
+ buf->buf_size >> 9 < buf->disk_sectors &&
+ buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
+ j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
+
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ ret = journal_entry_open(j);
+
+ if (ret == JOURNAL_ERR_max_in_flight)
+ trace_and_count(c, journal_entry_full, c);
+unlock:
+ if ((ret && ret != JOURNAL_ERR_insufficient_devices) &&
+ !j->res_get_blocked_start) {
+ j->res_get_blocked_start = local_clock() ?: 1;
+ trace_and_count(c, journal_full, c);
+ }
+
+ can_discard = j->can_discard;
+ spin_unlock(&j->lock);
+
+ if (!ret)
+ goto retry;
+ if (journal_error_check_stuck(j, ret, flags))
+ ret = -BCH_ERR_journal_res_get_blocked;
+
+ /*
+ * Journal is full - can't rely on reclaim from work item due to
+ * freezing:
+ */
+ if ((ret == JOURNAL_ERR_journal_full ||
+ ret == JOURNAL_ERR_journal_pin_full) &&
+ !(flags & JOURNAL_RES_GET_NONBLOCK)) {
+ if (can_discard) {
+ bch2_journal_do_discards(j);
+ goto retry;
+ }
+
+ if (mutex_trylock(&j->reclaim_lock)) {
+ bch2_journal_reclaim(j);
+ mutex_unlock(&j->reclaim_lock);
+ }
+ }
+
+ return ret == JOURNAL_ERR_insufficient_devices
+ ? -BCH_ERR_erofs_journal_err
+ : -BCH_ERR_journal_res_get_blocked;
+}
+
+/*
+ * Essentially the entry function to the journaling code. When bcachefs is doing
+ * a btree insert, it calls this function to get the current journal write.
+ * A journal write is the structure used to set up journal writes. The calling
+ * function will then add its keys to the structure, queuing them for the next
+ * write.
+ *
+ * To ensure forward progress, the current task must not be holding any
+ * btree node write locks.
+ */
+int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
+ unsigned flags)
+{
+ int ret;
+
+ closure_wait_event(&j->async_wait,
+ (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+ (flags & JOURNAL_RES_GET_NONBLOCK));
+ return ret;
+}
+
+/* journal_entry_res: */
+
+void bch2_journal_entry_res_resize(struct journal *j,
+ struct journal_entry_res *res,
+ unsigned new_u64s)
+{
+ union journal_res_state state;
+ int d = new_u64s - res->u64s;
+
+ spin_lock(&j->lock);
+
+ j->entry_u64s_reserved += d;
+ if (d <= 0)
+ goto out;
+
+ j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d);
+ smp_mb();
+ state = READ_ONCE(j->reservations);
+
+ if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL &&
+ state.cur_entry_offset > j->cur_entry_u64s) {
+ j->cur_entry_u64s += d;
+ /*
+ * Not enough room in current journal entry, have to flush it:
+ */
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ } else {
+ journal_cur_buf(j)->u64s_reserved += d;
+ }
+out:
+ spin_unlock(&j->lock);
+ res->u64s += d;
+}
+
+/* journal flushing: */
+
+/**
+ * bch2_journal_flush_seq_async - wait for a journal entry to be written
+ * @j: journal object
+ * @seq: seq to flush
+ * @parent: closure object to wait with
+ * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed,
+ * -EIO if @seq will never be flushed
+ *
+ * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if
+ * necessary
+ */
+int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
+ struct closure *parent)
+{
+ struct journal_buf *buf;
+ int ret = 0;
+
+ if (seq <= j->flushed_seq_ondisk)
+ return 1;
+
+ spin_lock(&j->lock);
+
+ if (WARN_ONCE(seq > journal_cur_seq(j),
+ "requested to flush journal seq %llu, but currently at %llu",
+ seq, journal_cur_seq(j)))
+ goto out;
+
+ /* Recheck under lock: */
+ if (j->err_seq && seq >= j->err_seq) {
+ ret = -EIO;
+ goto out;
+ }
+
+ if (seq <= j->flushed_seq_ondisk) {
+ ret = 1;
+ goto out;
+ }
+
+ /* if seq was written, but not flushed - flush a newer one instead */
+ seq = max(seq, journal_last_unwritten_seq(j));
+
+recheck_need_open:
+ if (seq > journal_cur_seq(j)) {
+ struct journal_res res = { 0 };
+
+ if (journal_entry_is_open(j))
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+
+ spin_unlock(&j->lock);
+
+ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+ if (ret)
+ return ret;
+
+ seq = res.seq;
+ buf = j->buf + (seq & JOURNAL_BUF_MASK);
+ buf->must_flush = true;
+
+ if (!buf->flush_time) {
+ buf->flush_time = local_clock() ?: 1;
+ buf->expires = jiffies;
+ }
+
+ if (parent && !closure_wait(&buf->wait, parent))
+ BUG();
+
+ bch2_journal_res_put(j, &res);
+
+ spin_lock(&j->lock);
+ goto want_write;
+ }
+
+ /*
+ * if write was kicked off without a flush, flush the next sequence
+ * number instead
+ */
+ buf = journal_seq_to_buf(j, seq);
+ if (buf->noflush) {
+ seq++;
+ goto recheck_need_open;
+ }
+
+ buf->must_flush = true;
+
+ if (parent && !closure_wait(&buf->wait, parent))
+ BUG();
+want_write:
+ if (seq == journal_cur_seq(j))
+ journal_entry_want_write(j);
+out:
+ spin_unlock(&j->lock);
+ return ret;
+}
+
+int bch2_journal_flush_seq(struct journal *j, u64 seq)
+{
+ u64 start_time = local_clock();
+ int ret, ret2;
+
+ /*
+ * Don't update time_stats when @seq is already flushed:
+ */
+ if (seq <= j->flushed_seq_ondisk)
+ return 0;
+
+ ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
+
+ if (!ret)
+ bch2_time_stats_update(j->flush_seq_time, start_time);
+
+ return ret ?: ret2 < 0 ? ret2 : 0;
+}
+
+/*
+ * bch2_journal_flush_async - if there is an open journal entry, or a journal
+ * entry still being written, write it and wait for the write to complete
+ */
+void bch2_journal_flush_async(struct journal *j, struct closure *parent)
+{
+ bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent);
+}
+
+int bch2_journal_flush(struct journal *j)
+{
+ return bch2_journal_flush_seq(j, atomic64_read(&j->seq));
+}
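
These flush entry points compose with the closure library: a caller that needs a particular sequence number to be durable can pass a parent closure to bch2_journal_flush_seq_async() and wait on it. A hedged sketch (editor's illustration, not part of the patch; closure_init_stack() and closure_sync() are from the closure library, not this file):

/* Hypothetical caller -- illustration only. */
static int example_wait_for_seq(struct journal *j, u64 seq)
{
	struct closure cl;
	int ret;

	closure_init_stack(&cl);

	ret = bch2_journal_flush_seq_async(j, seq, &cl);
	closure_sync(&cl);	/* returns immediately if nothing took a ref on cl */

	/* 1 means already flushed, 0 means a flush was issued and has completed */
	return ret < 0 ? ret : 0;
}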
+
+/*
+ * bch2_journal_noflush_seq - tell the journal not to issue any flushes before
+ * @seq
+ */
+bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ u64 unwritten_seq;
+ bool ret = false;
+
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush)))
+ return false;
+
+ if (seq <= c->journal.flushed_seq_ondisk)
+ return false;
+
+ spin_lock(&j->lock);
+ if (seq <= c->journal.flushed_seq_ondisk)
+ goto out;
+
+ for (unwritten_seq = journal_last_unwritten_seq(j);
+ unwritten_seq < seq;
+ unwritten_seq++) {
+ struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
+
+ /* journal write is already in flight, and was a flush write: */
+ if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush)
+ goto out;
+
+ buf->noflush = true;
+ }
+
+ ret = true;
+out:
+ spin_unlock(&j->lock);
+ return ret;
+}
+
+int bch2_journal_meta(struct journal *j)
+{
+ struct journal_buf *buf;
+ struct journal_res res;
+ int ret;
+
+ memset(&res, 0, sizeof(res));
+
+ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+ if (ret)
+ return ret;
+
+ buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
+ buf->must_flush = true;
+
+ if (!buf->flush_time) {
+ buf->flush_time = local_clock() ?: 1;
+ buf->expires = jiffies;
+ }
+
+ bch2_journal_res_put(j, &res);
+
+ return bch2_journal_flush_seq(j, res.seq);
+}
+
+/* block/unlock the journal: */
+
+void bch2_journal_unblock(struct journal *j)
+{
+ spin_lock(&j->lock);
+ j->blocked--;
+ spin_unlock(&j->lock);
+
+ journal_wake(j);
+}
+
+void bch2_journal_block(struct journal *j)
+{
+ spin_lock(&j->lock);
+ j->blocked++;
+ spin_unlock(&j->lock);
+
+ journal_quiesce(j);
+}
+
+/* allocate journal on a device: */
+
+static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
+ bool new_fs, struct closure *cl)
+{
+ struct bch_fs *c = ca->fs;
+ struct journal_device *ja = &ca->journal;
+ u64 *new_bucket_seq = NULL, *new_buckets = NULL;
+ struct open_bucket **ob = NULL;
+ long *bu = NULL;
+ unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr;
+ int ret = 0;
+
+ BUG_ON(nr <= ja->nr);
+
+ bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL);
+ ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL);
+ new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL);
+ new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL);
+ if (!bu || !ob || !new_buckets || !new_bucket_seq) {
+ ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
+ goto err_free;
+ }
+
+ for (nr_got = 0; nr_got < nr_want; nr_got++) {
+ if (new_fs) {
+ bu[nr_got] = bch2_bucket_alloc_new_fs(ca);
+ if (bu[nr_got] < 0) {
+ ret = -BCH_ERR_ENOSPC_bucket_alloc;
+ break;
+ }
+ } else {
+ ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, cl);
+ ret = PTR_ERR_OR_ZERO(ob[nr_got]);
+ if (ret)
+ break;
+
+ ret = bch2_trans_run(c,
+ bch2_trans_mark_metadata_bucket(trans, ca,
+ ob[nr_got]->bucket, BCH_DATA_journal,
+ ca->mi.bucket_size));
+ if (ret) {
+ bch2_open_bucket_put(c, ob[nr_got]);
+ bch_err_msg(c, ret, "marking new journal buckets");
+ break;
+ }
+
+ bu[nr_got] = ob[nr_got]->bucket;
+ }
+ }
+
+ if (!nr_got)
+ goto err_free;
+
+ /* Don't return an error if we successfully allocated some buckets: */
+ ret = 0;
+
+ if (c) {
+ bch2_journal_flush_all_pins(&c->journal);
+ bch2_journal_block(&c->journal);
+ mutex_lock(&c->sb_lock);
+ }
+
+ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
+ memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
+
+ BUG_ON(ja->discard_idx > ja->nr);
+
+ pos = ja->discard_idx ?: ja->nr;
+
+ memmove(new_buckets + pos + nr_got,
+ new_buckets + pos,
+ sizeof(new_buckets[0]) * (ja->nr - pos));
+ memmove(new_bucket_seq + pos + nr_got,
+ new_bucket_seq + pos,
+ sizeof(new_bucket_seq[0]) * (ja->nr - pos));
+
+ for (i = 0; i < nr_got; i++) {
+ new_buckets[pos + i] = bu[i];
+ new_bucket_seq[pos + i] = 0;
+ }
+
+ nr = ja->nr + nr_got;
+
+ ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr);
+ if (ret)
+ goto err_unblock;
+
+ if (!new_fs)
+ bch2_write_super(c);
+
+ /* Commit: */
+ if (c)
+ spin_lock(&c->journal.lock);
+
+ swap(new_buckets, ja->buckets);
+ swap(new_bucket_seq, ja->bucket_seq);
+ ja->nr = nr;
+
+ if (pos <= ja->discard_idx)
+ ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr;
+ if (pos <= ja->dirty_idx_ondisk)
+ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr;
+ if (pos <= ja->dirty_idx)
+ ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr;
+ if (pos <= ja->cur_idx)
+ ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr;
+
+ if (c)
+ spin_unlock(&c->journal.lock);
+err_unblock:
+ if (c) {
+ bch2_journal_unblock(&c->journal);
+ mutex_unlock(&c->sb_lock);
+ }
+
+ if (ret && !new_fs)
+ for (i = 0; i < nr_got; i++)
+ bch2_trans_run(c,
+ bch2_trans_mark_metadata_bucket(trans, ca,
+ bu[i], BCH_DATA_free, 0));
+err_free:
+ if (!new_fs)
+ for (i = 0; i < nr_got; i++)
+ bch2_open_bucket_put(c, ob[i]);
+
+ kfree(new_bucket_seq);
+ kfree(new_buckets);
+ kfree(ob);
+ kfree(bu);
+ return ret;
+}
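
New buckets are spliced into the copy of the ring at ja->discard_idx, and every ring index at or past the insertion point is advanced by the number of buckets added. A small worked example (editor's illustration):

/*
 * Illustration only: ja->nr = 4, ja->buckets = { A, B, C, D },
 * ja->discard_idx = 2, nr_got = 2 new buckets { X, Y }:
 *
 *   pos = 2, the tail { C, D } is shifted up by memmove(), giving
 *   ja->buckets = { A, B, X, Y, C, D } and ja->nr = 6;
 *   discard_idx then becomes (2 + 2) % 6 = 4, still pointing at C,
 *   and dirty_idx/cur_idx are adjusted the same way when >= pos.
 */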
+
+/*
+ * Allocate more journal space at runtime - not currently making use of it, but
+ * the code works:
+ */
+int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
+ unsigned nr)
+{
+ struct journal_device *ja = &ca->journal;
+ struct closure cl;
+ int ret = 0;
+
+ closure_init_stack(&cl);
+
+ down_write(&c->state_lock);
+
+ /* don't handle reducing nr of buckets yet: */
+ if (nr < ja->nr)
+ goto unlock;
+
+ while (ja->nr < nr) {
+ struct disk_reservation disk_res = { 0, 0, 0 };
+
+ /*
+ * note: journal buckets aren't really counted as _sectors_ used yet, so
+ * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
+ * when space used goes up without a reservation - but we do need the
+ * reservation to ensure we'll actually be able to allocate:
+ *
+ * XXX: that's not right, disk reservations only ensure a
+ * filesystem-wide allocation will succeed, this is a device
+ * specific allocation - we can hang here:
+ */
+
+ ret = bch2_disk_reservation_get(c, &disk_res,
+ bucket_to_sector(ca, nr - ja->nr), 1, 0);
+ if (ret)
+ break;
+
+ ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
+
+ bch2_disk_reservation_put(c, &disk_res);
+
+ closure_sync(&cl);
+
+ if (ret && ret != -BCH_ERR_bucket_alloc_blocked)
+ break;
+ }
+
+ if (ret)
+ bch_err_fn(c, ret);
+unlock:
+ up_write(&c->state_lock);
+ return ret;
+}
+
+int bch2_dev_journal_alloc(struct bch_dev *ca)
+{
+ unsigned nr;
+ int ret;
+
+ if (dynamic_fault("bcachefs:add:journal_alloc")) {
+ ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
+ goto err;
+ }
+
+ /* 1/128th of the device by default: */
+ nr = ca->mi.nbuckets >> 7;
+
+ /*
+ * clamp journal size to 8192 buckets or 8GB (in sectors), whichever
+ * is smaller:
+ */
+ nr = clamp_t(unsigned, nr,
+ BCH_JOURNAL_BUCKETS_MIN,
+ min(1 << 13,
+ (1 << 24) / ca->mi.bucket_size));
+
+ ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
+err:
+ if (ret)
+ bch_err_fn(ca, ret);
+ return ret;
+}
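
The sizing above works out to 1/128th of the device, clamped between BCH_JOURNAL_BUCKETS_MIN and the smaller of 8192 buckets and 8 GiB ((1 << 24) 512-byte sectors). A worked example with made-up device geometry (editor's illustration):

/*
 * Illustration only: a device with 1,048,576 buckets of 512 KiB
 * (ca->mi.bucket_size = 1024 sectors):
 *
 *   nr = 1048576 >> 7                          = 8192 buckets
 *   (1 << 24) / 1024                           = 16384 buckets (8 GiB cap)
 *   min(1 << 13, 16384)                        = 8192
 *   clamp(8192, BCH_JOURNAL_BUCKETS_MIN, 8192) = 8192 buckets, a 4 GiB journal
 */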
+
+int bch2_fs_journal_alloc(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_online_member(ca, c, i) {
+ if (ca->journal.nr)
+ continue;
+
+ int ret = bch2_dev_journal_alloc(ca);
+ if (ret) {
+ percpu_ref_put(&ca->io_ref);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/* startup/shutdown: */
+
+static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
+{
+ bool ret = false;
+ u64 seq;
+
+ spin_lock(&j->lock);
+ for (seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j) && !ret;
+ seq++) {
+ struct journal_buf *buf = journal_seq_to_buf(j, seq);
+
+ if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx))
+ ret = true;
+ }
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
+{
+ wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx));
+}
+
+void bch2_fs_journal_stop(struct journal *j)
+{
+ bch2_journal_reclaim_stop(j);
+ bch2_journal_flush_all_pins(j);
+
+ wait_event(j->wait, bch2_journal_entry_close(j));
+
+ /*
+ * Always write a new journal entry, to make sure the clock hands are up
+ * to date (and match the superblock)
+ */
+ bch2_journal_meta(j);
+
+ journal_quiesce(j);
+
+ BUG_ON(!bch2_journal_error(j) &&
+ test_bit(JOURNAL_REPLAY_DONE, &j->flags) &&
+ j->last_empty_seq != journal_cur_seq(j));
+
+ cancel_delayed_work_sync(&j->write_work);
+}
+
+int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_entry_pin_list *p;
+ struct journal_replay *i, **_i;
+ struct genradix_iter iter;
+ bool had_entries = false;
+ unsigned ptr;
+ u64 last_seq = cur_seq, nr, seq;
+
+ genradix_for_each_reverse(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ last_seq = le64_to_cpu(i->j.last_seq);
+ break;
+ }
+
+ nr = cur_seq - last_seq;
+
+ if (nr + 1 > j->pin.size) {
+ free_fifo(&j->pin);
+ init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL);
+ if (!j->pin.data) {
+ bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
+ return -BCH_ERR_ENOMEM_journal_pin_fifo;
+ }
+ }
+
+ j->replay_journal_seq = last_seq;
+ j->replay_journal_seq_end = cur_seq;
+ j->last_seq_ondisk = last_seq;
+ j->flushed_seq_ondisk = cur_seq - 1;
+ j->seq_ondisk = cur_seq - 1;
+ j->pin.front = last_seq;
+ j->pin.back = cur_seq;
+ atomic64_set(&j->seq, cur_seq - 1);
+
+ fifo_for_each_entry_ptr(p, &j->pin, seq)
+ journal_pin_list_init(p, 1);
+
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ seq = le64_to_cpu(i->j.seq);
+ BUG_ON(seq >= cur_seq);
+
+ if (seq < last_seq)
+ continue;
+
+ if (journal_entry_empty(&i->j))
+ j->last_empty_seq = le64_to_cpu(i->j.seq);
+
+ p = journal_seq_pin(j, seq);
+
+ p->devs.nr = 0;
+ for (ptr = 0; ptr < i->nr_ptrs; ptr++)
+ bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
+
+ had_entries = true;
+ }
+
+ if (!had_entries)
+ j->last_empty_seq = cur_seq;
+
+ spin_lock(&j->lock);
+
+ set_bit(JOURNAL_STARTED, &j->flags);
+ j->last_flush_write = jiffies;
+
+ j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
+ j->reservations.unwritten_idx++;
+
+ c->last_bucket_seq_cleanup = journal_cur_seq(j);
+
+ bch2_journal_space_available(j);
+ spin_unlock(&j->lock);
+
+ return bch2_journal_reclaim_start(j);
+}
+
+/* init/exit: */
+
+void bch2_dev_journal_exit(struct bch_dev *ca)
+{
+ kfree(ca->journal.bio);
+ kfree(ca->journal.buckets);
+ kfree(ca->journal.bucket_seq);
+
+ ca->journal.bio = NULL;
+ ca->journal.buckets = NULL;
+ ca->journal.bucket_seq = NULL;
+}
+
+int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
+{
+ struct journal_device *ja = &ca->journal;
+ struct bch_sb_field_journal *journal_buckets =
+ bch2_sb_field_get(sb, journal);
+ struct bch_sb_field_journal_v2 *journal_buckets_v2 =
+ bch2_sb_field_get(sb, journal_v2);
+ unsigned i, nr_bvecs;
+
+ ja->nr = 0;
+
+ if (journal_buckets_v2) {
+ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
+
+ for (i = 0; i < nr; i++)
+ ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
+ } else if (journal_buckets) {
+ ja->nr = bch2_nr_journal_buckets(journal_buckets);
+ }
+
+ ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+ if (!ja->bucket_seq)
+ return -BCH_ERR_ENOMEM_dev_journal_init;
+
+ nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
+
+ ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
+ if (!ca->journal.bio)
+ return -BCH_ERR_ENOMEM_dev_journal_init;
+
+ bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0);
+
+ ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+ if (!ja->buckets)
+ return -BCH_ERR_ENOMEM_dev_journal_init;
+
+ if (journal_buckets_v2) {
+ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
+ unsigned j, dst = 0;
+
+ for (i = 0; i < nr; i++)
+ for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
+ ja->buckets[dst++] =
+ le64_to_cpu(journal_buckets_v2->d[i].start) + j;
+ } else if (journal_buckets) {
+ for (i = 0; i < ja->nr; i++)
+ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+ }
+
+ return 0;
+}
+
+void bch2_fs_journal_exit(struct journal *j)
+{
+ unsigned i;
+
+ darray_exit(&j->early_journal_entries);
+
+ for (i = 0; i < ARRAY_SIZE(j->buf); i++)
+ kvpfree(j->buf[i].data, j->buf[i].buf_size);
+ free_fifo(&j->pin);
+}
+
+int bch2_fs_journal_init(struct journal *j)
+{
+ static struct lock_class_key res_key;
+ unsigned i;
+
+ spin_lock_init(&j->lock);
+ spin_lock_init(&j->err_lock);
+ init_waitqueue_head(&j->wait);
+ INIT_DELAYED_WORK(&j->write_work, journal_write_work);
+ init_waitqueue_head(&j->reclaim_wait);
+ init_waitqueue_head(&j->pin_flush_wait);
+ mutex_init(&j->reclaim_lock);
+ mutex_init(&j->discard_lock);
+
+ lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
+
+ atomic64_set(&j->reservations.counter,
+ ((union journal_res_state)
+ { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
+
+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
+ return -BCH_ERR_ENOMEM_journal_pin_fifo;
+
+ for (i = 0; i < ARRAY_SIZE(j->buf); i++) {
+ j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
+ j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
+ if (!j->buf[i].data)
+ return -BCH_ERR_ENOMEM_journal_buf;
+ }
+
+ j->pin.front = j->pin.back = 1;
+ return 0;
+}
+
+/* debug: */
+
+void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ union journal_res_state s;
+ struct bch_dev *ca;
+ unsigned long now = jiffies;
+ u64 seq;
+ unsigned i;
+
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 24);
+ out->atomic++;
+
+ rcu_read_lock();
+ s = READ_ONCE(j->reservations);
+
+ prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size);
+ prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j));
+ prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk);
+ prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
+ prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
+ prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
+ prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]);
+ prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
+ prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
+ prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
+ prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
+ prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
+ prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);
+ prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
+ ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
+ prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
+ prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
+ prt_printf(out, "current entry:\t\t");
+
+ switch (s.cur_entry_offset) {
+ case JOURNAL_ENTRY_ERROR_VAL:
+ prt_printf(out, "error");
+ break;
+ case JOURNAL_ENTRY_CLOSED_VAL:
+ prt_printf(out, "closed");
+ break;
+ default:
+ prt_printf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s);
+ break;
+ }
+
+ prt_newline(out);
+
+ for (seq = journal_cur_seq(j);
+ seq >= journal_last_unwritten_seq(j);
+ --seq) {
+ i = seq & JOURNAL_BUF_MASK;
+
+ prt_printf(out, "unwritten entry:");
+ prt_tab(out);
+ prt_printf(out, "%llu", seq);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "refcount:");
+ prt_tab(out);
+ prt_printf(out, "%u", journal_state_count(s, i));
+ prt_newline(out);
+
+ prt_printf(out, "sectors:");
+ prt_tab(out);
+ prt_printf(out, "%u", j->buf[i].sectors);
+ prt_newline(out);
+
+ prt_printf(out, "expires");
+ prt_tab(out);
+ prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies);
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+ }
+
+ prt_printf(out,
+ "replay done:\t\t%i\n",
+ test_bit(JOURNAL_REPLAY_DONE, &j->flags));
+
+ prt_printf(out, "space:\n");
+ prt_printf(out, "\tdiscarded\t%u:%u\n",
+ j->space[journal_space_discarded].next_entry,
+ j->space[journal_space_discarded].total);
+ prt_printf(out, "\tclean ondisk\t%u:%u\n",
+ j->space[journal_space_clean_ondisk].next_entry,
+ j->space[journal_space_clean_ondisk].total);
+ prt_printf(out, "\tclean\t\t%u:%u\n",
+ j->space[journal_space_clean].next_entry,
+ j->space[journal_space_clean].total);
+ prt_printf(out, "\ttotal\t\t%u:%u\n",
+ j->space[journal_space_total].next_entry,
+ j->space[journal_space_total].total);
+
+ for_each_member_device_rcu(ca, c, i,
+ &c->rw_devs[BCH_DATA_journal]) {
+ struct journal_device *ja = &ca->journal;
+
+ if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d))
+ continue;
+
+ if (!ja->nr)
+ continue;
+
+ prt_printf(out, "dev %u:\n", i);
+ prt_printf(out, "\tnr\t\t%u\n", ja->nr);
+ prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size);
+ prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
+ prt_printf(out, "\tdiscard_idx\t%u\n", ja->discard_idx);
+ prt_printf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]);
+ prt_printf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]);
+ prt_printf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
+ }
+
+ rcu_read_unlock();
+
+ --out->atomic;
+}
+
+void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
+{
+ spin_lock(&j->lock);
+ __bch2_journal_debug_to_text(out, j);
+ spin_unlock(&j->lock);
+}
+
+bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
+{
+ struct journal_entry_pin_list *pin_list;
+ struct journal_entry_pin *pin;
+ unsigned i;
+
+ spin_lock(&j->lock);
+ *seq = max(*seq, j->pin.front);
+
+ if (*seq >= j->pin.back) {
+ spin_unlock(&j->lock);
+ return true;
+ }
+
+ out->atomic++;
+
+ pin_list = journal_seq_pin(j, *seq);
+
+ prt_printf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count));
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ for (i = 0; i < ARRAY_SIZE(pin_list->list); i++)
+ list_for_each_entry(pin, &pin_list->list[i], list) {
+ prt_printf(out, "\t%px %ps", pin, pin->flush);
+ prt_newline(out);
+ }
+
+ if (!list_empty(&pin_list->flushed)) {
+ prt_printf(out, "flushed:");
+ prt_newline(out);
+ }
+
+ list_for_each_entry(pin, &pin_list->flushed, list) {
+ prt_printf(out, "\t%px %ps", pin, pin->flush);
+ prt_newline(out);
+ }
+
+ printbuf_indent_sub(out, 2);
+
+ --out->atomic;
+ spin_unlock(&j->lock);
+
+ return false;
+}
+
+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
+{
+ u64 seq = 0;
+
+ while (!bch2_journal_seq_pins_to_text(out, j, &seq))
+ seq++;
+}
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
new file mode 100644
index 000000000000..2f768e11aec9
--- /dev/null
+++ b/fs/bcachefs/journal.h
@@ -0,0 +1,450 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_H
+#define _BCACHEFS_JOURNAL_H
+
+/*
+ * THE JOURNAL:
+ *
+ * The primary purpose of the journal is to log updates (insertions) to the
+ * b-tree, to avoid having to do synchronous updates to the b-tree on disk.
+ *
+ * Without the journal, the b-tree is always internally consistent on
+ * disk - and in fact, in the earliest incarnations bcache didn't have a journal
+ * but did handle unclean shutdowns by doing all index updates synchronously
+ * (with coalescing).
+ *
+ * Updates to interior nodes still happen synchronously and without the journal
+ * (for simplicity) - this may change eventually but updates to interior nodes
+ * are rare enough it's not a huge priority.
+ *
+ * This means the journal is relatively separate from the b-tree; it consists of
+ * just a list of keys and journal replay consists of just redoing those
+ * insertions in same order that they appear in the journal.
+ *
+ * PERSISTENCE:
+ *
+ * For synchronous updates (where we're waiting on the index update to hit
+ * disk), the journal entry will be written out immediately (or as soon as
+ * possible, if the write for the previous journal entry was still in flight).
+ *
+ * Synchronous updates are specified by passing a closure (@flush_cl) to
+ * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter
+ * down to the journalling code. That closure will wait on the journal write to
+ * complete (via closure_wait()).
+ *
+ * If the index update wasn't synchronous, the journal entry will be
+ * written out after 10 ms have elapsed, by default (the delay_ms field
+ * in struct journal).
+ *
+ * JOURNAL ENTRIES:
+ *
+ * A journal entry is variable size (struct jset), it's got a fixed length
+ * header and then a variable number of struct jset_entry entries.
+ *
+ * Journal entries are identified by monotonically increasing 64 bit sequence
+ * numbers - jset->seq; other places in the code refer to this sequence number.
+ *
+ * A jset_entry entry contains one or more bkeys (which is what gets inserted
+ * into the b-tree). We need a container to indicate which b-tree the key is
+ * for; also, the roots of the various b-trees are stored in jset_entry entries
+ * (one for each b-tree) - this lets us add new b-tree types without changing
+ * the on disk format.
+ *
+ * We also keep some things in the journal header that are logically part of the
+ * superblock - all the things that are frequently updated. This is for future
+ * bcache on raw flash support; the superblock (which will become another
+ * journal) can't be moved or wear leveled, so it contains just enough
+ * information to find the main journal, and the superblock only has to be
+ * rewritten when we want to move/wear level the main journal.
+ *
+ * JOURNAL LAYOUT ON DISK:
+ *
+ * The journal is written to a ringbuffer of buckets (which is kept in the
+ * superblock); the individual buckets are not necessarily contiguous on disk
+ * which means that journal entries are not allowed to span buckets, but also
+ * that we can resize the journal at runtime if desired (unimplemented).
+ *
+ * The journal buckets exist in the same pool as all the other buckets that are
+ * managed by the allocator and garbage collection - garbage collection marks
+ * the journal buckets as metadata buckets.
+ *
+ * OPEN/DIRTY JOURNAL ENTRIES:
+ *
+ * Open/dirty journal entries are journal entries that contain b-tree updates
+ * that have not yet been written out to the b-tree on disk. We have to track
+ * which journal entries are dirty, and we also have to avoid wrapping around
+ * the journal and overwriting old but still dirty journal entries with new
+ * journal entries.
+ *
+ * On disk, this is represented with the "last_seq" field of struct jset;
+ * last_seq is the first sequence number that journal replay has to replay.
+ *
+ * To avoid overwriting dirty journal entries on disk, we keep a mapping (in
+ * journal_device->seq) from each journal bucket to the highest sequence number
+ * of any journal entry it contains. Then, by comparing that against last_seq we
+ * can determine whether that journal bucket contains dirty journal entries or
+ * not.
+ *
+ * To track which journal entries are dirty, we maintain a fifo of refcounts
+ * (where each entry corresponds to a specific sequence number) - when a ref
+ * goes to 0, that journal entry is no longer dirty.
+ *
+ * Journalling of index updates is done at the same time as the b-tree itself is
+ * being modified (see btree_insert_key()); when we add the key to the journal
+ * the pending b-tree write takes a ref on the journal entry the key was added
+ * to. If a pending b-tree write would need to take refs on multiple dirty
+ * journal entries, it only keeps the ref on the oldest one (since a newer
+ * journal entry will still be replayed if an older entry was dirty).
+ *
+ * JOURNAL FILLING UP:
+ *
+ * There are two ways the journal could fill up; either we could run out of
+ * space to write to, or we could have too many open journal entries and run out
+ * of room in the fifo of refcounts. Since those refcounts are decremented
+ * without any locking we can't safely resize that fifo, so we handle it the
+ * same way.
+ *
+ * If the journal fills up, we start flushing dirty btree nodes until we can
+ * allocate space for a journal write again - preferentially flushing btree
+ * nodes that are pinning the oldest journal entries first.
+ */
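
The dirty-entry tracking described above comes down to a fifo of refcounts indexed by sequence number, whose front is journal_last_seq(). A deliberately simplified, self-contained model of that idea (editor's sketch; the real code uses j->pin, a fifo of journal_entry_pin_list, with lock-free refcounting):

/* Simplified userspace model of the refcount fifo -- illustration only. */
#include <assert.h>

#define MODEL_NR	16

struct model_journal {
	unsigned long long	front;			/* == journal_last_seq() */
	unsigned long long	back;			/* one past the newest entry */
	int			count[MODEL_NR];	/* refcount per dirty entry */
};

/* A pending btree write takes a ref on the entry its keys were added to. */
static void model_pin(struct model_journal *j, unsigned long long seq)
{
	assert(seq >= j->front && seq < j->back);
	j->count[seq % MODEL_NR]++;
}

/* Dropping the last ref lets last_seq (the fifo front) advance. */
static void model_unpin(struct model_journal *j, unsigned long long seq)
{
	j->count[seq % MODEL_NR]--;
	assert(j->count[seq % MODEL_NR] >= 0);

	while (j->front < j->back && !j->count[j->front % MODEL_NR])
		j->front++;	/* analogous to journal reclaim advancing last_seq */
}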
+
+#include <linux/hash.h>
+
+#include "journal_types.h"
+
+struct bch_fs;
+
+static inline void journal_wake(struct journal *j)
+{
+ wake_up(&j->wait);
+ closure_wake_up(&j->async_wait);
+ closure_wake_up(&j->preres_wait);
+}
+
+static inline struct journal_buf *journal_cur_buf(struct journal *j)
+{
+ return j->buf + j->reservations.idx;
+}
+
+/* Sequence number of oldest dirty journal entry */
+
+static inline u64 journal_last_seq(struct journal *j)
+{
+ return j->pin.front;
+}
+
+static inline u64 journal_cur_seq(struct journal *j)
+{
+ return atomic64_read(&j->seq);
+}
+
+static inline u64 journal_last_unwritten_seq(struct journal *j)
+{
+ return j->seq_ondisk + 1;
+}
+
+static inline int journal_state_count(union journal_res_state s, int idx)
+{
+ switch (idx) {
+ case 0: return s.buf0_count;
+ case 1: return s.buf1_count;
+ case 2: return s.buf2_count;
+ case 3: return s.buf3_count;
+ }
+ BUG();
+}
+
+static inline void journal_state_inc(union journal_res_state *s)
+{
+ s->buf0_count += s->idx == 0;
+ s->buf1_count += s->idx == 1;
+ s->buf2_count += s->idx == 2;
+ s->buf3_count += s->idx == 3;
+}
+
+/*
+ * Amount of space that will be taken up by some keys in the journal (i.e.
+ * including the jset header)
+ */
+static inline unsigned jset_u64s(unsigned u64s)
+{
+ return u64s + sizeof(struct jset_entry) / sizeof(u64);
+}
+
+static inline int journal_entry_overhead(struct journal *j)
+{
+ return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved;
+}
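
jset_u64s() accounts for the per-entry header on top of the caller's keys, and journal_entry_overhead() for the jset header plus standing reservations. A quick worked example (editor's illustration; the header size is an assumption):

/*
 * Illustration only: struct jset_entry is assumed to be 8 bytes (one u64)
 * of header, so reserving room for a key of 6 u64s costs
 *
 *   jset_u64s(6) = 6 + sizeof(struct jset_entry) / sizeof(u64) = 6 + 1 = 7 u64s
 *
 * and each empty "btree_keys" entry emitted by bch2_journal_res_put() below
 * to burn leftover space costs jset_u64s(0) = 1 u64.
 */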
+
+static inline struct jset_entry *
+bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
+{
+ struct jset *jset = buf->data;
+ struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s));
+
+ memset(entry, 0, sizeof(*entry));
+ entry->u64s = cpu_to_le16(u64s);
+
+ le32_add_cpu(&jset->u64s, jset_u64s(u64s));
+
+ return entry;
+}
+
+static inline struct jset_entry *
+journal_res_entry(struct journal *j, struct journal_res *res)
+{
+ return vstruct_idx(j->buf[res->idx].data, res->offset);
+}
+
+static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type,
+ enum btree_id id, unsigned level,
+ unsigned u64s)
+{
+ entry->u64s = cpu_to_le16(u64s);
+ entry->btree_id = id;
+ entry->level = level;
+ entry->type = type;
+ entry->pad[0] = 0;
+ entry->pad[1] = 0;
+ entry->pad[2] = 0;
+ return jset_u64s(u64s);
+}
+
+static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
+ enum btree_id id, unsigned level,
+ const void *data, unsigned u64s)
+{
+ unsigned ret = journal_entry_init(entry, type, id, level, u64s);
+
+ memcpy_u64s_small(entry->_data, data, u64s);
+ return ret;
+}
+
+static inline struct jset_entry *
+bch2_journal_add_entry(struct journal *j, struct journal_res *res,
+ unsigned type, enum btree_id id,
+ unsigned level, unsigned u64s)
+{
+ struct jset_entry *entry = journal_res_entry(j, res);
+ unsigned actual = journal_entry_init(entry, type, id, level, u64s);
+
+ EBUG_ON(!res->ref);
+ EBUG_ON(actual > res->u64s);
+
+ res->offset += actual;
+ res->u64s -= actual;
+ return entry;
+}
+
+static inline bool journal_entry_empty(struct jset *j)
+{
+ struct jset_entry *i;
+
+ if (j->seq != j->last_seq)
+ return false;
+
+ vstruct_for_each(j, i)
+ if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s)
+ return false;
+ return true;
+}
+
+/*
+ * Drop a reference on a buffer index and return the new reservation state, so
+ * the caller can check whether the count has hit zero.
+ */
+static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx)
+{
+ union journal_res_state s;
+
+ s.v = atomic64_sub_return(((union journal_res_state) {
+ .buf0_count = idx == 0,
+ .buf1_count = idx == 1,
+ .buf2_count = idx == 2,
+ .buf3_count = idx == 3,
+ }).v, &j->reservations.counter);
+ return s;
+}
+
+bool bch2_journal_entry_close(struct journal *);
+void bch2_journal_buf_put_final(struct journal *, u64, bool);
+
+static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+{
+ union journal_res_state s;
+
+ s = journal_state_buf_put(j, idx);
+ if (!journal_state_count(s, idx))
+ bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+}
+
+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+{
+ union journal_res_state s;
+
+ s = journal_state_buf_put(j, idx);
+ if (!journal_state_count(s, idx)) {
+ spin_lock(&j->lock);
+ bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+ spin_unlock(&j->lock);
+ }
+}
+
+/*
+ * This function releases the journal write structure so other threads can
+ * then proceed to add their keys as well.
+ */
+static inline void bch2_journal_res_put(struct journal *j,
+ struct journal_res *res)
+{
+ if (!res->ref)
+ return;
+
+ lock_release(&j->res_map, _THIS_IP_);
+
+ while (res->u64s)
+ bch2_journal_add_entry(j, res,
+ BCH_JSET_ENTRY_btree_keys,
+ 0, 0, 0);
+
+ bch2_journal_buf_put(j, res->idx, res->seq);
+
+ res->ref = 0;
+}
+
+int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
+ unsigned);
+
+/* First bits for BCH_WATERMARK: */
+enum journal_res_flags {
+ __JOURNAL_RES_GET_NONBLOCK = BCH_WATERMARK_BITS,
+ __JOURNAL_RES_GET_CHECK,
+};
+
+#define JOURNAL_RES_GET_NONBLOCK (1 << __JOURNAL_RES_GET_NONBLOCK)
+#define JOURNAL_RES_GET_CHECK (1 << __JOURNAL_RES_GET_CHECK)
+
+static inline int journal_res_get_fast(struct journal *j,
+ struct journal_res *res,
+ unsigned flags)
+{
+ union journal_res_state old, new;
+ u64 v = atomic64_read(&j->reservations.counter);
+
+ do {
+ old.v = new.v = v;
+
+ /*
+ * Check if there is still room in the current journal
+ * entry:
+ */
+ if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
+ return 0;
+
+ EBUG_ON(!journal_state_count(new, new.idx));
+
+ if ((flags & BCH_WATERMARK_MASK) < j->watermark)
+ return 0;
+
+ new.cur_entry_offset += res->u64s;
+ journal_state_inc(&new);
+
+ /*
+ * If the refcount would overflow, we have to wait:
+ * XXX - tracepoint this:
+ */
+ if (!journal_state_count(new, new.idx))
+ return 0;
+
+ if (flags & JOURNAL_RES_GET_CHECK)
+ return 1;
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
+
+ res->ref = true;
+ res->idx = old.idx;
+ res->offset = old.cur_entry_offset;
+ res->seq = le64_to_cpu(j->buf[old.idx].data->seq);
+ return 1;
+}
+
+static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
+ unsigned u64s, unsigned flags)
+{
+ int ret;
+
+ EBUG_ON(res->ref);
+ EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
+
+ res->u64s = u64s;
+
+ if (journal_res_get_fast(j, res, flags))
+ goto out;
+
+ ret = bch2_journal_res_get_slowpath(j, res, flags);
+ if (ret)
+ return ret;
+out:
+ if (!(flags & JOURNAL_RES_GET_CHECK)) {
+ lock_acquire_shared(&j->res_map, 0,
+ (flags & JOURNAL_RES_GET_NONBLOCK) != 0,
+ NULL, _THIS_IP_);
+ EBUG_ON(!res->ref);
+ }
+ return 0;
+}
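
Putting the reservation helpers together, journalling a key looks roughly like: size the request with jset_u64s(), get the reservation, initialize an entry and copy the key in, then put the reservation so the entry becomes part of the next write. An editor's sketch using only helpers declared in this header (error handling and NONBLOCK/watermark flags omitted; real callers go through the btree transaction commit path):

/* Hypothetical caller -- illustration only, not part of the patch. */
static int example_journal_key(struct journal *j, enum btree_id btree,
			       unsigned level, const struct bkey_i *k)
{
	struct journal_res res = { 0 };
	struct jset_entry *entry;
	int ret;

	ret = bch2_journal_res_get(j, &res, jset_u64s(k->k.u64s), 0);
	if (ret)
		return ret;

	entry = bch2_journal_add_entry(j, &res, BCH_JSET_ENTRY_btree_keys,
				       btree, level, k->k.u64s);
	memcpy_u64s_small(entry->_data, k, k->k.u64s);

	bch2_journal_res_put(j, &res);	/* entry now queued for the next journal write */
	return 0;
}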
+
+/* journal_entry_res: */
+
+void bch2_journal_entry_res_resize(struct journal *,
+ struct journal_entry_res *,
+ unsigned);
+
+int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
+void bch2_journal_flush_async(struct journal *, struct closure *);
+
+int bch2_journal_flush_seq(struct journal *, u64);
+int bch2_journal_flush(struct journal *);
+bool bch2_journal_noflush_seq(struct journal *, u64);
+int bch2_journal_meta(struct journal *);
+
+void bch2_journal_halt(struct journal *);
+
+static inline int bch2_journal_error(struct journal *j)
+{
+ return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL
+ ? -EIO : 0;
+}
+
+struct bch_dev;
+
+static inline void bch2_journal_set_replay_done(struct journal *j)
+{
+ BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
+ set_bit(JOURNAL_REPLAY_DONE, &j->flags);
+}
+
+void bch2_journal_unblock(struct journal *);
+void bch2_journal_block(struct journal *);
+
+void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
+void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
+void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
+bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
+
+int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
+ unsigned nr);
+int bch2_dev_journal_alloc(struct bch_dev *);
+int bch2_fs_journal_alloc(struct bch_fs *);
+
+void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
+
+void bch2_fs_journal_stop(struct journal *);
+int bch2_fs_journal_start(struct journal *, u64);
+
+void bch2_dev_journal_exit(struct bch_dev *);
+int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
+void bch2_fs_journal_exit(struct journal *);
+int bch2_fs_journal_init(struct journal *);
+
+#endif /* _BCACHEFS_JOURNAL_H */
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
new file mode 100644
index 000000000000..3eb6c3f62a81
--- /dev/null
+++ b/fs/bcachefs/journal_io.c
@@ -0,0 +1,1966 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "btree_io.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "disk_groups.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "replicas.h"
+#include "sb-clean.h"
+#include "trace.h"
+
+static struct nonce journal_nonce(const struct jset *jset)
+{
+ return (struct nonce) {{
+ [0] = 0,
+ [1] = ((__le32 *) &jset->seq)[0],
+ [2] = ((__le32 *) &jset->seq)[1],
+ [3] = BCH_NONCE_JOURNAL,
+ }};
+}
+
+static bool jset_csum_good(struct bch_fs *c, struct jset *j)
+{
+ return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) &&
+ !bch2_crc_cmp(j->csum,
+ csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j));
+}
+
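+/*
+ * Journal entries are indexed in the genradix by their sequence number
+ * relative to journal_entries_base_seq, masked down to 31 bits so the
+ * index fits in an unsigned long (see journal_entry_add() below):
+ */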
+static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
+{
+ return (seq - c->journal_entries_base_seq) & (~0U >> 1);
+}
+
+static void __journal_replay_free(struct bch_fs *c,
+ struct journal_replay *i)
+{
+ struct journal_replay **p =
+ genradix_ptr(&c->journal_entries,
+ journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));
+
+ BUG_ON(*p != i);
+ *p = NULL;
+ kvpfree(i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&i->j));
+}
+
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+{
+ i->ignore = true;
+
+ if (!c->opts.read_entire_journal)
+ __journal_replay_free(c, i);
+}
+
+struct journal_list {
+ struct closure cl;
+ u64 last_seq;
+ struct mutex lock;
+ int ret;
+};
+
+#define JOURNAL_ENTRY_ADD_OK 0
+#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5
+
+/*
+ * Given a journal entry we just read, add it to the list of journal entries to
+ * be replayed:
+ */
+static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
+ struct journal_ptr entry_ptr,
+ struct journal_list *jlist, struct jset *j)
+{
+ struct genradix_iter iter;
+ struct journal_replay **_i, *i, *dup;
+ struct journal_ptr *ptr;
+ size_t bytes = vstruct_bytes(j);
+ u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
+ int ret = JOURNAL_ENTRY_ADD_OK;
+
+ /* Is this entry older than the range we need? */
+ if (!c->opts.read_entire_journal &&
+ le64_to_cpu(j->seq) < jlist->last_seq)
+ return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+
+ /*
+ * genradixes are indexed by a ulong, not a u64, so we can't index them
+ * by sequence number directly: Assume instead that they will all fall
+ * within the range of +-2 billion of the first one we find.
+ */
+ if (!c->journal_entries_base_seq)
+ c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);
+
+ /* Drop entries we don't need anymore */
+ if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
+ genradix_for_each_from(&c->journal_entries, iter, _i,
+ journal_entry_radix_idx(c, jlist->last_seq)) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ if (le64_to_cpu(i->j.seq) >= last_seq)
+ break;
+ journal_replay_free(c, i);
+ }
+ }
+
+ jlist->last_seq = max(jlist->last_seq, last_seq);
+
+ _i = genradix_ptr_alloc(&c->journal_entries,
+ journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
+ GFP_KERNEL);
+ if (!_i)
+ return -BCH_ERR_ENOMEM_journal_entry_add;
+
+ /*
+ * Duplicate journal entries? If so we want the one that didn't have a
+ * checksum error:
+ */
+ dup = *_i;
+ if (dup) {
+ if (bytes == vstruct_bytes(&dup->j) &&
+ !memcmp(j, &dup->j, bytes)) {
+ i = dup;
+ goto found;
+ }
+
+ if (!entry_ptr.csum_good) {
+ i = dup;
+ goto found;
+ }
+
+ if (!dup->csum_good)
+ goto replace;
+
+ fsck_err(c, journal_entry_replicas_data_mismatch,
+ "found duplicate but non identical journal entries (seq %llu)",
+ le64_to_cpu(j->seq));
+ i = dup;
+ goto found;
+ }
+replace:
+ i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+ if (!i)
+ return -BCH_ERR_ENOMEM_journal_entry_add;
+
+ i->nr_ptrs = 0;
+ i->csum_good = entry_ptr.csum_good;
+ i->ignore = false;
+ unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
+ i->ptrs[i->nr_ptrs++] = entry_ptr;
+
+ if (dup) {
+ if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) {
+ bch_err(c, "found too many copies of journal entry %llu",
+ le64_to_cpu(i->j.seq));
+ dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1;
+ }
+
+ /* The first ptr should represent the jset we kept: */
+ memcpy(i->ptrs + i->nr_ptrs,
+ dup->ptrs,
+ sizeof(dup->ptrs[0]) * dup->nr_ptrs);
+ i->nr_ptrs += dup->nr_ptrs;
+ __journal_replay_free(c, dup);
+ }
+
+ *_i = i;
+ return 0;
+found:
+ for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
+ if (ptr->dev == ca->dev_idx) {
+ bch_err(c, "duplicate journal entry %llu on same device",
+ le64_to_cpu(i->j.seq));
+ goto out;
+ }
+ }
+
+ if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
+ bch_err(c, "found too many copies of journal entry %llu",
+ le64_to_cpu(i->j.seq));
+ goto out;
+ }
+
+ i->ptrs[i->nr_ptrs++] = entry_ptr;
+out:
+fsck_err:
+ return ret;
+}
+
+/* this fills in a range with empty jset_entries: */
+static void journal_entry_null_range(void *start, void *end)
+{
+ struct jset_entry *entry;
+
+ for (entry = start; entry != end; entry = vstruct_next(entry))
+ memset(entry, 0, sizeof(*entry));
+}
+
+#define JOURNAL_ENTRY_REREAD 5
+#define JOURNAL_ENTRY_NONE 6
+#define JOURNAL_ENTRY_BAD 7
+
+static void journal_entry_err_msg(struct printbuf *out,
+ u32 version,
+ struct jset *jset,
+ struct jset_entry *entry)
+{
+ prt_str(out, "invalid journal entry, version=");
+ bch2_version_to_text(out, version);
+
+ if (entry) {
+ prt_str(out, " type=");
+ prt_str(out, bch2_jset_entry_types[entry->type]);
+ }
+
+ if (!jset) {
+ prt_printf(out, " in superblock");
+ } else {
+ prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));
+
+ if (entry)
+ prt_printf(out, " offset=%zi/%u",
+ (u64 *) entry - jset->_data,
+ le32_to_cpu(jset->u64s));
+ }
+
+ prt_str(out, ": ");
+}
+
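+/*
+ * On the read side, journal entry errors are treated as fixable fsck
+ * errors; on the write side they mean we're about to write out corrupt
+ * metadata, so we count the error in the superblock, log it, and abort if
+ * bch2_fs_inconsistent() says we can't continue.
+ */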
+#define journal_entry_err(c, version, jset, entry, _err, msg, ...) \
+({ \
+ struct printbuf _buf = PRINTBUF; \
+ \
+ journal_entry_err_msg(&_buf, version, jset, entry); \
+ prt_printf(&_buf, msg, ##__VA_ARGS__); \
+ \
+ switch (flags & BKEY_INVALID_WRITE) { \
+ case READ: \
+ mustfix_fsck_err(c, _err, "%s", _buf.buf); \
+ break; \
+ case WRITE: \
+ bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \
+ bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\
+ if (bch2_fs_inconsistent(c)) { \
+ ret = -BCH_ERR_fsck_errors_not_fixed; \
+ goto fsck_err; \
+ } \
+ break; \
+ } \
+ \
+ printbuf_exit(&_buf); \
+ true; \
+})
+
+#define journal_entry_err_on(cond, ...) \
+ ((cond) ? journal_entry_err(__VA_ARGS__) : false)
+
+#define FSCK_DELETED_KEY 5
+
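+/*
+ * Validate a single key in a journal entry; keys that fail validation are
+ * dropped from the entry (the remainder is shifted down and the freed space
+ * nulled out) and FSCK_DELETED_KEY is returned so the caller can keep going.
+ */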
+static int journal_validate_key(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned level, enum btree_id btree_id,
+ struct bkey_i *k,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ int write = flags & BKEY_INVALID_WRITE;
+ void *next = vstruct_next(entry);
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (journal_entry_err_on(!k->k.u64s,
+ c, version, jset, entry,
+ journal_entry_bkey_u64s_0,
+ "k->u64s 0")) {
+ entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
+ journal_entry_null_range(vstruct_next(entry), next);
+ return FSCK_DELETED_KEY;
+ }
+
+ if (journal_entry_err_on((void *) bkey_next(k) >
+ (void *) vstruct_next(entry),
+ c, version, jset, entry,
+ journal_entry_bkey_past_end,
+ "extends past end of journal entry")) {
+ entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
+ journal_entry_null_range(vstruct_next(entry), next);
+ return FSCK_DELETED_KEY;
+ }
+
+ if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
+ c, version, jset, entry,
+ journal_entry_bkey_bad_format,
+ "bad format %u", k->k.format)) {
+ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
+ memmove(k, bkey_next(k), next - (void *) bkey_next(k));
+ journal_entry_null_range(vstruct_next(entry), next);
+ return FSCK_DELETED_KEY;
+ }
+
+ if (!write)
+ bch2_bkey_compat(level, btree_id, version, big_endian,
+ write, NULL, bkey_to_packed(k));
+
+ if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
+ __btree_node_type(level, btree_id), write, &buf)) {
+ printbuf_reset(&buf);
+ journal_entry_err_msg(&buf, version, jset, entry);
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+ prt_newline(&buf);
+ bch2_bkey_invalid(c, bkey_i_to_s_c(k),
+ __btree_node_type(level, btree_id), write, &buf);
+
+ mustfix_fsck_err(c, journal_entry_bkey_invalid,
+ "%s", buf.buf);
+
+ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
+ memmove(k, bkey_next(k), next - (void *) bkey_next(k));
+ journal_entry_null_range(vstruct_next(entry), next);
+
+ printbuf_exit(&buf);
+ return FSCK_DELETED_KEY;
+ }
+
+ if (write)
+ bch2_bkey_compat(level, btree_id, version, big_endian,
+ write, NULL, bkey_to_packed(k));
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int journal_entry_btree_keys_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ struct bkey_i *k = entry->start;
+
+ while (k != vstruct_last(entry)) {
+ int ret = journal_validate_key(c, jset, entry,
+ entry->level,
+ entry->btree_id,
+ k, version, big_endian,
+ flags|BKEY_INVALID_JOURNAL);
+ if (ret == FSCK_DELETED_KEY)
+ continue;
+
+ k = bkey_next(k);
+ }
+
+ return 0;
+}
+
+static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct bkey_i *k;
+ bool first = true;
+
+ jset_entry_for_each_key(entry, k) {
+ if (!first) {
+ prt_newline(out);
+ prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ }
+ prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level);
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
+ first = false;
+ }
+}
+
+static int journal_entry_btree_root_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ struct bkey_i *k = entry->start;
+ int ret = 0;
+
+ if (journal_entry_err_on(!entry->u64s ||
+ le16_to_cpu(entry->u64s) != k->k.u64s,
+ c, version, jset, entry,
+ journal_entry_btree_root_bad_size,
+ "invalid btree root journal entry: wrong number of keys")) {
+ void *next = vstruct_next(entry);
+ /*
+ * we don't want to null out this jset_entry,
+ * just the contents, so that later we can tell
+ * we were _supposed_ to have a btree root
+ */
+ entry->u64s = 0;
+ journal_entry_null_range(vstruct_next(entry), next);
+ return 0;
+ }
+
+ ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
+ version, big_endian, flags);
+ if (ret == FSCK_DELETED_KEY)
+ ret = 0;
+fsck_err:
+ return ret;
+}
+
+static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ journal_entry_btree_keys_to_text(out, c, entry);
+}
+
+static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ /* obsolete, don't care: */
+ return 0;
+}
+
+static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+}
+
+static int journal_entry_blacklist_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ int ret = 0;
+
+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
+ c, version, jset, entry,
+ journal_entry_blacklist_bad_size,
+ "invalid journal seq blacklist entry: bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ }
+fsck_err:
+ return ret;
+}
+
+static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_blacklist *bl =
+ container_of(entry, struct jset_entry_blacklist, entry);
+
+ prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
+}
+
+static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ struct jset_entry_blacklist_v2 *bl_entry;
+ int ret = 0;
+
+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
+ c, version, jset, entry,
+ journal_entry_blacklist_v2_bad_size,
+ "invalid journal seq blacklist entry: bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ goto out;
+ }
+
+ bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+ if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
+ le64_to_cpu(bl_entry->end),
+ c, version, jset, entry,
+ journal_entry_blacklist_v2_start_past_end,
+ "invalid journal seq blacklist entry: start > end")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ }
+out:
+fsck_err:
+ return ret;
+}
+
+static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_blacklist_v2 *bl =
+ container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+ prt_printf(out, "start=%llu end=%llu",
+ le64_to_cpu(bl->start),
+ le64_to_cpu(bl->end));
+}
+
+static int journal_entry_usage_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ struct jset_entry_usage *u =
+ container_of(entry, struct jset_entry_usage, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes < sizeof(*u),
+ c, version, jset, entry,
+ journal_entry_usage_bad_size,
+ "invalid journal entry usage: bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+fsck_err:
+ return ret;
+}
+
+static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_usage *u =
+ container_of(entry, struct jset_entry_usage, entry);
+
+ prt_printf(out, "type=%s v=%llu",
+ bch2_fs_usage_types[u->entry.btree_id],
+ le64_to_cpu(u->v));
+}
+
+static int journal_entry_data_usage_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ struct jset_entry_data_usage *u =
+ container_of(entry, struct jset_entry_data_usage, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ struct printbuf err = PRINTBUF;
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes < sizeof(*u) ||
+ bytes < sizeof(*u) + u->r.nr_devs,
+ c, version, jset, entry,
+ journal_entry_data_usage_bad_size,
+ "invalid journal entry usage: bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ goto out;
+ }
+
+ if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err),
+ c, version, jset, entry,
+ journal_entry_data_usage_bad_size,
+ "invalid journal entry usage: %s", err.buf)) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ goto out;
+ }
+out:
+fsck_err:
+ printbuf_exit(&err);
+ return ret;
+}
+
+static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_data_usage *u =
+ container_of(entry, struct jset_entry_data_usage, entry);
+
+ bch2_replicas_entry_to_text(out, &u->r);
+ prt_printf(out, "=%llu", le64_to_cpu(u->v));
+}
+
+static int journal_entry_clock_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes != sizeof(*clock),
+ c, version, jset, entry,
+ journal_entry_clock_bad_size,
+ "bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ if (journal_entry_err_on(clock->rw > 1,
+ c, version, jset, entry,
+ journal_entry_clock_bad_rw,
+ "bad rw")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+fsck_err:
+ return ret;
+}
+
+static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+
+ prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
+}
+
+static int journal_entry_dev_usage_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ struct jset_entry_dev_usage *u =
+ container_of(entry, struct jset_entry_dev_usage, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ unsigned expected = sizeof(*u);
+ unsigned dev;
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes < expected,
+ c, version, jset, entry,
+ journal_entry_dev_usage_bad_size,
+ "bad size (%u < %u)",
+ bytes, expected)) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ dev = le32_to_cpu(u->dev);
+
+ if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
+ c, version, jset, entry,
+ journal_entry_dev_usage_bad_dev,
+ "bad dev")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ if (journal_entry_err_on(u->pad,
+ c, version, jset, entry,
+ journal_entry_dev_usage_bad_pad,
+ "bad pad")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+fsck_err:
+ return ret;
+}
+
+static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_dev_usage *u =
+ container_of(entry, struct jset_entry_dev_usage, entry);
+ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
+
+ prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
+
+ for (i = 0; i < nr_types; i++) {
+ if (i < BCH_DATA_NR)
+ prt_printf(out, " %s", bch2_data_types[i]);
+ else
+ prt_printf(out, " (unknown data type %u)", i);
+ prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
+ le64_to_cpu(u->d[i].buckets),
+ le64_to_cpu(u->d[i].sectors),
+ le64_to_cpu(u->d[i].fragmented));
+ }
+
+ prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec));
+}
+
+static int journal_entry_log_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ return 0;
+}
+
+static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
+ unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);
+
+ prt_printf(out, "%.*s", bytes, l->d);
+}
+
+static int journal_entry_overwrite_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ return journal_entry_btree_keys_validate(c, jset, entry,
+ version, big_endian, READ);
+}
+
+static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ journal_entry_btree_keys_to_text(out, c, entry);
+}
+
+struct jset_entry_ops {
+ int (*validate)(struct bch_fs *, struct jset *,
+ struct jset_entry *, unsigned, int,
+ enum bkey_invalid_flags);
+ void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
+};
+
+static const struct jset_entry_ops bch2_jset_entry_ops[] = {
+#define x(f, nr) \
+ [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \
+ .validate = journal_entry_##f##_validate, \
+ .to_text = journal_entry_##f##_to_text, \
+ },
+ BCH_JSET_ENTRY_TYPES()
+#undef x
+};
+
+int bch2_journal_entry_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ return entry->type < BCH_JSET_ENTRY_NR
+ ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
+ version, big_endian, flags)
+ : 0;
+}
+
+void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ if (entry->type < BCH_JSET_ENTRY_NR) {
+ prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
+ } else {
+ prt_printf(out, "(unknown type %u)", entry->type);
+ }
+}
+
+static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
+ enum bkey_invalid_flags flags)
+{
+ struct jset_entry *entry;
+ unsigned version = le32_to_cpu(jset->version);
+ int ret = 0;
+
+ vstruct_for_each(jset, entry) {
+ if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
+ c, version, jset, entry,
+ journal_entry_past_jset_end,
+ "journal entry extends past end of jset")) {
+ jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
+ break;
+ }
+
+ ret = bch2_journal_entry_validate(c, jset, entry,
+ version, JSET_BIG_ENDIAN(jset), flags);
+ if (ret)
+ break;
+ }
+fsck_err:
+ return ret;
+}
+
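+/*
+ * Full validation of a journal entry, done after the initial read pass:
+ * checks magic, version, checksum type and last_seq, then validates each
+ * jset_entry it contains.
+ */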
+static int jset_validate(struct bch_fs *c,
+ struct bch_dev *ca,
+ struct jset *jset, u64 sector,
+ enum bkey_invalid_flags flags)
+{
+ unsigned version;
+ int ret = 0;
+
+ if (le64_to_cpu(jset->magic) != jset_magic(c))
+ return JOURNAL_ENTRY_NONE;
+
+ version = le32_to_cpu(jset->version);
+ if (journal_entry_err_on(!bch2_version_compatible(version),
+ c, version, jset, NULL,
+ jset_unsupported_version,
+ "%s sector %llu seq %llu: incompatible journal entry version %u.%u",
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq),
+ BCH_VERSION_MAJOR(version),
+ BCH_VERSION_MINOR(version))) {
+ /* don't try to continue: */
+ return -EINVAL;
+ }
+
+ if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
+ c, version, jset, NULL,
+ jset_unknown_csum,
+ "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq),
+ JSET_CSUM_TYPE(jset)))
+ ret = JOURNAL_ENTRY_BAD;
+
+ /* last_seq is ignored when JSET_NO_FLUSH is true */
+ if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
+ le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
+ c, version, jset, NULL,
+ jset_last_seq_newer_than_seq,
+ "invalid journal entry: last_seq > seq (%llu > %llu)",
+ le64_to_cpu(jset->last_seq),
+ le64_to_cpu(jset->seq))) {
+ jset->last_seq = jset->seq;
+ return JOURNAL_ENTRY_BAD;
+ }
+
+ ret = jset_validate_entries(c, jset, flags);
+fsck_err:
+ return ret;
+}
+
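+/*
+ * Minimal validation done while scanning journal buckets, before we know
+ * the checksum is good: check magic and version, ask for a re-read with a
+ * bigger buffer if the entry extends past what we've read so far, and clamp
+ * entries that claim to extend past the end of the bucket.
+ */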
+static int jset_validate_early(struct bch_fs *c,
+ struct bch_dev *ca,
+ struct jset *jset, u64 sector,
+ unsigned bucket_sectors_left,
+ unsigned sectors_read)
+{
+ size_t bytes = vstruct_bytes(jset);
+ unsigned version;
+ enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
+ int ret = 0;
+
+ if (le64_to_cpu(jset->magic) != jset_magic(c))
+ return JOURNAL_ENTRY_NONE;
+
+ version = le32_to_cpu(jset->version);
+ if (journal_entry_err_on(!bch2_version_compatible(version),
+ c, version, jset, NULL,
+ jset_unsupported_version,
+ "%s sector %llu seq %llu: unknown journal entry version %u.%u",
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq),
+ BCH_VERSION_MAJOR(version),
+ BCH_VERSION_MINOR(version))) {
+ /* don't try to continue: */
+ return -EINVAL;
+ }
+
+ if (bytes > (sectors_read << 9) &&
+ sectors_read < bucket_sectors_left)
+ return JOURNAL_ENTRY_REREAD;
+
+ if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
+ c, version, jset, NULL,
+ jset_past_bucket_end,
+ "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq), bytes))
+ le32_add_cpu(&jset->u64s,
+ -((bytes - (bucket_sectors_left << 9)) / 8));
+fsck_err:
+ return ret;
+}
+
+struct journal_read_buf {
+ void *data;
+ size_t size;
+};
+
+static int journal_read_buf_realloc(struct journal_read_buf *b,
+ size_t new_size)
+{
+ void *n;
+
+ /* the bios are sized for this many pages, max: */
+ if (new_size > JOURNAL_ENTRY_SIZE_MAX)
+ return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
+
+ new_size = roundup_pow_of_two(new_size);
+ n = kvpmalloc(new_size, GFP_KERNEL);
+ if (!n)
+ return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
+
+ kvpfree(b->data, b->size);
+ b->data = n;
+ b->size = new_size;
+ return 0;
+}
+
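+/*
+ * Read all the journal entries in a single journal bucket, adding each one
+ * found to the list of entries to be replayed; the read buffer is grown as
+ * needed when an entry doesn't fit.
+ */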
+static int journal_read_bucket(struct bch_dev *ca,
+ struct journal_read_buf *buf,
+ struct journal_list *jlist,
+ unsigned bucket)
+{
+ struct bch_fs *c = ca->fs;
+ struct journal_device *ja = &ca->journal;
+ struct jset *j = NULL;
+ unsigned sectors, sectors_read = 0;
+ u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
+ end = offset + ca->mi.bucket_size;
+ bool saw_bad = false, csum_good;
+ int ret = 0;
+
+ pr_debug("reading %u", bucket);
+
+ while (offset < end) {
+ if (!sectors_read) {
+ struct bio *bio;
+ unsigned nr_bvecs;
+reread:
+ sectors_read = min_t(unsigned,
+ end - offset, buf->size >> 9);
+ nr_bvecs = buf_pages(buf->data, sectors_read << 9);
+
+ bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
+ bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);
+
+ bio->bi_iter.bi_sector = offset;
+ bch2_bio_map(bio, buf->data, sectors_read << 9);
+
+ ret = submit_bio_wait(bio);
+ kfree(bio);
+
+ if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
+ "journal read error: sector %llu",
+ offset) ||
+ bch2_meta_read_fault("journal")) {
+ /*
+ * We don't error out of the recovery process
+ * here, since the relevant journal entry may be
+ * found on a different device, and missing or
+ * no journal entries will be handled later
+ */
+ return 0;
+ }
+
+ j = buf->data;
+ }
+
+ ret = jset_validate_early(c, ca, j, offset,
+ end - offset, sectors_read);
+ switch (ret) {
+ case 0:
+ sectors = vstruct_sectors(j, c->block_bits);
+ break;
+ case JOURNAL_ENTRY_REREAD:
+ if (vstruct_bytes(j) > buf->size) {
+ ret = journal_read_buf_realloc(buf,
+ vstruct_bytes(j));
+ if (ret)
+ return ret;
+ }
+ goto reread;
+ case JOURNAL_ENTRY_NONE:
+ if (!saw_bad)
+ return 0;
+ /*
+ * On checksum error we don't really trust the size
+ * field of the journal entry we read, so try reading
+ * again at next block boundary:
+ */
+ sectors = block_sectors(c);
+ goto next_block;
+ default:
+ return ret;
+ }
+
+ /*
+ * This happens sometimes if we don't have discards on -
+ * when we've partially overwritten a bucket with new
+ * journal entries. We don't need the rest of the
+ * bucket:
+ */
+ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
+ return 0;
+
+ ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
+
+ csum_good = jset_csum_good(c, j);
+ if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
+ "journal checksum error"))
+ saw_bad = true;
+
+ ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
+ j->encrypted_start,
+ vstruct_end(j) - (void *) j->encrypted_start);
+ bch2_fs_fatal_err_on(ret, c,
+ "error decrypting journal entry: %i", ret);
+
+ mutex_lock(&jlist->lock);
+ ret = journal_entry_add(c, ca, (struct journal_ptr) {
+ .csum_good = csum_good,
+ .dev = ca->dev_idx,
+ .bucket = bucket,
+ .bucket_offset = offset -
+ bucket_to_sector(ca, ja->buckets[bucket]),
+ .sector = offset,
+ }, jlist, j);
+ mutex_unlock(&jlist->lock);
+
+ switch (ret) {
+ case JOURNAL_ENTRY_ADD_OK:
+ break;
+ case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
+ break;
+ default:
+ return ret;
+ }
+next_block:
+ pr_debug("next");
+ offset += sectors;
+ sectors_read -= sectors;
+ j = ((void *) j) + (sectors << 9);
+ }
+
+ return 0;
+}
+
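+/*
+ * Per-device journal read, run as a closure under the parent journal_list
+ * closure: read every journal bucket on this device, then set cur_idx and
+ * sectors_free from the most recent entry found here so new journal writes
+ * continue where the old ones left off.
+ */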
+static CLOSURE_CALLBACK(bch2_journal_read_device)
+{
+ closure_type(ja, struct journal_device, read);
+ struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
+ struct bch_fs *c = ca->fs;
+ struct journal_list *jlist =
+ container_of(cl->parent, struct journal_list, cl);
+ struct journal_replay *r, **_r;
+ struct genradix_iter iter;
+ struct journal_read_buf buf = { NULL, 0 };
+ unsigned i;
+ int ret = 0;
+
+ if (!ja->nr)
+ goto out;
+
+ ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
+ if (ret)
+ goto err;
+
+ pr_debug("%u journal buckets", ja->nr);
+
+ for (i = 0; i < ja->nr; i++) {
+ ret = journal_read_bucket(ca, &buf, jlist, i);
+ if (ret)
+ goto err;
+ }
+
+ ja->sectors_free = ca->mi.bucket_size;
+
+ mutex_lock(&jlist->lock);
+ genradix_for_each_reverse(&c->journal_entries, iter, _r) {
+ r = *_r;
+
+ if (!r)
+ continue;
+
+ for (i = 0; i < r->nr_ptrs; i++) {
+ if (r->ptrs[i].dev == ca->dev_idx) {
+ unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
+ vstruct_sectors(&r->j, c->block_bits);
+
+ ja->cur_idx = r->ptrs[i].bucket;
+ ja->sectors_free = ca->mi.bucket_size - wrote;
+ goto found;
+ }
+ }
+ }
+found:
+ mutex_unlock(&jlist->lock);
+
+ if (ja->bucket_seq[ja->cur_idx] &&
+ ja->sectors_free == ca->mi.bucket_size) {
+#if 0
+ /*
+ * Debug code for ZNS support, where we (probably) want to
+ * correlate where we stopped in the journal with the zone
+ * write points:
+ */
+ bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
+ bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
+ for (i = 0; i < 3; i++) {
+ unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;
+
+ bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
+ }
+#endif
+ ja->sectors_free = 0;
+ }
+
+ /*
+ * Set dirty_idx to indicate the entire journal is full and needs to be
+ * reclaimed - journal reclaim will immediately reclaim whatever isn't
+ * pinned when it first runs:
+ */
+ ja->discard_idx = ja->dirty_idx_ondisk =
+ ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
+out:
+ bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
+ kvpfree(buf.data, buf.size);
+ percpu_ref_put(&ca->io_ref);
+ closure_return(cl);
+ return;
+err:
+ mutex_lock(&jlist->lock);
+ jlist->ret = ret;
+ mutex_unlock(&jlist->lock);
+ goto out;
+}
+
+void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+ struct journal_replay *j)
+{
+ unsigned i;
+
+ for (i = 0; i < j->nr_ptrs; i++) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
+ u64 offset;
+
+ div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);
+
+ if (i)
+ prt_printf(out, " ");
+ prt_printf(out, "%u:%u:%u (sector %llu)",
+ j->ptrs[i].dev,
+ j->ptrs[i].bucket,
+ j->ptrs[i].bucket_offset,
+ j->ptrs[i].sector);
+ }
+}
+
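+/*
+ * Read the journal from every device and decide what to replay: the most
+ * recent flush entry determines *last_seq and *blacklist_seq, newer
+ * non-flush entries are dropped, gaps and blacklisted sequence numbers are
+ * checked for, and the replicas of each surviving entry are validated and
+ * marked.
+ */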
+int bch2_journal_read(struct bch_fs *c,
+ u64 *last_seq,
+ u64 *blacklist_seq,
+ u64 *start_seq)
+{
+ struct journal_list jlist;
+ struct journal_replay *i, **_i, *prev = NULL;
+ struct genradix_iter radix_iter;
+ struct bch_dev *ca;
+ unsigned iter;
+ struct printbuf buf = PRINTBUF;
+ bool degraded = false, last_write_torn = false;
+ u64 seq;
+ int ret = 0;
+
+ closure_init_stack(&jlist.cl);
+ mutex_init(&jlist.lock);
+ jlist.last_seq = 0;
+ jlist.ret = 0;
+
+ for_each_member_device(ca, c, iter) {
+ if (!c->opts.fsck &&
+ !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
+ continue;
+
+ if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
+ ca->mi.state == BCH_MEMBER_STATE_ro) &&
+ percpu_ref_tryget(&ca->io_ref))
+ closure_call(&ca->journal.read,
+ bch2_journal_read_device,
+ system_unbound_wq,
+ &jlist.cl);
+ else
+ degraded = true;
+ }
+
+ closure_sync(&jlist.cl);
+
+ if (jlist.ret)
+ return jlist.ret;
+
+ *last_seq = 0;
+ *start_seq = 0;
+ *blacklist_seq = 0;
+
+ /*
+ * Find most recent flush entry, and ignore newer non flush entries -
+ * those entries will be blacklisted:
+ */
+ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
+ enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
+
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ if (!*start_seq)
+ *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
+
+ if (JSET_NO_FLUSH(&i->j)) {
+ i->ignore = true;
+ continue;
+ }
+
+ if (!last_write_torn && !i->csum_good) {
+ last_write_torn = true;
+ i->ignore = true;
+ continue;
+ }
+
+ if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
+ c, le32_to_cpu(i->j.version), &i->j, NULL,
+ jset_last_seq_newer_than_seq,
+ "invalid journal entry: last_seq > seq (%llu > %llu)",
+ le64_to_cpu(i->j.last_seq),
+ le64_to_cpu(i->j.seq)))
+ i->j.last_seq = i->j.seq;
+
+ *last_seq = le64_to_cpu(i->j.last_seq);
+ *blacklist_seq = le64_to_cpu(i->j.seq) + 1;
+ break;
+ }
+
+ if (!*start_seq) {
+ bch_info(c, "journal read done, but no entries found");
+ return 0;
+ }
+
+ if (!*last_seq) {
+ fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes,
+ "journal read done, but no entries found after dropping non-flushes");
+ return 0;
+ }
+
+ bch_info(c, "journal read done, replaying entries %llu-%llu",
+ *last_seq, *blacklist_seq - 1);
+
+ if (*start_seq != *blacklist_seq)
+ bch_info(c, "dropped unflushed entries %llu-%llu",
+ *blacklist_seq, *start_seq - 1);
+
+ /* Drop blacklisted entries and entries older than last_seq: */
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ seq = le64_to_cpu(i->j.seq);
+ if (seq < *last_seq) {
+ journal_replay_free(c, i);
+ continue;
+ }
+
+ if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
+ fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
+ jset_seq_blacklisted,
+ "found blacklisted journal entry %llu", seq);
+ i->ignore = true;
+ }
+ }
+
+ /* Check for missing entries: */
+ seq = *last_seq;
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ BUG_ON(seq > le64_to_cpu(i->j.seq));
+
+ while (seq < le64_to_cpu(i->j.seq)) {
+ u64 missing_start, missing_end;
+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+
+ while (seq < le64_to_cpu(i->j.seq) &&
+ bch2_journal_seq_is_blacklisted(c, seq, false))
+ seq++;
+
+ if (seq == le64_to_cpu(i->j.seq))
+ break;
+
+ missing_start = seq;
+
+ while (seq < le64_to_cpu(i->j.seq) &&
+ !bch2_journal_seq_is_blacklisted(c, seq, false))
+ seq++;
+
+ if (prev) {
+ bch2_journal_ptrs_to_text(&buf1, c, prev);
+ prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
+ } else
+ prt_printf(&buf1, "(none)");
+ bch2_journal_ptrs_to_text(&buf2, c, i);
+
+ missing_end = seq - 1;
+ fsck_err(c, journal_entries_missing,
+ "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
+ " prev at %s\n"
+ " next at %s",
+ missing_start, missing_end,
+ *last_seq, *blacklist_seq - 1,
+ buf1.buf, buf2.buf);
+
+ printbuf_exit(&buf1);
+ printbuf_exit(&buf2);
+ }
+
+ prev = i;
+ seq++;
+ }
+
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
+ struct bch_replicas_padded replicas = {
+ .e.data_type = BCH_DATA_journal,
+ .e.nr_required = 1,
+ };
+ unsigned ptr;
+
+ i = *_i;
+ if (!i || i->ignore)
+ continue;
+
+ for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
+ ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+
+ if (!i->ptrs[ptr].csum_good)
+ bch_err_dev_offset(ca, i->ptrs[ptr].sector,
+ "invalid journal checksum, seq %llu%s",
+ le64_to_cpu(i->j.seq),
+ i->csum_good ? " (had good copy on another device)" : "");
+ }
+
+ ret = jset_validate(c,
+ bch_dev_bkey_exists(c, i->ptrs[0].dev),
+ &i->j,
+ i->ptrs[0].sector,
+ READ);
+ if (ret)
+ goto err;
+
+ for (ptr = 0; ptr < i->nr_ptrs; ptr++)
+ replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
+
+ bch2_replicas_entry_sort(&replicas.e);
+
+ printbuf_reset(&buf);
+ bch2_replicas_entry_to_text(&buf, &replicas.e);
+
+ if (!degraded &&
+ !bch2_replicas_marked(c, &replicas.e) &&
+ (le64_to_cpu(i->j.seq) == *last_seq ||
+ fsck_err(c, journal_entry_replicas_not_marked,
+ "superblock not marked as containing replicas for journal entry %llu\n %s",
+ le64_to_cpu(i->j.seq), buf.buf))) {
+ ret = bch2_mark_replicas(c, &replicas.e);
+ if (ret)
+ goto err;
+ }
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/* journal write: */
+
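+/*
+ * Add pointers to @w->key for the new journal entry, walking devices in
+ * stripe-sorted order and skipping devices that are unusable, already have
+ * a pointer, or don't have room in their current journal bucket; stops once
+ * @replicas_want replicas' worth of durability has been allocated.
+ */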
+static void __journal_write_alloc(struct journal *j,
+ struct journal_buf *w,
+ struct dev_alloc_list *devs_sorted,
+ unsigned sectors,
+ unsigned *replicas,
+ unsigned replicas_want)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_device *ja;
+ struct bch_dev *ca;
+ unsigned i;
+
+ if (*replicas >= replicas_want)
+ return;
+
+ for (i = 0; i < devs_sorted->nr; i++) {
+ ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
+ if (!ca)
+ continue;
+
+ ja = &ca->journal;
+
+ /*
+ * Check that we can use this device, and aren't already using
+ * it:
+ */
+ if (!ca->mi.durability ||
+ ca->mi.state != BCH_MEMBER_STATE_rw ||
+ !ja->nr ||
+ bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
+ sectors > ja->sectors_free)
+ continue;
+
+ bch2_dev_stripe_increment(ca, &j->wp.stripe);
+
+ bch2_bkey_append_ptr(&w->key,
+ (struct bch_extent_ptr) {
+ .offset = bucket_to_sector(ca,
+ ja->buckets[ja->cur_idx]) +
+ ca->mi.bucket_size -
+ ja->sectors_free,
+ .dev = ca->dev_idx,
+ });
+
+ ja->sectors_free -= sectors;
+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
+
+ *replicas += ca->mi.durability;
+
+ if (*replicas >= replicas_want)
+ break;
+ }
+}
+
+/**
+ * journal_write_alloc - decide where to write next journal entry
+ *
+ * @j: journal object
+ * @w: journal buf (entry to be written)
+ *
+ * Returns: 0 on success, or -EROFS on failure
+ */
+static int journal_write_alloc(struct journal *j, struct journal_buf *w)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_devs_mask devs;
+ struct journal_device *ja;
+ struct bch_dev *ca;
+ struct dev_alloc_list devs_sorted;
+ unsigned sectors = vstruct_sectors(w->data, c->block_bits);
+ unsigned target = c->opts.metadata_target ?:
+ c->opts.foreground_target;
+ unsigned i, replicas = 0, replicas_want =
+ READ_ONCE(c->opts.metadata_replicas);
+
+ rcu_read_lock();
+retry:
+ devs = target_rw_devs(c, BCH_DATA_journal, target);
+
+ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
+
+ __journal_write_alloc(j, w, &devs_sorted,
+ sectors, &replicas, replicas_want);
+
+ if (replicas >= replicas_want)
+ goto done;
+
+ for (i = 0; i < devs_sorted.nr; i++) {
+ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
+ if (!ca)
+ continue;
+
+ ja = &ca->journal;
+
+ if (sectors > ja->sectors_free &&
+ sectors <= ca->mi.bucket_size &&
+ bch2_journal_dev_buckets_available(j, ja,
+ journal_space_discarded)) {
+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+ ja->sectors_free = ca->mi.bucket_size;
+
+ /*
+ * ja->bucket_seq[ja->cur_idx] must always have
+ * something sensible:
+ */
+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
+ }
+ }
+
+ __journal_write_alloc(j, w, &devs_sorted,
+ sectors, &replicas, replicas_want);
+
+ if (replicas < replicas_want && target) {
+ /* Retry from all devices: */
+ target = 0;
+ goto retry;
+ }
+done:
+ rcu_read_unlock();
+
+ BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
+
+ return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
+}
+
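+/*
+ * Grow the journal buffer to buf_size_want, if a larger size has been
+ * requested: the allocation and copy are done outside j->lock, with only
+ * the pointer swap done under the lock.
+ */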
+static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
+{
+ /* we aren't holding j->lock: */
+ unsigned new_size = READ_ONCE(j->buf_size_want);
+ void *new_buf;
+
+ if (buf->buf_size >= new_size)
+ return;
+
+ new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
+ if (!new_buf)
+ return;
+
+ memcpy(new_buf, buf->data, buf->buf_size);
+
+ spin_lock(&j->lock);
+ swap(buf->data, new_buf);
+ swap(buf->buf_size, new_size);
+ spin_unlock(&j->lock);
+
+ kvpfree(new_buf, new_size);
+}
+
+static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
+{
+ return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
+}
+
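+/*
+ * Completion path for a journal write: update the on-disk sequence numbers
+ * and replicas for the entry that was just written, advance unwritten_idx,
+ * wake up waiters, and either submit the next unwritten entry or arm the
+ * timer for the currently open one.
+ */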
+static CLOSURE_CALLBACK(journal_write_done)
+{
+ closure_type(j, struct journal, io);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_buf *w = journal_last_unwritten_buf(j);
+ struct bch_replicas_padded replicas;
+ union journal_res_state old, new;
+ u64 v, seq;
+ int err = 0;
+
+ bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
+ ? j->flush_write_time
+ : j->noflush_write_time, j->write_start_time);
+
+ if (!w->devs_written.nr) {
+ bch_err(c, "unable to write journal to sufficient devices");
+ err = -EIO;
+ } else {
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
+ w->devs_written);
+ if (bch2_mark_replicas(c, &replicas.e))
+ err = -EIO;
+ }
+
+ if (err)
+ bch2_fatal_error(c);
+
+ spin_lock(&j->lock);
+ seq = le64_to_cpu(w->data->seq);
+
+ if (seq >= j->pin.front)
+ journal_seq_pin(j, seq)->devs = w->devs_written;
+
+ if (!err) {
+ if (!JSET_NO_FLUSH(w->data)) {
+ j->flushed_seq_ondisk = seq;
+ j->last_seq_ondisk = w->last_seq;
+
+ bch2_do_discards(c);
+ closure_wake_up(&c->freelist_wait);
+
+ bch2_reset_alloc_cursors(c);
+ }
+ } else if (!j->err_seq || seq < j->err_seq)
+ j->err_seq = seq;
+
+ j->seq_ondisk = seq;
+
+ /*
+ * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
+ * more buckets:
+ *
+ * Must come before signaling write completion, for
+ * bch2_fs_journal_stop():
+ */
+ if (j->watermark != BCH_WATERMARK_stripe)
+ journal_reclaim_kick(&c->journal);
+
+ /* also must come before signalling write completion: */
+ closure_debug_destroy(cl);
+
+ v = atomic64_read(&j->reservations.counter);
+ do {
+ old.v = new.v = v;
+ BUG_ON(journal_state_count(new, new.unwritten_idx));
+
+ new.unwritten_idx++;
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
+
+ bch2_journal_reclaim_fast(j);
+ bch2_journal_space_available(j);
+
+ closure_wake_up(&w->wait);
+ journal_wake(j);
+
+ if (!journal_state_count(new, new.unwritten_idx) &&
+ journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
+ spin_unlock(&j->lock);
+ closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+ } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
+ new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
+ struct journal_buf *buf = journal_cur_buf(j);
+ long delta = buf->expires - jiffies;
+
+ /*
+ * We don't close a journal entry to write it while there are
+ * previous entries still in flight - the current journal entry
+ * might want to be written now:
+ */
+
+ spin_unlock(&j->lock);
+ mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
+ } else {
+ spin_unlock(&j->lock);
+ }
+}
+
+static void journal_write_endio(struct bio *bio)
+{
+ struct bch_dev *ca = bio->bi_private;
+ struct journal *j = &ca->fs->journal;
+ struct journal_buf *w = journal_last_unwritten_buf(j);
+ unsigned long flags;
+
+ if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ "error writing journal entry %llu: %s",
+ le64_to_cpu(w->data->seq),
+ bch2_blk_status_to_str(bio->bi_status)) ||
+ bch2_meta_write_fault("journal")) {
+ spin_lock_irqsave(&j->err_lock, flags);
+ bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
+ spin_unlock_irqrestore(&j->err_lock, flags);
+ }
+
+ closure_put(&j->io);
+ percpu_ref_put(&ca->io_ref);
+}
+
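+/*
+ * Submit the journal write to each device we allocated a pointer on:
+ * flush writes get REQ_FUA, and REQ_PREFLUSH as well unless a separate
+ * flush has already been issued to every device.
+ */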
+static CLOSURE_CALLBACK(do_journal_write)
+{
+ closure_type(j, struct journal, io);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ struct journal_buf *w = journal_last_unwritten_buf(j);
+ struct bch_extent_ptr *ptr;
+ struct bio *bio;
+ unsigned sectors = vstruct_sectors(w->data, c->block_bits);
+
+ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
+ ca = bch_dev_bkey_exists(c, ptr->dev);
+ if (!percpu_ref_tryget(&ca->io_ref)) {
+ /* XXX: fix this */
+ bch_err(c, "missing device for journal write\n");
+ continue;
+ }
+
+ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
+ sectors);
+
+ bio = ca->journal.bio;
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
+ bio->bi_iter.bi_sector = ptr->offset;
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+
+ BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
+ ca->prev_journal_sector = bio->bi_iter.bi_sector;
+
+ if (!JSET_NO_FLUSH(w->data))
+ bio->bi_opf |= REQ_FUA;
+ if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
+ bio->bi_opf |= REQ_PREFLUSH;
+
+ bch2_bio_map(bio, w->data, sectors << 9);
+
+ trace_and_count(c, journal_write, bio);
+ closure_bio_submit(bio, cl);
+
+ ca->journal.bucket_seq[ca->journal.cur_idx] =
+ le64_to_cpu(w->data->seq);
+ }
+
+ continue_at(cl, journal_write_done, c->io_complete_wq);
+}
+
+static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct jset_entry *start, *end, *i, *next, *prev = NULL;
+ struct jset *jset = w->data;
+ unsigned sectors, bytes, u64s;
+ bool validate_before_checksum = false;
+ unsigned long btree_roots_have = 0;
+ int ret;
+
+ /*
+ * Simple compaction, dropping empty jset_entries (from journal
+ * reservations that weren't fully used) and merging jset_entries that
+ * can be.
+ *
+ * If we wanted to be really fancy here, we could sort all the keys in
+ * the jset and drop keys that were overwritten - probably not worth it:
+ */
+ vstruct_for_each_safe(jset, i, next) {
+ unsigned u64s = le16_to_cpu(i->u64s);
+
+ /* Empty entry: */
+ if (!u64s)
+ continue;
+
+ /*
+ * New btree roots are set by journalling them; when the journal
+ * entry gets written we have to propagate them to
+ * c->btree_roots
+ *
+ * But, every journal entry we write has to contain all the
+ * btree roots (at least for now); so after we copy btree roots
+ * to c->btree_roots we have to get any missing btree roots and
+ * add them to this journal entry:
+ */
+ if (i->type == BCH_JSET_ENTRY_btree_root) {
+ bch2_journal_entry_to_btree_root(c, i);
+ __set_bit(i->btree_id, &btree_roots_have);
+ }
+
+ /* Can we merge with previous entry? */
+ if (prev &&
+ i->btree_id == prev->btree_id &&
+ i->level == prev->level &&
+ i->type == prev->type &&
+ i->type == BCH_JSET_ENTRY_btree_keys &&
+ le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
+ memmove_u64s_down(vstruct_next(prev),
+ i->_data,
+ u64s);
+ le16_add_cpu(&prev->u64s, u64s);
+ continue;
+ }
+
+ /* Couldn't merge, move i into new position (after prev): */
+ prev = prev ? vstruct_next(prev) : jset->start;
+ if (i != prev)
+ memmove_u64s_down(prev, i, jset_u64s(u64s));
+ }
+
+ prev = prev ? vstruct_next(prev) : jset->start;
+ jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
+
+ start = end = vstruct_last(jset);
+
+ end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
+
+ bch2_journal_super_entries_add_common(c, &end,
+ le64_to_cpu(jset->seq));
+ u64s = (u64 *) end - (u64 *) start;
+ BUG_ON(u64s > j->entry_u64s_reserved);
+
+ le32_add_cpu(&jset->u64s, u64s);
+
+ sectors = vstruct_sectors(jset, c->block_bits);
+ bytes = vstruct_bytes(jset);
+
+ if (sectors > w->sectors) {
+ bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
+ vstruct_bytes(jset), w->sectors << 9,
+ u64s, w->u64s_reserved, j->entry_u64s_reserved);
+ return -EINVAL;
+ }
+
+ jset->magic = cpu_to_le64(jset_magic(c));
+ jset->version = cpu_to_le32(c->sb.version);
+
+ SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
+ SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
+
+ if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
+ j->last_empty_seq = le64_to_cpu(jset->seq);
+
+ if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
+ validate_before_checksum = true;
+
+ if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
+ validate_before_checksum = true;
+
+ if (validate_before_checksum &&
+ (ret = jset_validate(c, NULL, jset, 0, WRITE)))
+ return ret;
+
+ ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+ jset->encrypted_start,
+ vstruct_end(jset) - (void *) jset->encrypted_start);
+ if (bch2_fs_fatal_err_on(ret, c,
+ "error encrypting journal entry: %i", ret))
+ return ret;
+
+ jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
+ journal_nonce(jset), jset);
+
+ if (!validate_before_checksum &&
+ (ret = jset_validate(c, NULL, jset, 0, WRITE)))
+ return ret;
+
+ memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
+ return 0;
+}
+
+static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ int error = bch2_journal_error(j);
+
+ /*
+ * If the journal is in an error state - we did an emergency shutdown -
+ * we prefer to continue doing journal writes. We just mark them as
+ * noflush so they'll never be used, but they'll still be visible to the
+ * list_journal tool - this helps in debugging.
+ *
+ * There's a caveat: the first journal write after marking the
+ * superblock dirty must always be a flush write, because on startup
+ * from a clean shutdown we didn't necessarily read the journal and the
+ * new journal write might overwrite whatever was in the journal
+ * previously - we can't leave the journal without any flush writes in
+ * it.
+ *
+ * So if we're in an error state, and we're still starting up, we don't
+ * write anything at all.
+ */
+ if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags))
+ return -EIO;
+
+ if (error ||
+ w->noflush ||
+ (!w->must_flush &&
+ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
+ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
+ w->noflush = true;
+ SET_JSET_NO_FLUSH(w->data, true);
+ w->data->last_seq = 0;
+ w->last_seq = 0;
+
+ j->nr_noflush_writes++;
+ } else {
+ j->last_flush_write = jiffies;
+ j->nr_flush_writes++;
+ clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
+ }
+
+ return 0;
+}
+
+CLOSURE_CALLBACK(bch2_journal_write)
+{
+ closure_type(j, struct journal, io);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ struct journal_buf *w = journal_last_unwritten_buf(j);
+ struct bch_replicas_padded replicas;
+ struct bio *bio;
+ struct printbuf journal_debug_buf = PRINTBUF;
+ unsigned i, nr_rw_members = 0;
+ int ret;
+
+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+
+ j->write_start_time = local_clock();
+
+ spin_lock(&j->lock);
+ ret = bch2_journal_write_pick_flush(j, w);
+ spin_unlock(&j->lock);
+ if (ret)
+ goto err;
+
+ journal_buf_realloc(j, w);
+
+ ret = bch2_journal_write_prep(j, w);
+ if (ret)
+ goto err;
+
+ while (1) {
+ spin_lock(&j->lock);
+ ret = journal_write_alloc(j, w);
+ if (!ret || !j->can_discard)
+ break;
+
+ spin_unlock(&j->lock);
+ bch2_journal_do_discards(j);
+ }
+
+ if (ret) {
+ __bch2_journal_debug_to_text(&journal_debug_buf, j);
+ spin_unlock(&j->lock);
+ bch_err(c, "Unable to allocate journal write:\n%s",
+ journal_debug_buf.buf);
+ printbuf_exit(&journal_debug_buf);
+ goto err;
+ }
+
+ /*
+ * write is allocated, no longer need to account for it in
+ * bch2_journal_space_available():
+ */
+ w->sectors = 0;
+
+ /*
+ * journal entry has been compacted and allocated, recalculate space
+ * available:
+ */
+ bch2_journal_space_available(j);
+ spin_unlock(&j->lock);
+
+ w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
+
+ if (c->opts.nochanges)
+ goto no_io;
+
+ for_each_rw_member(ca, c, i)
+ nr_rw_members++;
+
+ if (nr_rw_members > 1)
+ w->separate_flush = true;
+
+ /*
+ * Mark journal replicas before we submit the write to guarantee
+ * recovery will find the journal entries after a crash.
+ */
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
+ w->devs_written);
+ ret = bch2_mark_replicas(c, &replicas.e);
+ if (ret)
+ goto err;
+
+ if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
+ for_each_rw_member(ca, c, i) {
+ percpu_ref_get(&ca->io_ref);
+
+ bio = ca->journal.bio;
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ closure_bio_submit(bio, cl);
+ }
+ }
+
+ continue_at(cl, do_journal_write, c->io_complete_wq);
+ return;
+no_io:
+ continue_at(cl, journal_write_done, c->io_complete_wq);
+ return;
+err:
+ bch2_fatal_error(c);
+ continue_at(cl, journal_write_done, c->io_complete_wq);
+}
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
new file mode 100644
index 000000000000..c035e7c108e1
--- /dev/null
+++ b/fs/bcachefs/journal_io.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_IO_H
+#define _BCACHEFS_JOURNAL_IO_H
+
+/*
+ * Only used for holding the journal entries we read in bch2_journal_read()
+ * when bringing up the filesystem:
+ */
+struct journal_replay {
+ struct journal_ptr {
+ bool csum_good;
+ u8 dev;
+ u32 bucket;
+ u32 bucket_offset;
+ u64 sector;
+ } ptrs[BCH_REPLICAS_MAX];
+ unsigned nr_ptrs;
+
+ bool csum_good;
+ bool ignore;
+ /* must be last: */
+ struct jset j;
+};
+
+static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
+ struct jset_entry *entry, unsigned type)
+{
+ while (entry < vstruct_last(jset)) {
+ if (entry->type == type)
+ return entry;
+
+ entry = vstruct_next(entry);
+ }
+
+ return NULL;
+}
+
+#define for_each_jset_entry_type(entry, jset, type) \
+ for (entry = (jset)->start; \
+ (entry = __jset_entry_type_next(jset, entry, type)); \
+ entry = vstruct_next(entry))
+
+#define jset_entry_for_each_key(_e, _k) \
+ for (_k = (_e)->start; \
+ _k < vstruct_last(_e); \
+ _k = bkey_next(_k))
+
+#define for_each_jset_key(k, entry, jset) \
+ for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)\
+ jset_entry_for_each_key(entry, k)
+
+int bch2_journal_entry_validate(struct bch_fs *, struct jset *,
+ struct jset_entry *, unsigned, int,
+ enum bkey_invalid_flags);
+void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
+ struct jset_entry *);
+
+void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
+ struct journal_replay *);
+
+int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
+
+CLOSURE_CALLBACK(bch2_journal_write);
+
+#endif /* _BCACHEFS_JOURNAL_IO_H */
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
new file mode 100644
index 000000000000..ec712104addb
--- /dev/null
+++ b/fs/bcachefs/journal_reclaim.c
@@ -0,0 +1,867 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "errcode.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "replicas.h"
+#include "sb-members.h"
+#include "trace.h"
+
+#include <linux/kthread.h>
+#include <linux/sched/mm.h>
+
+/* Free space calculations: */
+
+static unsigned journal_space_from(struct journal_device *ja,
+ enum journal_space_from from)
+{
+ switch (from) {
+ case journal_space_discarded:
+ return ja->discard_idx;
+ case journal_space_clean_ondisk:
+ return ja->dirty_idx_ondisk;
+ case journal_space_clean:
+ return ja->dirty_idx;
+ default:
+ BUG();
+ }
+}
+
+unsigned bch2_journal_dev_buckets_available(struct journal *j,
+ struct journal_device *ja,
+ enum journal_space_from from)
+{
+ unsigned available = (journal_space_from(ja, from) -
+ ja->cur_idx - 1 + ja->nr) % ja->nr;
+
+ /*
+ * Don't use the last bucket unless writing the new last_seq
+ * will make another bucket available:
+ */
+ if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
+ --available;
+
+ return available;
+}
+
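+/*
+ * Raise the journal watermark to BCH_WATERMARK_reclaim when we're low on
+ * space or on journal pin fifo entries, so that only high priority
+ * reservations succeed; wake up waiters when the watermark drops back down.
+ */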
+static inline void journal_set_watermark(struct journal *j, bool low_on_space)
+{
+ unsigned watermark = BCH_WATERMARK_stripe;
+
+ if (low_on_space)
+ watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
+ if (fifo_free(&j->pin) < j->pin.size / 4)
+ watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
+
+ if (watermark == j->watermark)
+ return;
+
+ swap(watermark, j->watermark);
+ if (watermark > j->watermark)
+ journal_wake(j);
+}
+
+static struct journal_space
+journal_dev_space_available(struct journal *j, struct bch_dev *ca,
+ enum journal_space_from from)
+{
+ struct journal_device *ja = &ca->journal;
+ unsigned sectors, buckets, unwritten;
+ u64 seq;
+
+ if (from == journal_space_total)
+ return (struct journal_space) {
+ .next_entry = ca->mi.bucket_size,
+ .total = ca->mi.bucket_size * ja->nr,
+ };
+
+ buckets = bch2_journal_dev_buckets_available(j, ja, from);
+ sectors = ja->sectors_free;
+
+ /*
+ * Note that we don't allocate the space for a journal entry
+ * until we write it out - thus, account for it here:
+ */
+ for (seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++) {
+ unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors;
+
+ if (!unwritten)
+ continue;
+
+ /* entry won't fit on this device, skip: */
+ if (unwritten > ca->mi.bucket_size)
+ continue;
+
+ if (unwritten >= sectors) {
+ if (!buckets) {
+ sectors = 0;
+ break;
+ }
+
+ buckets--;
+ sectors = ca->mi.bucket_size;
+ }
+
+ sectors -= unwritten;
+ }
+
+ if (sectors < ca->mi.bucket_size && buckets) {
+ buckets--;
+ sectors = ca->mi.bucket_size;
+ }
+
+ return (struct journal_space) {
+ .next_entry = sectors,
+ .total = sectors + buckets * ca->mi.bucket_size,
+ };
+}
+
+static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want,
+ enum journal_space_from from)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ unsigned i, pos, nr_devs = 0;
+ struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
+
+ BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i,
+ &c->rw_devs[BCH_DATA_journal]) {
+ if (!ca->journal.nr)
+ continue;
+
+ space = journal_dev_space_available(j, ca, from);
+ if (!space.next_entry)
+ continue;
+
+ for (pos = 0; pos < nr_devs; pos++)
+ if (space.total > dev_space[pos].total)
+ break;
+
+ array_insert_item(dev_space, nr_devs, pos, space);
+ }
+ rcu_read_unlock();
+
+ if (nr_devs < nr_devs_want)
+ return (struct journal_space) { 0, 0 };
+
+ /*
+ * We sorted largest to smallest, and we want the smallest out of the
+ * @nr_devs_want largest devices:
+ */
+ return dev_space[nr_devs_want - 1];
+}
+
+void bch2_journal_space_available(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ unsigned clean, clean_ondisk, total;
+ unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
+ j->buf[1].buf_size >> 9);
+ unsigned i, nr_online = 0, nr_devs_want;
+ bool can_discard = false;
+ int ret = 0;
+
+ lockdep_assert_held(&j->lock);
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i,
+ &c->rw_devs[BCH_DATA_journal]) {
+ struct journal_device *ja = &ca->journal;
+
+ if (!ja->nr)
+ continue;
+
+ while (ja->dirty_idx != ja->cur_idx &&
+ ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
+ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
+
+ while (ja->dirty_idx_ondisk != ja->dirty_idx &&
+ ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
+ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
+
+ if (ja->discard_idx != ja->dirty_idx_ondisk)
+ can_discard = true;
+
+ max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
+ nr_online++;
+ }
+ rcu_read_unlock();
+
+ j->can_discard = can_discard;
+
+ if (nr_online < c->opts.metadata_replicas_required) {
+ ret = JOURNAL_ERR_insufficient_devices;
+ goto out;
+ }
+
+ nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
+
+ for (i = 0; i < journal_space_nr; i++)
+ j->space[i] = __journal_space_available(j, nr_devs_want, i);
+
+ clean_ondisk = j->space[journal_space_clean_ondisk].total;
+ clean = j->space[journal_space_clean].total;
+ total = j->space[journal_space_total].total;
+
+ if (!j->space[journal_space_discarded].next_entry)
+ ret = JOURNAL_ERR_journal_full;
+
+ if ((j->space[journal_space_clean_ondisk].next_entry <
+ j->space[journal_space_clean_ondisk].total) &&
+ (clean - clean_ondisk <= total / 8) &&
+ (clean_ondisk * 2 > clean))
+ set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+ else
+ clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+
+ journal_set_watermark(j, clean * 4 <= total);
+out:
+ j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
+ j->cur_entry_error = ret;
+
+ if (!ret)
+ journal_wake(j);
+}
+
+/* Discards - last part of journal reclaim: */
+
+static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+{
+ bool ret;
+
+ spin_lock(&j->lock);
+ ret = ja->discard_idx != ja->dirty_idx_ondisk;
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+/*
+ * Advance ja->discard_idx as long as it points to buckets that are no longer
+ * dirty, issuing discards if necessary:
+ */
+void bch2_journal_do_discards(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ unsigned iter;
+
+ mutex_lock(&j->discard_lock);
+
+ for_each_rw_member(ca, c, iter) {
+ struct journal_device *ja = &ca->journal;
+
+ while (should_discard_bucket(j, ja)) {
+ if (!c->opts.nochanges &&
+ ca->mi.discard &&
+ bdev_max_discard_sectors(ca->disk_sb.bdev))
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ bucket_to_sector(ca,
+ ja->buckets[ja->discard_idx]),
+ ca->mi.bucket_size, GFP_NOFS);
+
+ spin_lock(&j->lock);
+ ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
+
+ bch2_journal_space_available(j);
+ spin_unlock(&j->lock);
+ }
+ }
+
+ mutex_unlock(&j->discard_lock);
+}
+
+/*
+ * Journal entry pinning - machinery for holding a reference on a given journal
+ * entry, holding it open to ensure it gets replayed during recovery:
+ */
+
+void bch2_journal_reclaim_fast(struct journal *j)
+{
+ bool popped = false;
+
+ lockdep_assert_held(&j->lock);
+
+ /*
+ * Unpin journal entries whose reference counts reached zero, meaning
+ * all btree nodes got written out
+ */
+ while (!fifo_empty(&j->pin) &&
+ !atomic_read(&fifo_peek_front(&j->pin).count)) {
+ j->pin.front++;
+ popped = true;
+ }
+
+ if (popped)
+ bch2_journal_space_available(j);
+}
+
+bool __bch2_journal_pin_put(struct journal *j, u64 seq)
+{
+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+ return atomic_dec_and_test(&pin_list->count);
+}
+
+void bch2_journal_pin_put(struct journal *j, u64 seq)
+{
+ if (__bch2_journal_pin_put(j, seq)) {
+ spin_lock(&j->lock);
+ bch2_journal_reclaim_fast(j);
+ spin_unlock(&j->lock);
+ }
+}
+
+static inline bool __journal_pin_drop(struct journal *j,
+ struct journal_entry_pin *pin)
+{
+ struct journal_entry_pin_list *pin_list;
+
+ if (!journal_pin_active(pin))
+ return false;
+
+ if (j->flush_in_progress == pin)
+ j->flush_in_progress_dropped = true;
+
+ pin_list = journal_seq_pin(j, pin->seq);
+ pin->seq = 0;
+ list_del_init(&pin->list);
+
+ /*
+ * Unpinning a journal entry may make journal_next_bucket() succeed, if
+ * writing a new last_seq will now make another bucket available:
+ */
+ return atomic_dec_and_test(&pin_list->count) &&
+ pin_list == &fifo_peek_front(&j->pin);
+}
+
+void bch2_journal_pin_drop(struct journal *j,
+ struct journal_entry_pin *pin)
+{
+ spin_lock(&j->lock);
+ if (__journal_pin_drop(j, pin))
+ bch2_journal_reclaim_fast(j);
+ spin_unlock(&j->lock);
+}
+
+static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
+{
+ if (fn == bch2_btree_node_flush0 ||
+ fn == bch2_btree_node_flush1)
+ return JOURNAL_PIN_btree;
+ else if (fn == bch2_btree_key_cache_journal_flush)
+ return JOURNAL_PIN_key_cache;
+ else
+ return JOURNAL_PIN_other;
+}
+
+void bch2_journal_pin_set(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ struct journal_entry_pin_list *pin_list;
+ bool reclaim;
+
+ spin_lock(&j->lock);
+
+ if (seq < journal_last_seq(j)) {
+ /*
+ * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
+ * the src pin - with the pin dropped, the entry to pin might no
+ * longer exist, but that means there's no longer anything to
+ * copy and we can bail out here:
+ */
+ spin_unlock(&j->lock);
+ return;
+ }
+
+ pin_list = journal_seq_pin(j, seq);
+
+ reclaim = __journal_pin_drop(j, pin);
+
+ atomic_inc(&pin_list->count);
+ pin->seq = seq;
+ pin->flush = flush_fn;
+
+ if (flush_fn)
+ list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]);
+ else
+ list_add(&pin->list, &pin_list->flushed);
+
+ if (reclaim)
+ bch2_journal_reclaim_fast(j);
+ spin_unlock(&j->lock);
+
+ /*
+ * If the journal is currently full, we might want to call flush_fn
+ * immediately:
+ */
+ journal_wake(j);
+}
+
+/**
+ * bch2_journal_pin_flush: ensure journal pin callback is no longer running
+ * @j: journal object
+ * @pin: pin to flush
+ */
+void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
+{
+ BUG_ON(journal_pin_active(pin));
+
+ wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
+}
+
+/*
+ * Journal reclaim: flush references to open journal entries to reclaim space in
+ * the journal
+ *
+ * May be done by the journal code in the background as needed to free up space
+ * for more journal entries, or as part of doing a clean shutdown, or to migrate
+ * data off of a specific device:
+ */
+
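+/*
+ * @allowed_below_seq/@allowed_above_seq are bitmasks of journal pin types
+ * (1U << JOURNAL_PIN_*): which pin types may be flushed for entries at or
+ * below @seq_to_flush, and which may be flushed even above it.
+ */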
+static struct journal_entry_pin *
+journal_get_next_pin(struct journal *j,
+ u64 seq_to_flush,
+ unsigned allowed_below_seq,
+ unsigned allowed_above_seq,
+ u64 *seq)
+{
+ struct journal_entry_pin_list *pin_list;
+ struct journal_entry_pin *ret = NULL;
+ unsigned i;
+
+ fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
+ if (*seq > seq_to_flush && !allowed_above_seq)
+ break;
+
+ for (i = 0; i < JOURNAL_PIN_NR; i++)
+ if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) ||
+ ((1U << i) & allowed_above_seq)) {
+ ret = list_first_entry_or_null(&pin_list->list[i],
+ struct journal_entry_pin, list);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return NULL;
+}
+
+/* returns true if we did work */
+static size_t journal_flush_pins(struct journal *j,
+ u64 seq_to_flush,
+ unsigned allowed_below_seq,
+ unsigned allowed_above_seq,
+ unsigned min_any,
+ unsigned min_key_cache)
+{
+ struct journal_entry_pin *pin;
+ size_t nr_flushed = 0;
+ journal_pin_flush_fn flush_fn;
+ u64 seq;
+ int err;
+
+ lockdep_assert_held(&j->reclaim_lock);
+
+ while (1) {
+ unsigned allowed_above = allowed_above_seq;
+ unsigned allowed_below = allowed_below_seq;
+
+ if (min_any) {
+ allowed_above |= ~0;
+ allowed_below |= ~0;
+ }
+
+ if (min_key_cache) {
+ allowed_above |= 1U << JOURNAL_PIN_key_cache;
+ allowed_below |= 1U << JOURNAL_PIN_key_cache;
+ }
+
+ cond_resched();
+
+ j->last_flushed = jiffies;
+
+ spin_lock(&j->lock);
+ pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq);
+ if (pin) {
+ BUG_ON(j->flush_in_progress);
+ j->flush_in_progress = pin;
+ j->flush_in_progress_dropped = false;
+ flush_fn = pin->flush;
+ }
+ spin_unlock(&j->lock);
+
+ if (!pin)
+ break;
+
+ if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush)
+ min_key_cache--;
+
+ if (min_any)
+ min_any--;
+
+ err = flush_fn(j, pin, seq);
+
+ spin_lock(&j->lock);
+ /* Pin might have been dropped or rearmed: */
+ if (likely(!err && !j->flush_in_progress_dropped))
+ list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
+ j->flush_in_progress = NULL;
+ j->flush_in_progress_dropped = false;
+ spin_unlock(&j->lock);
+
+ wake_up(&j->pin_flush_wait);
+
+ if (err)
+ break;
+
+ nr_flushed++;
+ }
+
+ return nr_flushed;
+}
+
+static u64 journal_seq_to_flush(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ u64 seq_to_flush = 0;
+ unsigned iter;
+
+ spin_lock(&j->lock);
+
+ for_each_rw_member(ca, c, iter) {
+ struct journal_device *ja = &ca->journal;
+ unsigned nr_buckets, bucket_to_flush;
+
+ if (!ja->nr)
+ continue;
+
+ /* Try to keep the journal at most half full: */
+ nr_buckets = ja->nr / 2;
+
+ nr_buckets = min(nr_buckets, ja->nr);
+
+ bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
+ seq_to_flush = max(seq_to_flush,
+ ja->bucket_seq[bucket_to_flush]);
+ }
+
+ /* Also flush if the pin fifo is more than half full */
+ seq_to_flush = max_t(s64, seq_to_flush,
+ (s64) journal_cur_seq(j) -
+ (j->pin.size >> 1));
+ spin_unlock(&j->lock);
+
+ return seq_to_flush;
+}
+
+/**
+ * __bch2_journal_reclaim - free up journal buckets
+ * @j: journal object
+ * @direct: direct or background reclaim?
+ * @kicked: requested to run since we last ran?
+ * Returns: 0 on success, or -EIO if the journal has been shutdown
+ *
+ * Background journal reclaim writes out btree nodes. It should be run
+ * early enough so that we never completely run out of journal buckets.
+ *
+ * High watermarks for triggering background reclaim:
+ * - FIFO has fewer than 512 entries left
+ * - fewer than 25% journal buckets free
+ *
+ * Background reclaim runs until low watermarks are reached:
+ * - FIFO has more than 1024 entries left
+ * - more than 50% journal buckets free
+ *
+ * As long as a reclaim can complete in the time it takes to fill up
+ * 512 journal entries or 25% of all journal buckets, then
+ * journal_next_bucket() should not stall.
+ */
+static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
+ u64 seq_to_flush;
+ size_t min_nr, min_key_cache, nr_flushed;
+ unsigned flags;
+ int ret = 0;
+
+ /*
+ * We can't invoke memory reclaim while holding the reclaim_lock -
+ * journal reclaim is required to make progress for memory reclaim
+ * (cleaning the caches), so we can't get stuck in memory reclaim while
+ * we're holding the reclaim lock:
+ */
+ lockdep_assert_held(&j->reclaim_lock);
+ flags = memalloc_noreclaim_save();
+
+ do {
+ if (kthread && kthread_should_stop())
+ break;
+
+ if (bch2_journal_error(j)) {
+ ret = -EIO;
+ break;
+ }
+
+ bch2_journal_do_discards(j);
+
+ seq_to_flush = journal_seq_to_flush(j);
+ min_nr = 0;
+
+ /*
+ * If it's been longer than c->opts.journal_reclaim_delay since we last
+ * flushed, make sure to flush at least one journal pin:
+ */
+ if (time_after(jiffies, j->last_flushed +
+ msecs_to_jiffies(c->opts.journal_reclaim_delay)))
+ min_nr = 1;
+
+ if (j->watermark != BCH_WATERMARK_stripe)
+ min_nr = 1;
+
+ if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
+ min_nr = 1;
+
+ min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
+
+ trace_and_count(c, journal_reclaim_start, c,
+ direct, kicked,
+ min_nr, min_key_cache,
+ atomic_read(&c->btree_cache.dirty),
+ c->btree_cache.used,
+ atomic_long_read(&c->btree_key_cache.nr_dirty),
+ atomic_long_read(&c->btree_key_cache.nr_keys));
+
+ nr_flushed = journal_flush_pins(j, seq_to_flush,
+ ~0, 0,
+ min_nr, min_key_cache);
+
+ if (direct)
+ j->nr_direct_reclaim += nr_flushed;
+ else
+ j->nr_background_reclaim += nr_flushed;
+ trace_and_count(c, journal_reclaim_finish, c, nr_flushed);
+
+ if (nr_flushed)
+ wake_up(&j->reclaim_wait);
+ } while ((min_nr || min_key_cache) && nr_flushed && !direct);
+
+ memalloc_noreclaim_restore(flags);
+
+ return ret;
+}
+
+int bch2_journal_reclaim(struct journal *j)
+{
+ return __bch2_journal_reclaim(j, true, true);
+}
+
+static int bch2_journal_reclaim_thread(void *arg)
+{
+ struct journal *j = arg;
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ unsigned long delay, now;
+ bool journal_empty;
+ int ret = 0;
+
+ set_freezable();
+
+ j->last_flushed = jiffies;
+
+ while (!ret && !kthread_should_stop()) {
+ bool kicked = j->reclaim_kicked;
+
+ j->reclaim_kicked = false;
+
+ mutex_lock(&j->reclaim_lock);
+ ret = __bch2_journal_reclaim(j, false, kicked);
+ mutex_unlock(&j->reclaim_lock);
+
+ now = jiffies;
+ delay = msecs_to_jiffies(c->opts.journal_reclaim_delay);
+ j->next_reclaim = j->last_flushed + delay;
+
+ if (!time_in_range(j->next_reclaim, now, now + delay))
+ j->next_reclaim = now + delay;
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ if (kthread_should_stop())
+ break;
+ if (j->reclaim_kicked)
+ break;
+
+ spin_lock(&j->lock);
+ journal_empty = fifo_empty(&j->pin);
+ spin_unlock(&j->lock);
+
+ if (journal_empty)
+ schedule();
+ else if (time_after(j->next_reclaim, jiffies))
+ schedule_timeout(j->next_reclaim - jiffies);
+ else
+ break;
+ }
+ __set_current_state(TASK_RUNNING);
+ }
+
+ return 0;
+}
+
+void bch2_journal_reclaim_stop(struct journal *j)
+{
+ struct task_struct *p = j->reclaim_thread;
+
+ j->reclaim_thread = NULL;
+
+ if (p) {
+ kthread_stop(p);
+ put_task_struct(p);
+ }
+}
+
+int bch2_journal_reclaim_start(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct task_struct *p;
+ int ret;
+
+ if (j->reclaim_thread)
+ return 0;
+
+ p = kthread_create(bch2_journal_reclaim_thread, j,
+ "bch-reclaim/%s", c->name);
+ ret = PTR_ERR_OR_ZERO(p);
+ if (ret) {
+ bch_err_msg(c, ret, "creating journal reclaim thread");
+ return ret;
+ }
+
+ get_task_struct(p);
+ j->reclaim_thread = p;
+ wake_up_process(p);
+ return 0;
+}
+
+static int journal_flush_done(struct journal *j, u64 seq_to_flush,
+ bool *did_work)
+{
+ int ret;
+
+ ret = bch2_journal_error(j);
+ if (ret)
+ return ret;
+
+ mutex_lock(&j->reclaim_lock);
+
+ if (journal_flush_pins(j, seq_to_flush,
+ (1U << JOURNAL_PIN_key_cache)|
+ (1U << JOURNAL_PIN_other), 0, 0, 0) ||
+ journal_flush_pins(j, seq_to_flush,
+ (1U << JOURNAL_PIN_btree), 0, 0, 0))
+ *did_work = true;
+
+ if (seq_to_flush > journal_cur_seq(j))
+ bch2_journal_entry_close(j);
+
+ spin_lock(&j->lock);
+ /*
+ * If journal replay hasn't completed, the unreplayed journal entries
+ * hold refs on their corresponding sequence numbers
+ */
+ ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
+ journal_last_seq(j) > seq_to_flush ||
+ !fifo_used(&j->pin);
+
+ spin_unlock(&j->lock);
+ mutex_unlock(&j->reclaim_lock);
+
+ return ret;
+}
+
+bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
+{
+ bool did_work = false;
+
+ if (!test_bit(JOURNAL_STARTED, &j->flags))
+ return false;
+
+ closure_wait_event(&j->async_wait,
+ journal_flush_done(j, seq_to_flush, &did_work));
+
+ return did_work;
+}
+
+int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_entry_pin_list *p;
+ u64 iter, seq = 0;
+ int ret = 0;
+
+ spin_lock(&j->lock);
+ fifo_for_each_entry_ptr(p, &j->pin, iter)
+ if (dev_idx >= 0
+ ? bch2_dev_list_has_dev(p->devs, dev_idx)
+ : p->devs.nr < c->opts.metadata_replicas)
+ seq = iter;
+ spin_unlock(&j->lock);
+
+ bch2_journal_flush_pins(j, seq);
+
+ ret = bch2_journal_error(j);
+ if (ret)
+ return ret;
+
+ mutex_lock(&c->replicas_gc_lock);
+ bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);
+
+ /*
+ * Now that we've populated replicas_gc, write to the journal to mark
+ * active journal devices. This handles the case where the journal might
+ * be empty. Otherwise we could clear all journal replicas and
+ * temporarily put the fs into an unrecoverable state. Journal recovery
+ * expects to find devices marked for journal data on unclean mount.
+ */
+ ret = bch2_journal_meta(&c->journal);
+ if (ret)
+ goto err;
+
+ seq = 0;
+ spin_lock(&j->lock);
+ while (!ret) {
+ struct bch_replicas_padded replicas;
+
+ seq = max(seq, journal_last_seq(j));
+ if (seq >= j->pin.back)
+ break;
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
+ journal_seq_pin(j, seq)->devs);
+ seq++;
+
+ spin_unlock(&j->lock);
+ ret = bch2_mark_replicas(c, &replicas.e);
+ spin_lock(&j->lock);
+ }
+ spin_unlock(&j->lock);
+err:
+ ret = bch2_replicas_gc_end(c, ret);
+ mutex_unlock(&c->replicas_gc_lock);
+
+ return ret;
+}
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
new file mode 100644
index 000000000000..494d1a6eddb0
--- /dev/null
+++ b/fs/bcachefs/journal_reclaim.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_RECLAIM_H
+#define _BCACHEFS_JOURNAL_RECLAIM_H
+
+#define JOURNAL_PIN (32 * 1024)
+
+static inline void journal_reclaim_kick(struct journal *j)
+{
+ struct task_struct *p = READ_ONCE(j->reclaim_thread);
+
+ j->reclaim_kicked = true;
+ if (p)
+ wake_up_process(p);
+}
+
+unsigned bch2_journal_dev_buckets_available(struct journal *,
+ struct journal_device *,
+ enum journal_space_from);
+void bch2_journal_space_available(struct journal *);
+
+static inline bool journal_pin_active(struct journal_entry_pin *pin)
+{
+ return pin->seq != 0;
+}
+
+static inline struct journal_entry_pin_list *
+journal_seq_pin(struct journal *j, u64 seq)
+{
+ EBUG_ON(seq < j->pin.front || seq >= j->pin.back);
+
+ return &j->pin.data[seq & j->pin.mask];
+}
+
+void bch2_journal_reclaim_fast(struct journal *);
+bool __bch2_journal_pin_put(struct journal *, u64);
+void bch2_journal_pin_put(struct journal *, u64);
+void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
+
+void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *,
+ journal_pin_flush_fn);
+
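+/*
+ * Note: bch2_journal_pin_add() only ever moves a pin to an older (smaller)
+ * seq, and bch2_journal_pin_update() only ever moves it to a newer one;
+ * otherwise both are no-ops.
+ */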
+static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ if (unlikely(!journal_pin_active(pin) || pin->seq > seq))
+ bch2_journal_pin_set(j, seq, pin, flush_fn);
+}
+
+static inline void bch2_journal_pin_copy(struct journal *j,
+ struct journal_entry_pin *dst,
+ struct journal_entry_pin *src,
+ journal_pin_flush_fn flush_fn)
+{
+ /* Guard against racing with journal_pin_drop(src): */
+ u64 seq = READ_ONCE(src->seq);
+
+ if (seq)
+ bch2_journal_pin_add(j, seq, dst, flush_fn);
+}
+
+static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ if (unlikely(!journal_pin_active(pin) || pin->seq < seq))
+ bch2_journal_pin_set(j, seq, pin, flush_fn);
+}
+
+void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
+
+void bch2_journal_do_discards(struct journal *);
+int bch2_journal_reclaim(struct journal *);
+
+void bch2_journal_reclaim_stop(struct journal *);
+int bch2_journal_reclaim_start(struct journal *);
+
+bool bch2_journal_flush_pins(struct journal *, u64);
+
+static inline bool bch2_journal_flush_all_pins(struct journal *j)
+{
+ return bch2_journal_flush_pins(j, U64_MAX);
+}
+
+int bch2_journal_flush_device_pins(struct journal *, int);
+
+#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c
new file mode 100644
index 000000000000..ae4fb8c3a2bc
--- /dev/null
+++ b/fs/bcachefs/journal_sb.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "journal_sb.h"
+#include "darray.h"
+
+#include <linux/sort.h>
+
+/* BCH_SB_FIELD_journal: */
+
+static int u64_cmp(const void *_l, const void *_r)
+{
+ const u64 *l = _l;
+ const u64 *r = _r;
+
+ return cmp_int(*l, *r);
+}
+
+static int bch2_sb_journal_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_journal *journal = field_to_type(f, journal);
+ struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
+ int ret = -BCH_ERR_invalid_sb_journal;
+ unsigned nr;
+ unsigned i;
+ u64 *b;
+
+ nr = bch2_nr_journal_buckets(journal);
+ if (!nr)
+ return 0;
+
+ b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
+ if (!b)
+ return -BCH_ERR_ENOMEM_sb_journal_validate;
+
+ for (i = 0; i < nr; i++)
+ b[i] = le64_to_cpu(journal->buckets[i]);
+
+ sort(b, nr, sizeof(u64), u64_cmp, NULL);
+
+ if (!b[0]) {
+ prt_printf(err, "journal bucket at sector 0");
+ goto err;
+ }
+
+ if (b[0] < le16_to_cpu(m.first_bucket)) {
+ prt_printf(err, "journal bucket %llu before first bucket %u",
+ b[0], le16_to_cpu(m.first_bucket));
+ goto err;
+ }
+
+ if (b[nr - 1] >= le64_to_cpu(m.nbuckets)) {
+ prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+ b[nr - 1], le64_to_cpu(m.nbuckets));
+ goto err;
+ }
+
+ for (i = 0; i + 1 < nr; i++)
+ if (b[i] == b[i + 1]) {
+ prt_printf(err, "duplicate journal buckets %llu", b[i]);
+ goto err;
+ }
+
+ ret = 0;
+err:
+ kfree(b);
+ return ret;
+}
+
+static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_journal *journal = field_to_type(f, journal);
+ unsigned i, nr = bch2_nr_journal_buckets(journal);
+
+ prt_printf(out, "Buckets: ");
+ for (i = 0; i < nr; i++)
+ prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i]));
+ prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal = {
+ .validate = bch2_sb_journal_validate,
+ .to_text = bch2_sb_journal_to_text,
+};
+
+struct u64_range {
+ u64 start;
+ u64 end;
+};
+
+static int u64_range_cmp(const void *_l, const void *_r)
+{
+ const struct u64_range *l = _l;
+ const struct u64_range *r = _r;
+
+ return cmp_int(l->start, r->start);
+}
+
+static int bch2_sb_journal_v2_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
+ struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
+ int ret = -BCH_ERR_invalid_sb_journal;
+ unsigned nr;
+ unsigned i;
+ struct u64_range *b;
+
+ nr = bch2_sb_field_journal_v2_nr_entries(journal);
+ if (!nr)
+ return 0;
+
+ b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL);
+ if (!b)
+ return -BCH_ERR_ENOMEM_sb_journal_v2_validate;
+
+ for (i = 0; i < nr; i++) {
+ b[i].start = le64_to_cpu(journal->d[i].start);
+ b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
+ }
+
+ sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
+
+ if (!b[0].start) {
+ prt_printf(err, "journal bucket at sector 0");
+ goto err;
+ }
+
+ if (b[0].start < le16_to_cpu(m.first_bucket)) {
+ prt_printf(err, "journal bucket %llu before first bucket %u",
+ b[0].start, le16_to_cpu(m.first_bucket));
+ goto err;
+ }
+
+ if (b[nr - 1].end > le64_to_cpu(m.nbuckets)) {
+ prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+ b[nr - 1].end - 1, le64_to_cpu(m.nbuckets));
+ goto err;
+ }
+
+ for (i = 0; i + 1 < nr; i++) {
+ if (b[i].end > b[i + 1].start) {
+ prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu",
+ b[i].start, b[i].end, b[i + 1].start, b[i + 1].end);
+ goto err;
+ }
+ }
+
+ ret = 0;
+err:
+ kfree(b);
+ return ret;
+}
+
+static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
+ unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal);
+
+ prt_printf(out, "Buckets: ");
+ for (i = 0; i < nr; i++)
+ prt_printf(out, " %llu-%llu",
+ le64_to_cpu(journal->d[i].start),
+ le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr));
+ prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = {
+ .validate = bch2_sb_journal_v2_validate,
+ .to_text = bch2_sb_journal_v2_to_text,
+};
+
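+/*
+ * Write the journal bucket list for @ca to the superblock, compacting runs of
+ * contiguous buckets into (start, nr) ranges - e.g. (illustrative) buckets
+ * {10, 11, 12, 20} become the two journal_v2 entries {10, 3} and {20, 1}.
+ */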
+int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca,
+ u64 *buckets, unsigned nr)
+{
+ struct bch_sb_field_journal_v2 *j;
+ unsigned i, dst = 0, nr_compacted = 1;
+
+ if (c)
+ lockdep_assert_held(&c->sb_lock);
+
+ if (!nr) {
+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2);
+ return 0;
+ }
+
+ for (i = 0; i + 1 < nr; i++)
+ if (buckets[i] + 1 != buckets[i + 1])
+ nr_compacted++;
+
+ j = bch2_sb_field_resize(&ca->disk_sb, journal_v2,
+ (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64));
+ if (!j)
+ return -BCH_ERR_ENOSPC_sb_journal;
+
+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
+
+ j->d[dst].start = cpu_to_le64(buckets[0]);
+ j->d[dst].nr = cpu_to_le64(1);
+
+ for (i = 1; i < nr; i++) {
+ if (buckets[i] == buckets[i - 1] + 1) {
+ le64_add_cpu(&j->d[dst].nr, 1);
+ } else {
+ dst++;
+ j->d[dst].start = cpu_to_le64(buckets[i]);
+ j->d[dst].nr = cpu_to_le64(1);
+ }
+ }
+
+ BUG_ON(dst + 1 != nr_compacted);
+ return 0;
+}
diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h
new file mode 100644
index 000000000000..ba40a7e8d90a
--- /dev/null
+++ b/fs/bcachefs/journal_sb.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include "super-io.h"
+#include "vstructs.h"
+
+static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
+{
+ return j
+ ? (__le64 *) vstruct_end(&j->field) - j->buckets
+ : 0;
+}
+
+static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j)
+{
+ if (!j)
+ return 0;
+
+ return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0];
+}
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal;
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2;
+
+int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned);
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
new file mode 100644
index 000000000000..f9d9aa95bf3a
--- /dev/null
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -0,0 +1,320 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_iter.h"
+#include "eytzinger.h"
+#include "journal_seq_blacklist.h"
+#include "super-io.h"
+
+/*
+ * journal_seq_blacklist machinery:
+ *
+ * To guarantee order of btree updates after a crash, we need to detect when a
+ * btree node entry (bset) is newer than the newest journal entry that was
+ * successfully written, and ignore it - effectively ignoring any btree updates
+ * that didn't make it into the journal.
+ *
+ * If we didn't do this, we might have two btree nodes, a and b, both with
+ * updates that weren't written to the journal yet: if b was updated after a,
+ * but b was flushed and not a - oops; on recovery we'll find that the updates
+ * to b happened, but not the updates to a that happened before it.
+ *
+ * Ignoring bsets that are newer than the newest journal entry is always safe,
+ * because everything they contain will also have been journalled - and must
+ * still be present in the journal on disk until a journal entry has been
+ * written _after_ that bset was written.
+ *
+ * To accomplish this, bsets record the newest journal sequence number they
+ * contain updates for; then, on startup, the btree code queries the journal
+ * code to ask "Is this sequence number newer than the newest journal entry? If
+ * so, ignore it."
+ *
+ * When this happens, we must blacklist that journal sequence number: the
+ * journal must not write any entries with that sequence number, and it must
+ * record that it was blacklisted so that a) on recovery we don't think we have
+ * missing journal entries and b) so that the btree code continues to ignore
+ * that bset, until that btree node is rewritten.
+ */
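+/*
+ * Illustrative example: if the newest journal entry we could read has seq 100
+ * but a bset records updates up to seq 103, those updates never made it to the
+ * journal; seqs in the range 101-103 are blacklisted so they are never written
+ * or reused, and the bset keeps being ignored until its node is rewritten.
+ */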
+
+static unsigned sb_blacklist_u64s(unsigned nr)
+{
+ struct bch_sb_field_journal_seq_blacklist *bl;
+
+ return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64);
+}
+
+static struct bch_sb_field_journal_seq_blacklist *
+blacklist_entry_try_merge(struct bch_fs *c,
+ struct bch_sb_field_journal_seq_blacklist *bl,
+ unsigned i)
+{
+ unsigned nr = blacklist_nr_entries(bl);
+
+ if (le64_to_cpu(bl->start[i].end) >=
+ le64_to_cpu(bl->start[i + 1].start)) {
+ bl->start[i].end = bl->start[i + 1].end;
+ --nr;
+ memmove(&bl->start[i],
+ &bl->start[i + 1],
+ sizeof(bl->start[0]) * (nr - i));
+
+ bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
+ sb_blacklist_u64s(nr));
+ BUG_ON(!bl);
+ }
+
+ return bl;
+}
+
+static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e,
+ u64 start, u64 end)
+{
+ return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start);
+}
+
+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
+{
+ struct bch_sb_field_journal_seq_blacklist *bl;
+ unsigned i, nr;
+ int ret = 0;
+
+ mutex_lock(&c->sb_lock);
+ bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
+ nr = blacklist_nr_entries(bl);
+
+ for (i = 0; i < nr; i++) {
+ struct journal_seq_blacklist_entry *e =
+ bl->start + i;
+
+ if (bl_entry_contig_or_overlaps(e, start, end)) {
+ e->start = cpu_to_le64(min(start, le64_to_cpu(e->start)));
+ e->end = cpu_to_le64(max(end, le64_to_cpu(e->end)));
+
+ if (i + 1 < nr)
+ bl = blacklist_entry_try_merge(c, bl, i);
+ if (i)
+ bl = blacklist_entry_try_merge(c, bl, i - 1);
+ goto out_write_sb;
+ }
+ }
+
+ bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
+ sb_blacklist_u64s(nr + 1));
+ if (!bl) {
+ ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist;
+ goto out;
+ }
+
+ bl->start[nr].start = cpu_to_le64(start);
+ bl->start[nr].end = cpu_to_le64(end);
+out_write_sb:
+ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
+
+ ret = bch2_write_super(c);
+out:
+ mutex_unlock(&c->sb_lock);
+
+ return ret ?: bch2_blacklist_table_initialize(c);
+}
+
+static int journal_seq_blacklist_table_cmp(const void *_l,
+ const void *_r, size_t size)
+{
+ const struct journal_seq_blacklist_table_entry *l = _l;
+ const struct journal_seq_blacklist_table_entry *r = _r;
+
+ return cmp_int(l->start, r->start);
+}
+
+bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq,
+ bool dirty)
+{
+ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
+ struct journal_seq_blacklist_table_entry search = { .start = seq };
+ int idx;
+
+ if (!t)
+ return false;
+
+ idx = eytzinger0_find_le(t->entries, t->nr,
+ sizeof(t->entries[0]),
+ journal_seq_blacklist_table_cmp,
+ &search);
+ if (idx < 0)
+ return false;
+
+ BUG_ON(t->entries[idx].start > seq);
+
+ if (seq >= t->entries[idx].end)
+ return false;
+
+ if (dirty)
+ t->entries[idx].dirty = true;
+ return true;
+}
+
+int bch2_blacklist_table_initialize(struct bch_fs *c)
+{
+ struct bch_sb_field_journal_seq_blacklist *bl =
+ bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
+ struct journal_seq_blacklist_table *t;
+ unsigned i, nr = blacklist_nr_entries(bl);
+
+ if (!bl)
+ return 0;
+
+ t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr,
+ GFP_KERNEL);
+ if (!t)
+ return -BCH_ERR_ENOMEM_blacklist_table_init;
+
+ t->nr = nr;
+
+ for (i = 0; i < nr; i++) {
+ t->entries[i].start = le64_to_cpu(bl->start[i].start);
+ t->entries[i].end = le64_to_cpu(bl->start[i].end);
+ }
+
+ eytzinger0_sort(t->entries,
+ t->nr,
+ sizeof(t->entries[0]),
+ journal_seq_blacklist_table_cmp,
+ NULL);
+
+ kfree(c->journal_seq_blacklist_table);
+ c->journal_seq_blacklist_table = t;
+ return 0;
+}
+
+static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_journal_seq_blacklist *bl =
+ field_to_type(f, journal_seq_blacklist);
+ unsigned i, nr = blacklist_nr_entries(bl);
+
+ for (i = 0; i < nr; i++) {
+ struct journal_seq_blacklist_entry *e = bl->start + i;
+
+ if (le64_to_cpu(e->start) >=
+ le64_to_cpu(e->end)) {
+ prt_printf(err, "entry %u start >= end (%llu >= %llu)",
+ i, le64_to_cpu(e->start), le64_to_cpu(e->end));
+ return -BCH_ERR_invalid_sb_journal_seq_blacklist;
+ }
+
+ if (i + 1 < nr &&
+ le64_to_cpu(e[0].end) >
+ le64_to_cpu(e[1].start)) {
+ prt_printf(err, "entry %u out of order with next entry (%llu > %llu)",
+ i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start));
+ return -BCH_ERR_invalid_sb_journal_seq_blacklist;
+ }
+ }
+
+ return 0;
+}
+
+static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out,
+ struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_journal_seq_blacklist *bl =
+ field_to_type(f, journal_seq_blacklist);
+ struct journal_seq_blacklist_entry *i;
+ unsigned nr = blacklist_nr_entries(bl);
+
+ for (i = bl->start; i < bl->start + nr; i++) {
+ if (i != bl->start)
+ prt_printf(out, " ");
+
+ prt_printf(out, "%llu-%llu",
+ le64_to_cpu(i->start),
+ le64_to_cpu(i->end));
+ }
+ prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
+ .validate = bch2_sb_journal_seq_blacklist_validate,
+ .to_text = bch2_sb_journal_seq_blacklist_to_text
+};
+
+void bch2_blacklist_entries_gc(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs,
+ journal_seq_blacklist_gc_work);
+ struct journal_seq_blacklist_table *t;
+ struct bch_sb_field_journal_seq_blacklist *bl;
+ struct journal_seq_blacklist_entry *src, *dst;
+ struct btree_trans *trans = bch2_trans_get(c);
+ unsigned i, nr, new_nr;
+ int ret;
+
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ struct btree_iter iter;
+ struct btree *b;
+
+ bch2_trans_node_iter_init(trans, &iter, i, POS_MIN,
+ 0, 0, BTREE_ITER_PREFETCH);
+retry:
+ bch2_trans_begin(trans);
+
+ b = bch2_btree_iter_peek_node(&iter);
+
+ while (!(ret = PTR_ERR_OR_ZERO(b)) &&
+ b &&
+ !test_bit(BCH_FS_STOPPING, &c->flags))
+ b = bch2_btree_iter_next_node(&iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ bch2_trans_put(trans);
+ if (ret)
+ return;
+
+ mutex_lock(&c->sb_lock);
+ bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
+ if (!bl)
+ goto out;
+
+ nr = blacklist_nr_entries(bl);
+ dst = bl->start;
+
+ t = c->journal_seq_blacklist_table;
+ BUG_ON(nr != t->nr);
+
+ for (src = bl->start, i = eytzinger0_first(t->nr);
+ src < bl->start + nr;
+ src++, i = eytzinger0_next(i, nr)) {
+ BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
+ BUG_ON(t->entries[i].end != le64_to_cpu(src->end));
+
+ if (t->entries[i].dirty)
+ *dst++ = *src;
+ }
+
+ new_nr = dst - bl->start;
+
+ bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr);
+
+ if (new_nr != nr) {
+ bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
+ new_nr ? sb_blacklist_u64s(new_nr) : 0);
+ BUG_ON(new_nr && !bl);
+
+ if (!new_nr)
+ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3));
+
+ bch2_write_super(c);
+ }
+out:
+ mutex_unlock(&c->sb_lock);
+}
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
new file mode 100644
index 000000000000..afb886ec8e25
--- /dev/null
+++ b/fs/bcachefs/journal_seq_blacklist.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
+#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
+
+static inline unsigned
+blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
+{
+ return bl
+ ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) /
+ sizeof(struct journal_seq_blacklist_entry))
+ : 0;
+}
+
+bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool);
+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64);
+int bch2_blacklist_table_initialize(struct bch_fs *);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist;
+
+void bch2_blacklist_entries_gc(struct work_struct *);
+
+#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
new file mode 100644
index 000000000000..a756b69582e3
--- /dev/null
+++ b/fs/bcachefs/journal_types.h
@@ -0,0 +1,319 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_TYPES_H
+#define _BCACHEFS_JOURNAL_TYPES_H
+
+#include <linux/cache.h>
+#include <linux/workqueue.h>
+
+#include "alloc_types.h"
+#include "super_types.h"
+#include "fifo.h"
+
+#define JOURNAL_BUF_BITS 2
+#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS)
+#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)
+
+/*
+ * We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to
+ * the journal that are being staged or in flight.
+ */
+struct journal_buf {
+ struct jset *data;
+
+ __BKEY_PADDED(key, BCH_REPLICAS_MAX);
+ struct bch_devs_list devs_written;
+
+ struct closure_waitlist wait;
+ u64 last_seq; /* copy of data->last_seq */
+ long expires;
+ u64 flush_time;
+
+ unsigned buf_size; /* size in bytes of @data */
+ unsigned sectors; /* maximum size for current entry */
+ unsigned disk_sectors; /* maximum size entry could have been, if
+ buf_size was bigger */
+ unsigned u64s_reserved;
+ bool noflush; /* write has already been kicked off, and was noflush */
+ bool must_flush; /* something wants a flush */
+ bool separate_flush;
+};
+
+/*
+ * Something that makes a journal entry dirty - i.e. a btree node that has to be
+ * flushed:
+ */
+
+enum journal_pin_type {
+ JOURNAL_PIN_btree,
+ JOURNAL_PIN_key_cache,
+ JOURNAL_PIN_other,
+ JOURNAL_PIN_NR,
+};
+
+struct journal_entry_pin_list {
+ struct list_head list[JOURNAL_PIN_NR];
+ struct list_head flushed;
+ atomic_t count;
+ struct bch_devs_list devs;
+};
+
+struct journal;
+struct journal_entry_pin;
+typedef int (*journal_pin_flush_fn)(struct journal *j,
+ struct journal_entry_pin *, u64);
+
+struct journal_entry_pin {
+ struct list_head list;
+ journal_pin_flush_fn flush;
+ u64 seq;
+};
+
+struct journal_res {
+ bool ref;
+ u8 idx;
+ u16 u64s;
+ u32 offset;
+ u64 seq;
+};
+
+union journal_res_state {
+ struct {
+ atomic64_t counter;
+ };
+
+ struct {
+ u64 v;
+ };
+
+ struct {
+ u64 cur_entry_offset:20,
+ idx:2,
+ unwritten_idx:2,
+ buf0_count:10,
+ buf1_count:10,
+ buf2_count:10,
+ buf3_count:10;
+ };
+};
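+
+/*
+ * The bitfields above pack into a single 64 bit word
+ * (20 + 2 + 2 + 4 * 10 = 64 bits), so the whole reservation state can be read
+ * and updated with one atomic64 operation.
+ */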
+
+/* bytes: */
+#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
+#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */
+
+/*
+ * We stash some journal state as sentinel values in cur_entry_offset:
+ * note - cur_entry_offset is in units of u64s
+ */
+#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1)
+
+#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
+#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX)
+
+struct journal_space {
+ /* Units of 512 byte sectors: */
+ unsigned next_entry; /* How big the next journal entry can be */
+ unsigned total;
+};
+
+enum journal_space_from {
+ journal_space_discarded,
+ journal_space_clean_ondisk,
+ journal_space_clean,
+ journal_space_total,
+ journal_space_nr,
+};
+
+enum journal_flags {
+ JOURNAL_REPLAY_DONE,
+ JOURNAL_STARTED,
+ JOURNAL_MAY_SKIP_FLUSH,
+ JOURNAL_NEED_FLUSH_WRITE,
+};
+
+/* Reasons we may fail to get a journal reservation: */
+#define JOURNAL_ERRORS() \
+ x(ok) \
+ x(blocked) \
+ x(max_in_flight) \
+ x(journal_full) \
+ x(journal_pin_full) \
+ x(journal_stuck) \
+ x(insufficient_devices)
+
+enum journal_errors {
+#define x(n) JOURNAL_ERR_##n,
+ JOURNAL_ERRORS()
+#undef x
+};
+
+typedef DARRAY(u64) darray_u64;
+
+/* Embedded in struct bch_fs */
+struct journal {
+ /* Fastpath stuff up front: */
+ struct {
+
+ union journal_res_state reservations;
+ enum bch_watermark watermark;
+
+ } __aligned(SMP_CACHE_BYTES);
+
+ unsigned long flags;
+
+ /* Max size of current journal entry */
+ unsigned cur_entry_u64s;
+ unsigned cur_entry_sectors;
+
+ /* Reserved space in journal entry to be used just prior to write */
+ unsigned entry_u64s_reserved;
+
+
+ /*
+ * JOURNAL_ERR_ok, or the reason we currently can't open a new journal
+ * entry (journal full, insufficient devices, ...):
+ */
+ enum journal_errors cur_entry_error;
+
+ unsigned buf_size_want;
+ /*
+ * We may queue up some things to be journalled (log messages) before
+ * the journal has actually started - stash them here:
+ */
+ darray_u64 early_journal_entries;
+
+ /*
+ * The journal entry buffers: one is currently open for new entries,
+ * while the others may be in flight, being written out.
+ */
+ struct journal_buf buf[JOURNAL_BUF_NR];
+
+ spinlock_t lock;
+
+ /* if nonzero, we may not open a new journal entry: */
+ unsigned blocked;
+
+ /* Used when waiting because the journal was full */
+ wait_queue_head_t wait;
+ struct closure_waitlist async_wait;
+ struct closure_waitlist preres_wait;
+
+ struct closure io;
+ struct delayed_work write_work;
+
+ /* Sequence number of most recent journal entry (last entry in @pin) */
+ atomic64_t seq;
+
+ /* seq, last_seq from the most recent journal entry successfully written */
+ u64 seq_ondisk;
+ u64 flushed_seq_ondisk;
+ u64 last_seq_ondisk;
+ u64 err_seq;
+ u64 last_empty_seq;
+
+ /*
+ * FIFO of journal entries whose btree updates have not yet been
+ * written out.
+ *
+ * Each entry is a reference count. The position in the FIFO is the
+ * entry's sequence number relative to @seq.
+ *
+ * The journal entry itself holds a reference count, put when the
+ * journal entry is written out. Each btree node modified by the journal
+ * entry also holds a reference count, put when the btree node is
+ * written.
+ *
+ * When a reference count reaches zero, the journal entry is no longer
+ * needed. When all journal entries in the oldest journal bucket are no
+ * longer needed, the bucket can be discarded and reused.
+ */
+ struct {
+ u64 front, back, size, mask;
+ struct journal_entry_pin_list *data;
+ } pin;
+
+ struct journal_space space[journal_space_nr];
+
+ u64 replay_journal_seq;
+ u64 replay_journal_seq_end;
+
+ struct write_point wp;
+ spinlock_t err_lock;
+
+ struct mutex reclaim_lock;
+ /*
+ * Used for waiting until journal reclaim has freed up space in the
+ * journal:
+ */
+ wait_queue_head_t reclaim_wait;
+ struct task_struct *reclaim_thread;
+ bool reclaim_kicked;
+ unsigned long next_reclaim;
+ u64 nr_direct_reclaim;
+ u64 nr_background_reclaim;
+
+ unsigned long last_flushed;
+ struct journal_entry_pin *flush_in_progress;
+ bool flush_in_progress_dropped;
+ wait_queue_head_t pin_flush_wait;
+
+ /* protects advancing ja->discard_idx: */
+ struct mutex discard_lock;
+ bool can_discard;
+
+ unsigned long last_flush_write;
+
+ u64 res_get_blocked_start;
+ u64 write_start_time;
+
+ u64 nr_flush_writes;
+ u64 nr_noflush_writes;
+
+ struct bch2_time_stats *flush_write_time;
+ struct bch2_time_stats *noflush_write_time;
+ struct bch2_time_stats *blocked_time;
+ struct bch2_time_stats *flush_seq_time;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map res_map;
+#endif
+} __aligned(SMP_CACHE_BYTES);
+
+/*
+ * Embedded in struct bch_dev. First three fields refer to the array of journal
+ * buckets, in bch_sb.
+ */
+struct journal_device {
+ /*
+ * For each journal bucket, contains the max sequence number of the
+ * journal writes it contains - so we know when a bucket can be reused.
+ */
+ u64 *bucket_seq;
+
+ unsigned sectors_free;
+
+ /*
+ * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx:
+ */
+ unsigned discard_idx; /* Next bucket to discard */
+ unsigned dirty_idx_ondisk;
+ unsigned dirty_idx;
+ unsigned cur_idx; /* Journal bucket we're currently writing to */
+ unsigned nr;
+
+ u64 *buckets;
+
+ /* Bio for journal reads/writes to this device */
+ struct bio *bio;
+
+ /* for bch_journal_read_device */
+ struct closure read;
+};
+
+/*
+ * journal_entry_res - reserve space in every journal entry:
+ */
+struct journal_entry_res {
+ unsigned u64s;
+};
+
+#endif /* _BCACHEFS_JOURNAL_TYPES_H */
diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c
new file mode 100644
index 000000000000..5699cd4873c8
--- /dev/null
+++ b/fs/bcachefs/keylist.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey.h"
+#include "keylist.h"
+
+int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
+ size_t nr_inline_u64s, size_t new_u64s)
+{
+ size_t oldsize = bch2_keylist_u64s(l);
+ size_t newsize = oldsize + new_u64s;
+ u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p;
+ u64 *new_keys;
+
+ newsize = roundup_pow_of_two(newsize);
+
+ if (newsize <= nr_inline_u64s ||
+ (old_buf && roundup_pow_of_two(oldsize) == newsize))
+ return 0;
+
+ new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOFS);
+ if (!new_keys)
+ return -ENOMEM;
+
+ if (!old_buf)
+ memcpy_u64s(new_keys, inline_u64s, oldsize);
+
+ l->keys_p = new_keys;
+ l->top_p = new_keys + oldsize;
+
+ return 0;
+}
+
+void bch2_keylist_pop_front(struct keylist *l)
+{
+ l->top_p -= bch2_keylist_front(l)->k.u64s;
+
+ memmove_u64s_down(l->keys,
+ bkey_next(l->keys),
+ bch2_keylist_u64s(l));
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_verify_keylist_sorted(struct keylist *l)
+{
+ struct bkey_i *k;
+
+ for_each_keylist_key(l, k)
+ BUG_ON(bkey_next(k) != l->top &&
+ bpos_ge(k->k.p, bkey_next(k)->k.p));
+}
+#endif
diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h
new file mode 100644
index 000000000000..fe759c7031e0
--- /dev/null
+++ b/fs/bcachefs/keylist.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_KEYLIST_H
+#define _BCACHEFS_KEYLIST_H
+
+#include "keylist_types.h"
+
+int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
+void bch2_keylist_pop_front(struct keylist *);
+
+static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys)
+{
+ l->top_p = l->keys_p = inline_keys;
+}
+
+static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
+{
+ if (l->keys_p != inline_keys)
+ kfree(l->keys_p);
+}
+
+static inline void bch2_keylist_push(struct keylist *l)
+{
+ l->top = bkey_next(l->top);
+}
+
+static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k)
+{
+ bkey_copy(l->top, k);
+ bch2_keylist_push(l);
+}
+
+static inline bool bch2_keylist_empty(struct keylist *l)
+{
+ return l->top == l->keys;
+}
+
+static inline size_t bch2_keylist_u64s(struct keylist *l)
+{
+ return l->top_p - l->keys_p;
+}
+
+static inline size_t bch2_keylist_bytes(struct keylist *l)
+{
+ return bch2_keylist_u64s(l) * sizeof(u64);
+}
+
+static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
+{
+ return l->keys;
+}
+
+#define for_each_keylist_key(_keylist, _k) \
+ for (_k = (_keylist)->keys; \
+ _k != (_keylist)->top; \
+ _k = bkey_next(_k))
+
+static inline u64 keylist_sectors(struct keylist *keys)
+{
+ struct bkey_i *k;
+ u64 ret = 0;
+
+ for_each_keylist_key(keys, k)
+ ret += k->k.size;
+
+ return ret;
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_verify_keylist_sorted(struct keylist *);
+#else
+static inline void bch2_verify_keylist_sorted(struct keylist *l) {}
+#endif
+
+#endif /* _BCACHEFS_KEYLIST_H */
diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h
new file mode 100644
index 000000000000..4b3ff7d8a875
--- /dev/null
+++ b/fs/bcachefs/keylist_types.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_KEYLIST_TYPES_H
+#define _BCACHEFS_KEYLIST_TYPES_H
+
+struct keylist {
+ union {
+ struct bkey_i *keys;
+ u64 *keys_p;
+ };
+ union {
+ struct bkey_i *top;
+ u64 *top_p;
+ };
+};
+
+#endif /* _BCACHEFS_KEYLIST_TYPES_H */
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
new file mode 100644
index 000000000000..8640f7dee0de
--- /dev/null
+++ b/fs/bcachefs/logged_ops.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "error.h"
+#include "io_misc.h"
+#include "logged_ops.h"
+#include "super.h"
+
+struct bch_logged_op_fn {
+ u8 type;
+ int (*resume)(struct btree_trans *, struct bkey_i *);
+};
+
+static const struct bch_logged_op_fn logged_op_fns[] = {
+#define x(n) { \
+ .type = KEY_TYPE_logged_op_##n, \
+ .resume = bch2_resume_logged_op_##n, \
+},
+ BCH_LOGGED_OPS()
+#undef x
+};
+
+static const struct bch_logged_op_fn *logged_op_fn(enum bch_bkey_type type)
+{
+ for (unsigned i = 0; i < ARRAY_SIZE(logged_op_fns); i++)
+ if (logged_op_fns[i].type == type)
+ return logged_op_fns + i;
+ return NULL;
+}
+
+static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type);
+ struct bkey_buf sk;
+ u32 restart_count = trans->restart_count;
+ int ret;
+
+ if (!fn)
+ return 0;
+
+ bch2_bkey_buf_init(&sk);
+ bch2_bkey_buf_reassemble(&sk, c, k);
+
+ ret = drop_locks_do(trans, (bch2_fs_lazy_rw(c), 0)) ?:
+ fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count);
+
+ bch2_bkey_buf_exit(&sk, c);
+ return ret;
+}
+
+int bch2_resume_logged_ops(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key2(trans, iter,
+ BTREE_ID_logged_ops, POS_MIN, BTREE_ITER_PREFETCH, k,
+ resume_logged_op(trans, &iter, k)));
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
+{
+ struct btree_iter iter;
+ int ret;
+
+ ret = bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_logged_ops, POS_MAX);
+ if (ret)
+ return ret;
+
+ k->k.p = iter.pos;
+
+ ret = bch2_trans_update(trans, &iter, k, 0);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
+{
+ return commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ __bch2_logged_op_start(trans, k));
+}
+
+void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
+{
+ int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
+ /*
+ * This needs to be a fatal error because we've left an unfinished
+ * operation in the logged ops btree.
+ *
+ * We should only ever see an error here if the filesystem has already
+ * been shut down, but make sure of that here:
+ */
+ if (ret) {
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+ bch2_fs_fatal_error(c, "%s: error deleting logged operation %s: %s",
+ __func__, buf.buf, bch2_err_str(ret));
+ printbuf_exit(&buf);
+ }
+}
diff --git a/fs/bcachefs/logged_ops.h b/fs/bcachefs/logged_ops.h
new file mode 100644
index 000000000000..4d1e786a27a8
--- /dev/null
+++ b/fs/bcachefs/logged_ops.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LOGGED_OPS_H
+#define _BCACHEFS_LOGGED_OPS_H
+
+#include "bkey.h"
+
+#define BCH_LOGGED_OPS() \
+ x(truncate) \
+ x(finsert)
+
+static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op)
+{
+ return bch2_btree_insert_nonextent(trans, BTREE_ID_logged_ops, op, 0);
+}
+
+int bch2_resume_logged_ops(struct bch_fs *);
+int bch2_logged_op_start(struct btree_trans *, struct bkey_i *);
+void bch2_logged_op_finish(struct btree_trans *, struct bkey_i *);
+
+#endif /* _BCACHEFS_LOGGED_OPS_H */
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
new file mode 100644
index 000000000000..a5cc0ed195d6
--- /dev/null
+++ b/fs/bcachefs/lru.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "error.h"
+#include "lru.h"
+#include "recovery.h"
+
+/* KEY_TYPE_lru is obsolete: */
+int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(!lru_pos_time(k.k->p), c, err,
+ lru_entry_at_time_0,
+ "lru entry at time=0");
+fsck_err:
+ return ret;
+}
+
+void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ const struct bch_lru *lru = bkey_s_c_to_lru(k).v;
+
+ prt_printf(out, "idx %llu", le64_to_cpu(lru->idx));
+}
+
+void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru)
+{
+ prt_printf(out, "%llu:%llu -> %llu:%llu",
+ lru_pos_id(lru),
+ lru_pos_time(lru),
+ u64_to_bucket(lru.offset).inode,
+ u64_to_bucket(lru.offset).offset);
+}
+
+static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
+ u64 dev_bucket, u64 time, bool set)
+{
+ return time
+ ? bch2_btree_bit_mod(trans, BTREE_ID_lru,
+ lru_pos(lru_id, dev_bucket, time), set)
+ : 0;
+}
+
+int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
+{
+ return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted);
+}
+
+int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time)
+{
+ return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set);
+}
+
+int bch2_lru_change(struct btree_trans *trans,
+ u16 lru_id, u64 dev_bucket,
+ u64 old_time, u64 new_time)
+{
+ if (old_time == new_time)
+ return 0;
+
+ return bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?:
+ bch2_lru_set(trans, lru_id, dev_bucket, new_time);
+}
+
+static const char * const bch2_lru_types[] = {
+#define x(n) #n,
+ BCH_LRU_TYPES()
+#undef x
+ NULL
+};
+
+static int bch2_check_lru_key(struct btree_trans *trans,
+ struct btree_iter *lru_iter,
+ struct bkey_s_c lru_k,
+ struct bpos *last_flushed_pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ enum bch_lru_type type = lru_type(lru_k);
+ struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset);
+ u64 idx;
+ int ret;
+
+ if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c,
+ lru_entry_to_invalid_bucket,
+ "lru key points to nonexistent device:bucket %llu:%llu",
+ alloc_pos.inode, alloc_pos.offset))
+ return bch2_btree_delete_at(trans, lru_iter, 0);
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ a = bch2_alloc_to_v4(k, &a_convert);
+
+ switch (type) {
+ case BCH_LRU_read:
+ idx = alloc_lru_idx_read(*a);
+ break;
+ case BCH_LRU_fragmentation:
+ idx = a->fragmentation_lru;
+ break;
+ }
+
+ if (lru_k.k->type != KEY_TYPE_set ||
+ lru_pos_time(lru_k.k->p) != idx) {
+ if (!bpos_eq(*last_flushed_pos, lru_k.k->p)) {
+ *last_flushed_pos = lru_k.k->p;
+ ret = bch2_btree_write_buffer_flush_sync(trans) ?:
+ -BCH_ERR_transaction_restart_write_buffer_flush;
+ goto out;
+ }
+
+ if (c->opts.reconstruct_alloc ||
+ fsck_err(c, lru_entry_bad,
+ "incorrect lru entry: lru %s time %llu\n"
+ " %s\n"
+ " for %s",
+ bch2_lru_types[type],
+ lru_pos_time(lru_k.k->p),
+ (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
+ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf)))
+ ret = bch2_btree_delete_at(trans, lru_iter, 0);
+ }
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ return ret;
+}
+
+int bch2_check_lrus(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bpos last_flushed_pos = POS_MIN;
+ int ret = 0;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ bch2_check_lru_key(trans, &iter, k, &last_flushed_pos)));
+ if (ret)
+ bch_err_fn(c, ret);
+	return ret;
+}
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
new file mode 100644
index 000000000000..429dca816df5
--- /dev/null
+++ b/fs/bcachefs/lru.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LRU_H
+#define _BCACHEFS_LRU_H
+
+#define LRU_TIME_BITS 48
+#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
+
+static inline u64 lru_pos_id(struct bpos pos)
+{
+ return pos.inode >> LRU_TIME_BITS;
+}
+
+static inline u64 lru_pos_time(struct bpos pos)
+{
+ return pos.inode & ~(~0ULL << LRU_TIME_BITS);
+}
+
+static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time)
+{
+ struct bpos pos = POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket);
+
+ EBUG_ON(time > LRU_TIME_MAX);
+ EBUG_ON(lru_pos_id(pos) != lru_id);
+ EBUG_ON(lru_pos_time(pos) != time);
+ EBUG_ON(pos.offset != dev_bucket);
+
+ return pos;
+}
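+
+/*
+ * Example (illustrative): lru_pos(4, dev_bucket, 1000) packs the lru id into
+ * the top 16 bits of the inode field and the time into the low 48 bits:
+ *
+ *	pos.inode  == ((u64) 4 << 48) | 1000;
+ *	pos.offset == dev_bucket;
+ *
+ * lru_pos_id() and lru_pos_time() recover the two halves.
+ */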
+
+#define BCH_LRU_TYPES() \
+ x(read) \
+ x(fragmentation)
+
+enum bch_lru_type {
+#define x(n) BCH_LRU_##n,
+ BCH_LRU_TYPES()
+#undef x
+};
+
+#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1)
+
+static inline enum bch_lru_type lru_type(struct bkey_s_c l)
+{
+ u16 lru_id = l.k->p.inode >> 48;
+
+ if (lru_id == BCH_LRU_FRAGMENTATION_START)
+ return BCH_LRU_fragmentation;
+ return BCH_LRU_read;
+}
+
+int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
+
+#define bch2_bkey_ops_lru ((struct bkey_ops) { \
+ .key_invalid = bch2_lru_invalid, \
+ .val_to_text = bch2_lru_to_text, \
+ .min_val_size = 8, \
+})
+
+int bch2_lru_del(struct btree_trans *, u16, u64, u64);
+int bch2_lru_set(struct btree_trans *, u16, u64, u64);
+int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
+
+int bch2_check_lrus(struct bch_fs *);
+
+#endif /* _BCACHEFS_LRU_H */
diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c
new file mode 100644
index 000000000000..1f0801e2e565
--- /dev/null
+++ b/fs/bcachefs/mean_and_variance.c
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Functions for incremental mean and variance.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * Copyright © 2022 Daniel B. Hill
+ *
+ * Author: Daniel B. Hill <daniel@gluo.nz>
+ *
+ * Description:
+ *
+ * This includes some incremental algorithms for mean and variance calculation.
+ *
+ * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
+ *
+ * Create a struct and, if it's the weighted variant, set the weight field
+ * (weight = 2^k).
+ *
+ * Use mean_and_variance[_weighted]_update() on the struct to update its state.
+ *
+ * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean
+ * and variance; some computation is deferred to these functions for
+ * performance reasons.
+ *
+ * See mean_and_variance_test.c for examples of usage.
+ *
+ * DO NOT access the mean and variance fields of the weighted variants directly.
+ * DO NOT change the weight after calling update.
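+ *
+ * A minimal usage sketch (illustrative; values match
+ * mean_and_variance_basic_test() in mean_and_variance_test.c):
+ *
+ *	struct mean_and_variance s = {};
+ *
+ *	mean_and_variance_update(&s, 2);
+ *	mean_and_variance_update(&s, 2);
+ *	mean_and_variance_update(&s, 4);
+ *	mean_and_variance_update(&s, 4);
+ *
+ *	mean_and_variance_get_mean(s);		// 3
+ *	mean_and_variance_get_variance(s);	// 1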
+ */
+
+#include <linux/bug.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
+#include <linux/limits.h>
+#include <linux/math.h>
+#include <linux/math64.h>
+#include <linux/module.h>
+
+#include "mean_and_variance.h"
+
+u128_u u128_div(u128_u n, u64 d)
+{
+ u128_u r;
+ u64 rem;
+ u64 hi = u128_hi(n);
+ u64 lo = u128_lo(n);
+ u64 h = hi & ((u64) U32_MAX << 32);
+ u64 l = (hi & (u64) U32_MAX) << 32;
+
+ r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64);
+ r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32));
+ r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem)));
+ return r;
+}
+EXPORT_SYMBOL_GPL(u128_div);
+
+/**
+ * mean_and_variance_get_mean() - get mean from @s
+ */
+s64 mean_and_variance_get_mean(struct mean_and_variance s)
+{
+ return s.n ? div64_u64(s.sum, s.n) : 0;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_mean);
+
+/**
+ * mean_and_variance_get_variance() - get variance from @s1
+ *
+ * see linked pdf equation 12.
+ */
+u64 mean_and_variance_get_variance(struct mean_and_variance s1)
+{
+ if (s1.n) {
+ u128_u s2 = u128_div(s1.sum_squares, s1.n);
+ u64 s3 = abs(mean_and_variance_get_mean(s1));
+
+ return u128_lo(u128_sub(s2, u128_square(s3)));
+ } else {
+ return 0;
+ }
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_variance);
+
+/**
+ * mean_and_variance_get_stddev() - get standard deviation from @s
+ */
+u32 mean_and_variance_get_stddev(struct mean_and_variance s)
+{
+ return int_sqrt64(mean_and_variance_get_variance(s));
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
+
+/**
+ * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
+ * @s: state to update
+ * @x: new sample
+ *
+ * see linked pdf: function derived from equations 140-143 where alpha = 2^w.
+ * values are stored bitshifted for performance and added precision.
+ */
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x)
+{
+ // previous weighted variance.
+ u8 w = s->weight;
+ u64 var_w0 = s->variance;
+ // new value weighted.
+ s64 x_w = x << w;
+ s64 diff_w = x_w - s->mean;
+ s64 diff = fast_divpow2(diff_w, w);
+ // new mean weighted.
+ s64 u_w1 = s->mean + diff;
+
+ if (!s->init) {
+ s->mean = x_w;
+ s->variance = 0;
+ } else {
+ s->mean = u_w1;
+ s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
+ }
+ s->init = true;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
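+
+/*
+ * A minimal usage sketch (illustrative; values match
+ * mean_and_variance_weighted_test() in mean_and_variance_test.c):
+ *
+ *	struct mean_and_variance_weighted s = { .weight = 2 };
+ *
+ *	mean_and_variance_weighted_update(&s, 10);
+ *	mean_and_variance_weighted_update(&s, 20);
+ *
+ *	mean_and_variance_weighted_get_mean(s);		// 12
+ *	mean_and_variance_weighted_get_variance(s);	// 18
+ */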
+
+/**
+ * mean_and_variance_weighted_get_mean() - get mean from @s
+ */
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
+{
+ return fast_divpow2(s.mean, s.weight);
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
+
+/**
+ * mean_and_variance_weighted_get_variance() - get variance from @s
+ */
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
+{
+	// always positive, so we don't need fast_divpow2
+ return s.variance >> s.weight;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
+
+/**
+ * mean_and_variance_weighted_get_stddev() - get standard deviation from @s
+ */
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s)
+{
+ return int_sqrt64(mean_and_variance_weighted_get_variance(s));
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
+
+MODULE_AUTHOR("Daniel B. Hill");
+MODULE_LICENSE("GPL");
diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h
new file mode 100644
index 000000000000..647505010b39
--- /dev/null
+++ b/fs/bcachefs/mean_and_variance.h
@@ -0,0 +1,198 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef MEAN_AND_VARIANCE_H_
+#define MEAN_AND_VARIANCE_H_
+
+#include <linux/types.h>
+#include <linux/limits.h>
+#include <linux/math.h>
+#include <linux/math64.h>
+
+#define SQRT_U64_MAX 4294967295ULL
+
+/*
+ * u128_u: u128 user mode, because not all architectures support a real int128
+ * type
+ */
+
+#ifdef __SIZEOF_INT128__
+
+typedef struct {
+ unsigned __int128 v;
+} __aligned(16) u128_u;
+
+static inline u128_u u64_to_u128(u64 a)
+{
+ return (u128_u) { .v = a };
+}
+
+static inline u64 u128_lo(u128_u a)
+{
+ return a.v;
+}
+
+static inline u64 u128_hi(u128_u a)
+{
+ return a.v >> 64;
+}
+
+static inline u128_u u128_add(u128_u a, u128_u b)
+{
+ a.v += b.v;
+ return a;
+}
+
+static inline u128_u u128_sub(u128_u a, u128_u b)
+{
+ a.v -= b.v;
+ return a;
+}
+
+static inline u128_u u128_shl(u128_u a, s8 shift)
+{
+ a.v <<= shift;
+ return a;
+}
+
+static inline u128_u u128_square(u64 a)
+{
+ u128_u b = u64_to_u128(a);
+
+ b.v *= b.v;
+ return b;
+}
+
+#else
+
+typedef struct {
+ u64 hi, lo;
+} __aligned(16) u128_u;
+
+/* conversions */
+
+static inline u128_u u64_to_u128(u64 a)
+{
+ return (u128_u) { .lo = a };
+}
+
+static inline u64 u128_lo(u128_u a)
+{
+ return a.lo;
+}
+
+static inline u64 u128_hi(u128_u a)
+{
+ return a.hi;
+}
+
+/* arithmetic */
+
+static inline u128_u u128_add(u128_u a, u128_u b)
+{
+ u128_u c;
+
+ c.lo = a.lo + b.lo;
+ c.hi = a.hi + b.hi + (c.lo < a.lo);
+ return c;
+}
+
+static inline u128_u u128_sub(u128_u a, u128_u b)
+{
+ u128_u c;
+
+ c.lo = a.lo - b.lo;
+ c.hi = a.hi - b.hi - (c.lo > a.lo);
+ return c;
+}
+
+static inline u128_u u128_shl(u128_u i, s8 shift)
+{
+ u128_u r;
+
+ r.lo = i.lo << shift;
+ if (shift < 64)
+ r.hi = (i.hi << shift) | (i.lo >> (64 - shift));
+ else {
+ r.hi = i.lo << (shift - 64);
+ r.lo = 0;
+ }
+ return r;
+}
+
+static inline u128_u u128_square(u64 i)
+{
+ u128_u r;
+ u64 h = i >> 32, l = i & U32_MAX;
+
+ r = u128_shl(u64_to_u128(h*h), 64);
+ r = u128_add(r, u128_shl(u64_to_u128(h*l), 32));
+ r = u128_add(r, u128_shl(u64_to_u128(l*h), 32));
+ r = u128_add(r, u64_to_u128(l*l));
+ return r;
+}
+
+#endif
+
+static inline u128_u u64s_to_u128(u64 hi, u64 lo)
+{
+ u128_u c = u64_to_u128(hi);
+
+ c = u128_shl(c, 64);
+ c = u128_add(c, u64_to_u128(lo));
+ return c;
+}
+
+u128_u u128_div(u128_u n, u64 d);
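+
+/*
+ * e.g. (illustrative, cf. mean_and_variance_u128_basic_test()):
+ * u64s_to_u128(1, 0) represents 2^64, and u128_div(u64s_to_u128(1, 0), 2)
+ * gives a result with u128_hi() == 0 and u128_lo() == 1ULL << 63.
+ */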
+
+struct mean_and_variance {
+ s64 n;
+ s64 sum;
+ u128_u sum_squares;
+};
+
+/* exponentially weighted variant */
+struct mean_and_variance_weighted {
+ bool init;
+	u8 weight; /* base 2 logarithm */
+ s64 mean;
+ u64 variance;
+};
+
+/**
+ * fast_divpow2() - fast approximation for n / (1 << d)
+ * @n: numerator
+ * @d: the power of 2 denominator.
+ *
+ * note: this rounds towards 0.
+ */
+static inline s64 fast_divpow2(s64 n, u8 d)
+{
+ return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
+}
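+
+/*
+ * e.g. (illustrative): fast_divpow2(-7, 1) == -3, matching truncation towards
+ * zero, whereas a plain arithmetic shift (-7 >> 1) would give -4.
+ */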
+
+/**
+ * mean_and_variance_update() - update a mean_and_variance struct @s with a new sample @v
+ * @s: the mean_and_variance to update.
+ * @v: the new sample.
+ *
+ * see linked pdf equation 12.
+ */
+static inline void
+mean_and_variance_update(struct mean_and_variance *s, s64 v)
+{
+ s->n++;
+ s->sum += v;
+ s->sum_squares = u128_add(s->sum_squares, u128_square(abs(v)));
+}
+
+s64 mean_and_variance_get_mean(struct mean_and_variance s);
+u64 mean_and_variance_get_variance(struct mean_and_variance s1);
+u32 mean_and_variance_get_stddev(struct mean_and_variance s);
+
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v);
+
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
+
+#endif // MEAN_AND_VARIANCE_H_
diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c
new file mode 100644
index 000000000000..019583c3ca0e
--- /dev/null
+++ b/fs/bcachefs/mean_and_variance_test.c
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <kunit/test.h>
+
+#include "mean_and_variance.h"
+
+#define MAX_SQR (SQRT_U64_MAX*SQRT_U64_MAX)
+
+static void mean_and_variance_basic_test(struct kunit *test)
+{
+ struct mean_and_variance s = {};
+
+ mean_and_variance_update(&s, 2);
+ mean_and_variance_update(&s, 2);
+
+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 2);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 0);
+ KUNIT_EXPECT_EQ(test, s.n, 2);
+
+ mean_and_variance_update(&s, 4);
+ mean_and_variance_update(&s, 4);
+
+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 3);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 1);
+ KUNIT_EXPECT_EQ(test, s.n, 4);
+}
+
+/*
+ * Test values computed using a spreadsheet from the pseudocode at the bottom of:
+ * https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
+ */
+
+static void mean_and_variance_weighted_test(struct kunit *test)
+{
+ struct mean_and_variance_weighted s = { .weight = 2 };
+
+ mean_and_variance_weighted_update(&s, 10);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0);
+
+ mean_and_variance_weighted_update(&s, 20);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18);
+
+ mean_and_variance_weighted_update(&s, 30);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72);
+
+ s = (struct mean_and_variance_weighted) { .weight = 2 };
+
+ mean_and_variance_weighted_update(&s, -10);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -10);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0);
+
+ mean_and_variance_weighted_update(&s, -20);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18);
+
+ mean_and_variance_weighted_update(&s, -30);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72);
+}
+
+static void mean_and_variance_weighted_advanced_test(struct kunit *test)
+{
+ struct mean_and_variance_weighted s = { .weight = 8 };
+ s64 i;
+
+ for (i = 10; i <= 100; i += 10)
+ mean_and_variance_weighted_update(&s, i);
+
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107);
+
+ s = (struct mean_and_variance_weighted) { .weight = 8 };
+
+ for (i = -10; i >= -100; i -= 10)
+ mean_and_variance_weighted_update(&s, i);
+
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107);
+}
+
+static void do_mean_and_variance_test(struct kunit *test,
+ s64 initial_value,
+ s64 initial_n,
+ s64 n,
+ unsigned weight,
+ s64 *data,
+ s64 *mean,
+ s64 *stddev,
+ s64 *weighted_mean,
+ s64 *weighted_stddev)
+{
+ struct mean_and_variance mv = {};
+ struct mean_and_variance_weighted vw = { .weight = weight };
+
+ for (unsigned i = 0; i < initial_n; i++) {
+ mean_and_variance_update(&mv, initial_value);
+ mean_and_variance_weighted_update(&vw, initial_value);
+
+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), initial_value);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),0);
+ }
+
+ for (unsigned i = 0; i < n; i++) {
+ mean_and_variance_update(&mv, data[i]);
+ mean_and_variance_weighted_update(&vw, data[i]);
+
+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), weighted_mean[i]);
+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),weighted_stddev[i]);
+ }
+
+ KUNIT_EXPECT_EQ(test, mv.n, initial_n + n);
+}
+
+/* Test behaviour with a single outlier, then back to steady state: */
+static void mean_and_variance_test_1(struct kunit *test)
+{
+ s64 d[] = { 100, 10, 10, 10, 10, 10, 10 };
+ s64 mean[] = { 22, 21, 20, 19, 18, 17, 16 };
+ s64 stddev[] = { 32, 29, 28, 27, 26, 25, 24 };
+ s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 };
+ s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 };
+
+ do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
+ d, mean, stddev, weighted_mean, weighted_stddev);
+}
+
+static void mean_and_variance_test_2(struct kunit *test)
+{
+ s64 d[] = { 100, 10, 10, 10, 10, 10, 10 };
+ s64 mean[] = { 10, 10, 10, 10, 10, 10, 10 };
+ s64 stddev[] = { 9, 9, 9, 9, 9, 9, 9 };
+ s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 };
+ s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 };
+
+ do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
+ d, mean, stddev, weighted_mean, weighted_stddev);
+}
+
+/* Test behaviour where we switch from one steady state to another: */
+static void mean_and_variance_test_3(struct kunit *test)
+{
+ s64 d[] = { 100, 100, 100, 100, 100 };
+ s64 mean[] = { 22, 32, 40, 46, 50 };
+ s64 stddev[] = { 32, 39, 42, 44, 45 };
+ s64 weighted_mean[] = { 32, 49, 61, 71, 78 };
+ s64 weighted_stddev[] = { 38, 44, 44, 41, 38 };
+
+ do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
+ d, mean, stddev, weighted_mean, weighted_stddev);
+}
+
+static void mean_and_variance_test_4(struct kunit *test)
+{
+ s64 d[] = { 100, 100, 100, 100, 100 };
+ s64 mean[] = { 10, 11, 12, 13, 14 };
+ s64 stddev[] = { 9, 13, 15, 17, 19 };
+ s64 weighted_mean[] = { 32, 49, 61, 71, 78 };
+ s64 weighted_stddev[] = { 38, 44, 44, 41, 38 };
+
+ do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
+ d, mean, stddev, weighted_mean, weighted_stddev);
+}
+
+static void mean_and_variance_fast_divpow2(struct kunit *test)
+{
+ s64 i;
+ u8 d;
+
+ for (i = 0; i < 100; i++) {
+ d = 0;
+ KUNIT_EXPECT_EQ(test, fast_divpow2(i, d), div_u64(i, 1LLU << d));
+ KUNIT_EXPECT_EQ(test, abs(fast_divpow2(-i, d)), div_u64(i, 1LLU << d));
+ for (d = 1; d < 32; d++) {
+ KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(i, d)),
+ div_u64(i, 1 << d), "%lld %u", i, d);
+ KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(-i, d)),
+ div_u64(i, 1 << d), "%lld %u", -i, d);
+ }
+ }
+}
+
+static void mean_and_variance_u128_basic_test(struct kunit *test)
+{
+ u128_u a = u64s_to_u128(0, U64_MAX);
+ u128_u a1 = u64s_to_u128(0, 1);
+ u128_u b = u64s_to_u128(1, 0);
+ u128_u c = u64s_to_u128(0, 1LLU << 63);
+ u128_u c2 = u64s_to_u128(U64_MAX, U64_MAX);
+
+ KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a, a1)), 1);
+ KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a, a1)), 0);
+ KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a1, a)), 1);
+ KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a1, a)), 0);
+
+ KUNIT_EXPECT_EQ(test, u128_lo(u128_sub(b, a1)), U64_MAX);
+ KUNIT_EXPECT_EQ(test, u128_hi(u128_sub(b, a1)), 0);
+
+ KUNIT_EXPECT_EQ(test, u128_hi(u128_shl(c, 1)), 1);
+ KUNIT_EXPECT_EQ(test, u128_lo(u128_shl(c, 1)), 0);
+
+ KUNIT_EXPECT_EQ(test, u128_hi(u128_square(U64_MAX)), U64_MAX - 1);
+ KUNIT_EXPECT_EQ(test, u128_lo(u128_square(U64_MAX)), 1);
+
+ KUNIT_EXPECT_EQ(test, u128_lo(u128_div(b, 2)), 1LLU << 63);
+
+ KUNIT_EXPECT_EQ(test, u128_hi(u128_div(c2, 2)), U64_MAX >> 1);
+ KUNIT_EXPECT_EQ(test, u128_lo(u128_div(c2, 2)), U64_MAX);
+
+ KUNIT_EXPECT_EQ(test, u128_hi(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U32_MAX >> 1);
+ KUNIT_EXPECT_EQ(test, u128_lo(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U64_MAX << 31);
+}
+
+static struct kunit_case mean_and_variance_test_cases[] = {
+ KUNIT_CASE(mean_and_variance_fast_divpow2),
+ KUNIT_CASE(mean_and_variance_u128_basic_test),
+ KUNIT_CASE(mean_and_variance_basic_test),
+ KUNIT_CASE(mean_and_variance_weighted_test),
+ KUNIT_CASE(mean_and_variance_weighted_advanced_test),
+ KUNIT_CASE(mean_and_variance_test_1),
+ KUNIT_CASE(mean_and_variance_test_2),
+ KUNIT_CASE(mean_and_variance_test_3),
+ KUNIT_CASE(mean_and_variance_test_4),
+ {}
+};
+
+static struct kunit_suite mean_and_variance_test_suite = {
+ .name = "mean and variance tests",
+ .test_cases = mean_and_variance_test_cases
+};
+
+kunit_test_suite(mean_and_variance_test_suite);
+
+MODULE_AUTHOR("Daniel B. Hill");
+MODULE_LICENSE("GPL");
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
new file mode 100644
index 000000000000..e3a51f6d6c9b
--- /dev/null
+++ b/fs/bcachefs/migrate.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Code for moving data off a device.
+ */
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "errcode.h"
+#include "extents.h"
+#include "io_write.h"
+#include "journal.h"
+#include "keylist.h"
+#include "migrate.h"
+#include "move.h"
+#include "replicas.h"
+#include "super-io.h"
+
+static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
+ unsigned dev_idx, int flags, bool metadata)
+{
+ unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
+ unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
+ unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
+ unsigned nr_good;
+
+ bch2_bkey_drop_device(k, dev_idx);
+
+ nr_good = bch2_bkey_durability(c, k.s_c);
+ if ((!nr_good && !(flags & lost)) ||
+ (nr_good < replicas && !(flags & degraded)))
+ return -EINVAL;
+
+ return 0;
+}
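+
+/*
+ * e.g. (illustrative): with c->opts.data_replicas == 2, dropping a pointer
+ * that leaves nr_good == 1 is only allowed when BCH_FORCE_IF_DATA_DEGRADED is
+ * set, and nr_good == 0 additionally requires BCH_FORCE_IF_DATA_LOST;
+ * otherwise drop_dev_ptrs() fails with -EINVAL.
+ */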
+
+static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ unsigned dev_idx,
+ int flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i *n;
+ int ret;
+
+ if (!bch2_bkey_has_device_c(k, dev_idx))
+ return 0;
+
+ n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false);
+ if (ret)
+ return ret;
+
+ /*
+ * If the new extent no longer has any pointers, bch2_extent_normalize()
+ * will do the appropriate thing with it (turning it into a
+ * KEY_TYPE_error key, or just a discard if it was a cached extent)
+ */
+ bch2_extent_normalize(c, bkey_i_to_s(n));
+
+ /*
+ * Since we're not inserting through an extent iterator
+ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+ * we aren't using the extent overwrite path to delete, we're
+ * just using the normal key deletion path:
+ */
+ if (bkey_deleted(&n->k))
+ n->k.size = 0;
+ return 0;
+}
+
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ enum btree_id id;
+ int ret = 0;
+
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ if (!btree_type_has_ptrs(id))
+ continue;
+
+ ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
+ if (ret)
+ break;
+ }
+
+ bch2_trans_put(trans);
+
+ return ret;
+}
+
+static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct closure cl;
+ struct btree *b;
+ struct bkey_buf k;
+ unsigned id;
+ int ret;
+
+ /* don't handle this yet: */
+ if (flags & BCH_FORCE_IF_METADATA_LOST)
+ return -EINVAL;
+
+ trans = bch2_trans_get(c);
+ bch2_bkey_buf_init(&k);
+ closure_init_stack(&cl);
+
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
+ BTREE_ITER_PREFETCH);
+retry:
+ ret = 0;
+ while (bch2_trans_begin(trans),
+ (b = bch2_btree_iter_peek_node(&iter)) &&
+ !(ret = PTR_ERR_OR_ZERO(b))) {
+ if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
+ goto next;
+
+ bch2_bkey_buf_copy(&k, c, &b->key);
+
+ ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
+ dev_idx, flags, true);
+ if (ret) {
+ bch_err(c, "Cannot drop device without losing data");
+ break;
+ }
+
+ ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ ret = 0;
+ continue;
+ }
+
+ if (ret) {
+ bch_err_msg(c, ret, "updating btree node key");
+ break;
+ }
+next:
+ bch2_btree_iter_next_node(&iter);
+ }
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ goto err;
+ }
+
+ bch2_btree_interior_updates_flush(c);
+ ret = 0;
+err:
+ bch2_bkey_buf_exit(&k, c);
+ bch2_trans_put(trans);
+
+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
+
+ return ret;
+}
+
+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+ return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
+ bch2_dev_metadata_drop(c, dev_idx, flags);
+}
diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h
new file mode 100644
index 000000000000..027efaa0d575
--- /dev/null
+++ b/fs/bcachefs/migrate.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_MIGRATE_H
+#define _BCACHEFS_MIGRATE_H
+
+int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
+
+#endif /* _BCACHEFS_MIGRATE_H */
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
new file mode 100644
index 000000000000..54830ee0ed88
--- /dev/null
+++ b/fs/bcachefs/move.c
@@ -0,0 +1,1154 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "backpointers.h"
+#include "bkey_buf.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "errcode.h"
+#include "error.h"
+#include "inode.h"
+#include "io_read.h"
+#include "io_write.h"
+#include "journal_reclaim.h"
+#include "keylist.h"
+#include "move.h"
+#include "replicas.h"
+#include "snapshot.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/ioprio.h>
+#include <linux/kthread.h>
+
+static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
+{
+ if (trace_move_extent_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ trace_move_extent(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+
+static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
+{
+ if (trace_move_extent_read_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ trace_move_extent_read(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+
+struct moving_io {
+ struct list_head read_list;
+ struct list_head io_list;
+ struct move_bucket_in_flight *b;
+ struct closure cl;
+ bool read_completed;
+
+ unsigned read_sectors;
+ unsigned write_sectors;
+
+ struct bch_read_bio rbio;
+
+ struct data_update write;
+ /* Must be last since it is variable size */
+ struct bio_vec bi_inline_vecs[0];
+};
+
+static void move_free(struct moving_io *io)
+{
+ struct moving_context *ctxt = io->write.ctxt;
+
+ if (io->b)
+ atomic_dec(&io->b->count);
+
+ bch2_data_update_exit(&io->write);
+
+ mutex_lock(&ctxt->lock);
+ list_del(&io->io_list);
+ wake_up(&ctxt->wait);
+ mutex_unlock(&ctxt->lock);
+
+ kfree(io);
+}
+
+static void move_write_done(struct bch_write_op *op)
+{
+ struct moving_io *io = container_of(op, struct moving_io, write.op);
+ struct moving_context *ctxt = io->write.ctxt;
+
+ if (io->write.op.error)
+ ctxt->write_error = true;
+
+ atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
+ atomic_dec(&io->write.ctxt->write_ios);
+ move_free(io);
+ closure_put(&ctxt->cl);
+}
+
+static void move_write(struct moving_io *io)
+{
+ if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
+ move_free(io);
+ return;
+ }
+
+ closure_get(&io->write.ctxt->cl);
+ atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
+ atomic_inc(&io->write.ctxt->write_ios);
+
+ bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
+}
+
+struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
+{
+ struct moving_io *io =
+ list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);
+
+ return io && io->read_completed ? io : NULL;
+}
+
+static void move_read_endio(struct bio *bio)
+{
+ struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
+ struct moving_context *ctxt = io->write.ctxt;
+
+ atomic_sub(io->read_sectors, &ctxt->read_sectors);
+ atomic_dec(&ctxt->read_ios);
+ io->read_completed = true;
+
+ wake_up(&ctxt->wait);
+ closure_put(&ctxt->cl);
+}
+
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
+{
+ struct moving_io *io;
+
+ while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
+ bch2_trans_unlock_long(ctxt->trans);
+ list_del(&io->read_list);
+ move_write(io);
+ }
+}
+
+void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
+{
+ unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
+
+ move_ctxt_wait_event(ctxt,
+ !atomic_read(&ctxt->write_sectors) ||
+ atomic_read(&ctxt->write_sectors) != sectors_pending);
+}
+
+static void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
+{
+ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
+ bch2_trans_unlock_long(ctxt->trans);
+ closure_sync(&ctxt->cl);
+}
+
+void bch2_moving_ctxt_exit(struct moving_context *ctxt)
+{
+ struct bch_fs *c = ctxt->trans->c;
+
+ bch2_moving_ctxt_flush_all(ctxt);
+
+ EBUG_ON(atomic_read(&ctxt->write_sectors));
+ EBUG_ON(atomic_read(&ctxt->write_ios));
+ EBUG_ON(atomic_read(&ctxt->read_sectors));
+ EBUG_ON(atomic_read(&ctxt->read_ios));
+
+ mutex_lock(&c->moving_context_lock);
+ list_del(&ctxt->list);
+ mutex_unlock(&c->moving_context_lock);
+
+ bch2_trans_put(ctxt->trans);
+ memset(ctxt, 0, sizeof(*ctxt));
+}
+
+void bch2_moving_ctxt_init(struct moving_context *ctxt,
+ struct bch_fs *c,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc)
+{
+ memset(ctxt, 0, sizeof(*ctxt));
+
+ ctxt->trans = bch2_trans_get(c);
+ ctxt->fn = (void *) _RET_IP_;
+ ctxt->rate = rate;
+ ctxt->stats = stats;
+ ctxt->wp = wp;
+ ctxt->wait_on_copygc = wait_on_copygc;
+
+ closure_init_stack(&ctxt->cl);
+
+ mutex_init(&ctxt->lock);
+ INIT_LIST_HEAD(&ctxt->reads);
+ INIT_LIST_HEAD(&ctxt->ios);
+ init_waitqueue_head(&ctxt->wait);
+
+ mutex_lock(&c->moving_context_lock);
+ list_add(&ctxt->list, &c->moving_context_list);
+ mutex_unlock(&c->moving_context_lock);
+}
+
+void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
+{
+ trace_move_data(c, stats);
+}
+
+void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
+{
+ memset(stats, 0, sizeof(*stats));
+ stats->data_type = BCH_DATA_user;
+ scnprintf(stats->name, sizeof(stats->name), "%s", name);
+}
+
+int bch2_move_extent(struct moving_context *ctxt,
+ struct move_bucket_in_flight *bucket_in_flight,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bch_io_opts io_opts,
+ struct data_update_opts data_opts)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct moving_io *io;
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned sectors = k.k->size, pages;
+ int ret = -ENOMEM;
+
+ if (ctxt->stats)
+ ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
+ trace_move_extent2(c, k);
+
+ bch2_data_update_opts_normalize(k, &data_opts);
+
+ if (!data_opts.rewrite_ptrs &&
+ !data_opts.extra_replicas) {
+ if (data_opts.kill_ptrs)
+ return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
+ return 0;
+ }
+
+ /*
+ * Before memory allocations & taking nocow locks in
+ * bch2_data_update_init():
+ */
+ bch2_trans_unlock(trans);
+
+ /* write path might have to decompress data: */
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
+
+ pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+ io = kzalloc(sizeof(struct moving_io) +
+ sizeof(struct bio_vec) * pages, GFP_KERNEL);
+ if (!io)
+ goto err;
+
+ INIT_LIST_HEAD(&io->io_list);
+ io->write.ctxt = ctxt;
+ io->read_sectors = k.k->size;
+ io->write_sectors = k.k->size;
+
+ bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
+ bio_set_prio(&io->write.op.wbio.bio,
+ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+ if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
+ GFP_KERNEL))
+ goto err_free;
+
+ io->rbio.c = c;
+ io->rbio.opts = io_opts;
+ bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
+ io->rbio.bio.bi_vcnt = pages;
+ bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+ io->rbio.bio.bi_iter.bi_size = sectors << 9;
+
+ io->rbio.bio.bi_opf = REQ_OP_READ;
+ io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
+ io->rbio.bio.bi_end_io = move_read_endio;
+
+ ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
+ io_opts, data_opts, iter->btree_id, k);
+ if (ret)
+ goto err_free_pages;
+
+ io->write.op.end_io = move_write_done;
+
+ if (ctxt->rate)
+ bch2_ratelimit_increment(ctxt->rate, k.k->size);
+
+ if (ctxt->stats) {
+ atomic64_inc(&ctxt->stats->keys_moved);
+ atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
+ }
+
+ if (bucket_in_flight) {
+ io->b = bucket_in_flight;
+ atomic_inc(&io->b->count);
+ }
+
+ this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
+ this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
+ trace_move_extent_read2(c, k);
+
+ mutex_lock(&ctxt->lock);
+ atomic_add(io->read_sectors, &ctxt->read_sectors);
+ atomic_inc(&ctxt->read_ios);
+
+ list_add_tail(&io->read_list, &ctxt->reads);
+ list_add_tail(&io->io_list, &ctxt->ios);
+ mutex_unlock(&ctxt->lock);
+
+ /*
+ * dropped by move_read_endio() - guards against use after free of
+ * ctxt when doing wakeup
+ */
+ closure_get(&ctxt->cl);
+ bch2_read_extent(trans, &io->rbio,
+ bkey_start_pos(k.k),
+ iter->btree_id, k, 0,
+ BCH_READ_NODECODE|
+ BCH_READ_LAST_FRAGMENT);
+ return 0;
+err_free_pages:
+ bio_free_pages(&io->write.op.wbio.bio);
+err_free:
+ kfree(io);
+err:
+ if (ret == -BCH_ERR_data_update_done)
+ return 0;
+
+ if (bch2_err_matches(ret, EROFS) ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ret;
+
+ this_cpu_inc(c->counters[BCH_COUNTER_move_extent_start_fail]);
+ if (trace_move_extent_start_fail_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, ": ");
+ prt_str(&buf, bch2_err_str(ret));
+ trace_move_extent_start_fail(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+ return ret;
+}
+
+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
+ struct per_snapshot_io_opts *io_opts,
+ struct bkey_s_c extent_k)
+{
+ struct bch_fs *c = trans->c;
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
+
+ if (io_opts->cur_inum != extent_k.k->p.inode) {
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ io_opts->d.nr = 0;
+
+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (k.k->p.offset != extent_k.k->p.inode)
+ break;
+
+ if (!bkey_is_inode(k.k))
+ continue;
+
+ struct bch_inode_unpacked inode;
+ BUG_ON(bch2_inode_unpack(k, &inode));
+
+ struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
+ bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
+
+ ret = darray_push(&io_opts->d, e);
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ io_opts->cur_inum = extent_k.k->p.inode;
+ }
+
+ ret = ret ?: trans_was_restarted(trans, restart_count);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (extent_k.k->p.snapshot) {
+ struct snapshot_io_opts_entry *i;
+ darray_for_each(io_opts->d, i)
+ if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
+ return &i->io_opts;
+ }
+
+ return &io_opts->fs_io_opts;
+}
+
+int bch2_move_get_io_opts_one(struct btree_trans *trans,
+ struct bch_io_opts *io_opts,
+ struct bkey_s_c extent_k)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ /* reflink btree? */
+ if (!extent_k.k->p.inode) {
+ *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
+ return 0;
+ }
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
+ BTREE_ITER_CACHED);
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ret;
+
+ if (!ret && bkey_is_inode(k.k)) {
+ struct bch_inode_unpacked inode;
+ bch2_inode_unpack(k, &inode);
+ bch2_inode_opts_get(io_opts, trans->c, &inode);
+ } else {
+ *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}
+
+int bch2_move_ratelimit(struct moving_context *ctxt)
+{
+ struct bch_fs *c = ctxt->trans->c;
+ bool is_kthread = current->flags & PF_KTHREAD;
+ u64 delay;
+
+ if (ctxt->wait_on_copygc && c->copygc_running) {
+ bch2_moving_ctxt_flush_all(ctxt);
+ wait_event_killable(c->copygc_running_wq,
+ !c->copygc_running ||
+ (is_kthread && kthread_should_stop()));
+ }
+
+ do {
+ delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
+
+ if (is_kthread && kthread_should_stop())
+ return 1;
+
+ if (delay)
+ move_ctxt_wait_event_timeout(ctxt,
+ freezing(current) ||
+ (is_kthread && kthread_should_stop()),
+ delay);
+
+ if (unlikely(freezing(current))) {
+ bch2_moving_ctxt_flush_all(ctxt);
+ try_to_freeze();
+ }
+ } while (delay);
+
+ /*
+ * XXX: these limits really ought to be per device, SSDs and hard drives
+ * will want different limits
+ */
+ move_ctxt_wait_event(ctxt,
+ atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
+ atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
+ atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
+ atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);
+
+ return 0;
+}
+
+static int bch2_move_data_btree(struct moving_context *ctxt,
+ struct bpos start,
+ struct bpos end,
+ move_pred_fn pred, void *arg,
+ enum btree_id btree_id)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct per_snapshot_io_opts snapshot_io_opts;
+ struct bch_io_opts *io_opts;
+ struct bkey_buf sk;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct data_update_opts data_opts;
+ int ret = 0, ret2;
+
+ per_snapshot_io_opts_init(&snapshot_io_opts, c);
+ bch2_bkey_buf_init(&sk);
+
+ if (ctxt->stats) {
+ ctxt->stats->data_type = BCH_DATA_user;
+ ctxt->stats->pos = BBPOS(btree_id, start);
+ }
+
+ bch2_trans_iter_init(trans, &iter, btree_id, start,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
+
+ if (ctxt->rate)
+ bch2_ratelimit_reset(ctxt->rate);
+
+ while (!bch2_move_ratelimit(ctxt)) {
+ bch2_trans_begin(trans);
+
+ k = bch2_btree_iter_peek(&iter);
+ if (!k.k)
+ break;
+
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+
+ if (bkey_ge(bkey_start_pos(k.k), end))
+ break;
+
+ if (ctxt->stats)
+ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
+
+ if (!bkey_extent_is_direct_data(k.k))
+ goto next_nondata;
+
+ io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
+ ret = PTR_ERR_OR_ZERO(io_opts);
+ if (ret)
+ continue;
+
+ memset(&data_opts, 0, sizeof(data_opts));
+ if (!pred(c, arg, k, io_opts, &data_opts))
+ goto next;
+
+ /*
+ * The iterator gets unlocked by __bch2_read_extent - need to
+ * save a copy of @k elsewhere:
+ */
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
+
+ ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
+ if (ret2) {
+ if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
+ continue;
+
+ if (ret2 == -ENOMEM) {
+ /* memory allocation failure, wait for some IO to finish */
+ bch2_move_ctxt_wait_for_io(ctxt);
+ continue;
+ }
+
+ /* XXX signal failure */
+ goto next;
+ }
+next:
+ if (ctxt->stats)
+ atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+next_nondata:
+ bch2_btree_iter_advance(&iter);
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_bkey_buf_exit(&sk, c);
+ per_snapshot_io_opts_exit(&snapshot_io_opts);
+
+ return ret;
+}
+
+int __bch2_move_data(struct moving_context *ctxt,
+ struct bbpos start,
+ struct bbpos end,
+ move_pred_fn pred, void *arg)
+{
+ struct bch_fs *c = ctxt->trans->c;
+ enum btree_id id;
+ int ret = 0;
+
+ for (id = start.btree;
+ id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
+ id++) {
+ ctxt->stats->pos = BBPOS(id, POS_MIN);
+
+ if (!btree_type_has_ptrs(id) ||
+ !bch2_btree_id_root(c, id)->b)
+ continue;
+
+ ret = bch2_move_data_btree(ctxt,
+ id == start.btree ? start.pos : POS_MIN,
+ id == end.btree ? end.pos : POS_MAX,
+ pred, arg, id);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int bch2_move_data(struct bch_fs *c,
+ struct bbpos start,
+ struct bbpos end,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc,
+ move_pred_fn pred, void *arg)
+{
+	struct moving_context ctxt;
+ int ret;
+
+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+ ret = __bch2_move_data(&ctxt, start, end, pred, arg);
+ bch2_moving_ctxt_exit(&ctxt);
+
+ return ret;
+}
+
+int __bch2_evacuate_bucket(struct moving_context *ctxt,
+ struct move_bucket_in_flight *bucket_in_flight,
+ struct bpos bucket, int gen,
+ struct data_update_opts _data_opts)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ bool is_kthread = current->flags & PF_KTHREAD;
+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+ struct btree_iter iter;
+ struct bkey_buf sk;
+ struct bch_backpointer bp;
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+ struct bkey_s_c k;
+ struct data_update_opts data_opts;
+ unsigned dirty_sectors, bucket_size;
+ u64 fragmentation;
+ struct bpos bp_pos = POS_MIN;
+ int ret = 0;
+
+ trace_bucket_evacuate(c, &bucket);
+
+ bch2_bkey_buf_init(&sk);
+
+ /*
+ * We're not run in a context that handles transaction restarts:
+ */
+ bch2_trans_begin(trans);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ bucket, BTREE_ITER_CACHED);
+ ret = lockrestart_do(trans,
+ bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret) {
+ bch_err_msg(c, ret, "looking up alloc key");
+ goto err;
+ }
+
+ a = bch2_alloc_to_v4(k, &a_convert);
+ dirty_sectors = a->dirty_sectors;
+ bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
+ fragmentation = a->fragmentation_lru;
+
+ ret = bch2_btree_write_buffer_flush(trans);
+ if (ret) {
+ bch_err_msg(c, ret, "flushing btree write buffer");
+ goto err;
+ }
+
+ while (!(ret = bch2_move_ratelimit(ctxt))) {
+ if (is_kthread && kthread_should_stop())
+ break;
+
+ bch2_trans_begin(trans);
+
+ ret = bch2_get_next_backpointer(trans, bucket, gen,
+ &bp_pos, &bp,
+ BTREE_ITER_CACHED);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+ if (bkey_eq(bp_pos, POS_MAX))
+ break;
+
+ if (!bp.level) {
+ const struct bch_extent_ptr *ptr;
+ unsigned i = 0;
+
+ k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+ if (!k.k)
+ goto next;
+
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
+
+ ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
+ if (ret) {
+ bch2_trans_iter_exit(trans, &iter);
+ continue;
+ }
+
+ data_opts = _data_opts;
+ data_opts.target = io_opts.background_target;
+ data_opts.rewrite_ptrs = 0;
+
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
+ if (ptr->dev == bucket.inode) {
+ data_opts.rewrite_ptrs |= 1U << i;
+ if (ptr->cached) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto next;
+ }
+ }
+ i++;
+ }
+
+ ret = bch2_move_extent(ctxt, bucket_in_flight,
+ &iter, k, io_opts, data_opts);
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret == -ENOMEM) {
+ /* memory allocation failure, wait for some IO to finish */
+ bch2_move_ctxt_wait_for_io(ctxt);
+ continue;
+ }
+ if (ret)
+ goto err;
+
+ if (ctxt->stats)
+ atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+ } else {
+ struct btree *b;
+
+ b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+ continue;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+ if (!b)
+ goto next;
+
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+
+ if (ctxt->rate)
+ bch2_ratelimit_increment(ctxt->rate,
+ c->opts.btree_node_size >> 9);
+ if (ctxt->stats) {
+ atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
+ atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
+ }
+ }
+next:
+ bp_pos = bpos_nosnap_successor(bp_pos);
+ }
+
+ trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
+err:
+ bch2_bkey_buf_exit(&sk, c);
+ return ret;
+}
+
+int bch2_evacuate_bucket(struct bch_fs *c,
+ struct bpos bucket, int gen,
+ struct data_update_opts data_opts,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc)
+{
+ struct moving_context ctxt;
+ int ret;
+
+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+ ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts);
+ bch2_moving_ctxt_exit(&ctxt);
+
+ return ret;
+}
+
+typedef bool (*move_btree_pred)(struct bch_fs *, void *,
+ struct btree *, struct bch_io_opts *,
+ struct data_update_opts *);
+
+static int bch2_move_btree(struct bch_fs *c,
+ enum btree_id start_btree_id, struct bpos start_pos,
+ enum btree_id end_btree_id, struct bpos end_pos,
+ move_btree_pred pred, void *arg,
+ struct bch_move_stats *stats)
+{
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+ struct moving_context ctxt;
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct btree *b;
+ enum btree_id id;
+ struct data_update_opts data_opts;
+ int ret = 0;
+
+ bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
+ writepoint_ptr(&c->btree_write_point),
+ true);
+ trans = ctxt.trans;
+
+ stats->data_type = BCH_DATA_btree;
+
+ for (id = start_btree_id;
+ id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
+ id++) {
+ stats->pos = BBPOS(id, POS_MIN);
+
+ if (!bch2_btree_id_root(c, id)->b)
+ continue;
+
+ bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
+ BTREE_ITER_PREFETCH);
+retry:
+ ret = 0;
+ while (bch2_trans_begin(trans),
+ (b = bch2_btree_iter_peek_node(&iter)) &&
+ !(ret = PTR_ERR_OR_ZERO(b))) {
+ if (kthread && kthread_should_stop())
+ break;
+
+ if ((cmp_int(id, end_btree_id) ?:
+ bpos_cmp(b->key.k.p, end_pos)) > 0)
+ break;
+
+ stats->pos = BBPOS(iter.btree_id, iter.pos);
+
+ if (!pred(c, arg, b, &io_opts, &data_opts))
+ goto next;
+
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+next:
+ bch2_btree_iter_next_node(&iter);
+ }
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (kthread && kthread_should_stop())
+ break;
+ }
+
+ bch_err_fn(c, ret);
+ bch2_moving_ctxt_exit(&ctxt);
+ bch2_btree_interior_updates_flush(c);
+
+ return ret;
+}
+
+static bool rereplicate_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ unsigned nr_good = bch2_bkey_durability(c, k);
+ unsigned replicas = bkey_is_btree_ptr(k.k)
+ ? c->opts.metadata_replicas
+ : io_opts->data_replicas;
+
+ if (!nr_good || nr_good >= replicas)
+ return false;
+
+ data_opts->target = 0;
+ data_opts->extra_replicas = replicas - nr_good;
+ data_opts->btree_insert_flags = 0;
+ return true;
+}
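+
+/*
+ * e.g. (illustrative): an extent with durability 1 when data_replicas is 3
+ * gets data_opts->extra_replicas = 2; extents already at or above the target
+ * durability are left alone.
+ */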
+
+static bool migrate_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+ struct bch_ioctl_data *op = arg;
+ unsigned i = 0;
+
+ data_opts->rewrite_ptrs = 0;
+ data_opts->target = 0;
+ data_opts->extra_replicas = 0;
+ data_opts->btree_insert_flags = 0;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (ptr->dev == op->migrate.dev)
+ data_opts->rewrite_ptrs |= 1U << i;
+ i++;
+ }
+
+ return data_opts->rewrite_ptrs != 0;
+}
+
+static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+}
+
+static bool migrate_btree_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+}
+
+static bool bformat_needs_redo(struct bkey_format *f)
+{
+ unsigned i;
+
+ for (i = 0; i < f->nr_fields; i++) {
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
+ u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+ if (f->bits_per_field[i] > unpacked_bits)
+ return true;
+
+ if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
+ return true;
+
+ if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
+ unpacked_mask) <
+ field_offset)
+ return true;
+ }
+
+ return false;
+}
+
+static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ if (b->version_ondisk != c->sb.version ||
+ btree_node_need_rewrite(b) ||
+ bformat_needs_redo(&b->format)) {
+ data_opts->target = 0;
+ data_opts->extra_replicas = 0;
+ data_opts->btree_insert_flags = 0;
+ return true;
+ }
+
+ return false;
+}
+
+int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
+{
+ int ret;
+
+ ret = bch2_move_btree(c,
+ 0, POS_MIN,
+ BTREE_ID_NR, SPOS_MAX,
+ rewrite_old_nodes_pred, c, stats);
+ if (!ret) {
+ mutex_lock(&c->sb_lock);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
+ c->disk_sb.sb->version_min = c->disk_sb.sb->version;
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_data_job(struct bch_fs *c,
+ struct bch_move_stats *stats,
+ struct bch_ioctl_data op)
+{
+ int ret = 0;
+
+ switch (op.op) {
+ case BCH_DATA_OP_REREPLICATE:
+ bch2_move_stats_init(stats, "rereplicate");
+ stats->data_type = BCH_DATA_journal;
+ ret = bch2_journal_flush_device_pins(&c->journal, -1);
+
+ ret = bch2_move_btree(c,
+ op.start_btree, op.start_pos,
+ op.end_btree, op.end_pos,
+ rereplicate_btree_pred, c, stats) ?: ret;
+ ret = bch2_replicas_gc2(c) ?: ret;
+
+ ret = bch2_move_data(c,
+ (struct bbpos) { op.start_btree, op.start_pos },
+ (struct bbpos) { op.end_btree, op.end_pos },
+ NULL,
+ stats,
+ writepoint_hashed((unsigned long) current),
+ true,
+ rereplicate_pred, c) ?: ret;
+ ret = bch2_replicas_gc2(c) ?: ret;
+
+ bch2_move_stats_exit(stats, c);
+ break;
+ case BCH_DATA_OP_MIGRATE:
+ if (op.migrate.dev >= c->sb.nr_devices)
+ return -EINVAL;
+
+ bch2_move_stats_init(stats, "migrate");
+ stats->data_type = BCH_DATA_journal;
+ ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
+
+ ret = bch2_move_btree(c,
+ op.start_btree, op.start_pos,
+ op.end_btree, op.end_pos,
+ migrate_btree_pred, &op, stats) ?: ret;
+ ret = bch2_replicas_gc2(c) ?: ret;
+
+ ret = bch2_move_data(c,
+ (struct bbpos) { op.start_btree, op.start_pos },
+ (struct bbpos) { op.end_btree, op.end_pos },
+ NULL,
+ stats,
+ writepoint_hashed((unsigned long) current),
+ true,
+ migrate_pred, &op) ?: ret;
+ ret = bch2_replicas_gc2(c) ?: ret;
+
+ bch2_move_stats_exit(stats, c);
+ break;
+ case BCH_DATA_OP_REWRITE_OLD_NODES:
+ bch2_move_stats_init(stats, "rewrite_old_nodes");
+ ret = bch2_scan_old_btree_nodes(c, stats);
+ bch2_move_stats_exit(stats, c);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
+{
+ prt_printf(out, "%s: data type=%s pos=",
+ stats->name,
+ bch2_data_types[stats->data_type]);
+ bch2_bbpos_to_text(out, stats->pos);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_str(out, "keys moved: ");
+ prt_u64(out, atomic64_read(&stats->keys_moved));
+ prt_newline(out);
+
+ prt_str(out, "keys raced: ");
+ prt_u64(out, atomic64_read(&stats->keys_raced));
+ prt_newline(out);
+
+ prt_str(out, "bytes seen: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
+ prt_newline(out);
+
+ prt_str(out, "bytes moved: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
+ prt_newline(out);
+
+ prt_str(out, "bytes raced: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+}
+
+static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
+{
+ struct moving_io *io;
+
+ bch2_move_stats_to_text(out, ctxt->stats);
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "reads: ios %u/%u sectors %u/%u",
+ atomic_read(&ctxt->read_ios),
+ c->opts.move_ios_in_flight,
+ atomic_read(&ctxt->read_sectors),
+ c->opts.move_bytes_in_flight >> 9);
+ prt_newline(out);
+
+ prt_printf(out, "writes: ios %u/%u sectors %u/%u",
+ atomic_read(&ctxt->write_ios),
+ c->opts.move_ios_in_flight,
+ atomic_read(&ctxt->write_sectors),
+ c->opts.move_bytes_in_flight >> 9);
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+
+ mutex_lock(&ctxt->lock);
+ list_for_each_entry(io, &ctxt->ios, io_list)
+ bch2_write_op_to_text(out, &io->write.op);
+ mutex_unlock(&ctxt->lock);
+
+ printbuf_indent_sub(out, 4);
+}
+
+void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct moving_context *ctxt;
+
+ mutex_lock(&c->moving_context_lock);
+ list_for_each_entry(ctxt, &c->moving_context_list, list)
+ bch2_moving_ctxt_to_text(out, c, ctxt);
+ mutex_unlock(&c->moving_context_lock);
+}
+
+void bch2_fs_move_init(struct bch_fs *c)
+{
+ INIT_LIST_HEAD(&c->moving_context_list);
+ mutex_init(&c->moving_context_lock);
+}
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
new file mode 100644
index 000000000000..0906aa2d1de2
--- /dev/null
+++ b/fs/bcachefs/move.h
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_MOVE_H
+#define _BCACHEFS_MOVE_H
+
+#include "bbpos.h"
+#include "bcachefs_ioctl.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "data_update.h"
+#include "move_types.h"
+
+struct bch_read_bio;
+
+struct moving_context {
+ struct btree_trans *trans;
+ struct list_head list;
+ void *fn;
+
+ struct bch_ratelimit *rate;
+ struct bch_move_stats *stats;
+ struct write_point_specifier wp;
+ bool wait_on_copygc;
+ bool write_error;
+
+ /* For waiting on outstanding reads and writes: */
+ struct closure cl;
+
+ struct mutex lock;
+ struct list_head reads;
+ struct list_head ios;
+
+ /* in flight sectors: */
+ atomic_t read_sectors;
+ atomic_t write_sectors;
+ atomic_t read_ios;
+ atomic_t write_ios;
+
+ wait_queue_head_t wait;
+};
+
+#define move_ctxt_wait_event_timeout(_ctxt, _cond, _timeout) \
+({ \
+ int _ret = 0; \
+ while (true) { \
+ bool cond_finished = false; \
+ bch2_moving_ctxt_do_pending_writes(_ctxt); \
+ \
+ if (_cond) \
+ break; \
+ bch2_trans_unlock_long((_ctxt)->trans); \
+ _ret = __wait_event_timeout((_ctxt)->wait, \
+ bch2_moving_ctxt_next_pending_write(_ctxt) || \
+ (cond_finished = (_cond)), _timeout); \
+		if (_ret || (cond_finished))					\
+ break; \
+ } \
+ _ret; \
+})
+
+#define move_ctxt_wait_event(_ctxt, _cond) \
+do { \
+ bool cond_finished = false; \
+ bch2_moving_ctxt_do_pending_writes(_ctxt); \
+ \
+ if (_cond) \
+ break; \
+ bch2_trans_unlock_long((_ctxt)->trans); \
+ __wait_event((_ctxt)->wait, \
+ bch2_moving_ctxt_next_pending_write(_ctxt) || \
+ (cond_finished = (_cond))); \
+ if (cond_finished) \
+ break; \
+} while (1)
+
+typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
+ struct bch_io_opts *, struct data_update_opts *);
+
+void bch2_moving_ctxt_exit(struct moving_context *);
+void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
+ struct bch_ratelimit *, struct bch_move_stats *,
+ struct write_point_specifier, bool);
+struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *);
+void bch2_move_ctxt_wait_for_io(struct moving_context *);
+int bch2_move_ratelimit(struct moving_context *);
+
+/* Inodes in different snapshots may have different IO options: */
+struct snapshot_io_opts_entry {
+ u32 snapshot;
+ struct bch_io_opts io_opts;
+};
+
+struct per_snapshot_io_opts {
+ u64 cur_inum;
+ struct bch_io_opts fs_io_opts;
+ DARRAY(struct snapshot_io_opts_entry) d;
+};
+
+static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c)
+{
+ memset(io_opts, 0, sizeof(*io_opts));
+ io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts);
+}
+
+static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts)
+{
+ darray_exit(&io_opts->d);
+}
+
+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
+ struct per_snapshot_io_opts *, struct bkey_s_c);
+int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c);
+
+int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
+
+int bch2_move_extent(struct moving_context *,
+ struct move_bucket_in_flight *,
+ struct btree_iter *,
+ struct bkey_s_c,
+ struct bch_io_opts,
+ struct data_update_opts);
+
+int __bch2_move_data(struct moving_context *,
+ struct bbpos,
+ struct bbpos,
+ move_pred_fn, void *);
+int bch2_move_data(struct bch_fs *,
+ struct bbpos start,
+ struct bbpos end,
+ struct bch_ratelimit *,
+ struct bch_move_stats *,
+ struct write_point_specifier,
+ bool,
+ move_pred_fn, void *);
+
+int __bch2_evacuate_bucket(struct moving_context *,
+ struct move_bucket_in_flight *,
+ struct bpos, int,
+ struct data_update_opts);
+int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int,
+ struct data_update_opts,
+ struct bch_ratelimit *,
+ struct bch_move_stats *,
+ struct write_point_specifier,
+ bool);
+int bch2_data_job(struct bch_fs *,
+ struct bch_move_stats *,
+ struct bch_ioctl_data);
+
+void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
+void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
+void bch2_move_stats_init(struct bch_move_stats *, char *);
+
+void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_fs_move_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_MOVE_H */
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
new file mode 100644
index 000000000000..e22841ef31e4
--- /dev/null
+++ b/fs/bcachefs/move_types.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_MOVE_TYPES_H
+#define _BCACHEFS_MOVE_TYPES_H
+
+#include "bbpos_types.h"
+
+struct bch_move_stats {
+ enum bch_data_type data_type;
+ struct bbpos pos;
+ char name[32];
+
+ atomic64_t keys_moved;
+ atomic64_t keys_raced;
+ atomic64_t sectors_seen;
+ atomic64_t sectors_moved;
+ atomic64_t sectors_raced;
+};
+
+struct move_bucket_key {
+ struct bpos bucket;
+ u8 gen;
+};
+
+struct move_bucket {
+ struct move_bucket_key k;
+ unsigned sectors;
+};
+
+struct move_bucket_in_flight {
+ struct move_bucket_in_flight *next;
+ struct rhash_head hash;
+ struct move_bucket bucket;
+ atomic_t count;
+};
+
+#endif /* _BCACHEFS_MOVE_TYPES_H */
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
new file mode 100644
index 000000000000..a84e79f79e5e
--- /dev/null
+++ b/fs/bcachefs/movinggc.c
@@ -0,0 +1,431 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Moving/copying garbage collector
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "clock.h"
+#include "errcode.h"
+#include "error.h"
+#include "lru.h"
+#include "move.h"
+#include "movinggc.h"
+#include "trace.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <linux/sched/task.h>
+#include <linux/wait.h>
+
+struct buckets_in_flight {
+ struct rhashtable table;
+ struct move_bucket_in_flight *first;
+ struct move_bucket_in_flight *last;
+ size_t nr;
+ size_t sectors;
+};
+
+static const struct rhashtable_params bch_move_bucket_params = {
+ .head_offset = offsetof(struct move_bucket_in_flight, hash),
+ .key_offset = offsetof(struct move_bucket_in_flight, bucket.k),
+ .key_len = sizeof(struct move_bucket_key),
+};
+
+static struct move_bucket_in_flight *
+move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b)
+{
+ struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL);
+ int ret;
+
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+
+ new->bucket = b;
+
+ ret = rhashtable_lookup_insert_fast(&list->table, &new->hash,
+ bch_move_bucket_params);
+ if (ret) {
+ kfree(new);
+ return ERR_PTR(ret);
+ }
+
+ if (!list->first)
+ list->first = new;
+ else
+ list->last->next = new;
+
+ list->last = new;
+ list->nr++;
+ list->sectors += b.sectors;
+ return new;
+}
+
+static int bch2_bucket_is_movable(struct btree_trans *trans,
+ struct move_bucket *b, u64 time)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_alloc_v4 _a;
+ const struct bch_alloc_v4 *a;
+ int ret;
+
+ if (bch2_bucket_is_open(trans->c,
+ b->k.bucket.inode,
+ b->k.bucket.offset))
+ return 0;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
+ b->k.bucket, BTREE_ITER_CACHED);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ a = bch2_alloc_to_v4(k, &_a);
+ b->k.gen = a->gen;
+ b->sectors = a->dirty_sectors;
+
+ ret = data_type_movable(a->data_type) &&
+ a->fragmentation_lru &&
+ a->fragmentation_lru <= time;
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static void move_buckets_wait(struct moving_context *ctxt,
+ struct buckets_in_flight *list,
+ bool flush)
+{
+ struct move_bucket_in_flight *i;
+ int ret;
+
+ while ((i = list->first)) {
+ if (flush)
+ move_ctxt_wait_event(ctxt, !atomic_read(&i->count));
+
+ if (atomic_read(&i->count))
+ break;
+
+ list->first = i->next;
+ if (!list->first)
+ list->last = NULL;
+
+ list->nr--;
+ list->sectors -= i->bucket.sectors;
+
+ ret = rhashtable_remove_fast(&list->table, &i->hash,
+ bch_move_bucket_params);
+ BUG_ON(ret);
+ kfree(i);
+ }
+
+ bch2_trans_unlock_long(ctxt->trans);
+}
+
+static bool bucket_in_flight(struct buckets_in_flight *list,
+ struct move_bucket_key k)
+{
+ return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params);
+}
+
+typedef DARRAY(struct move_bucket) move_buckets;
+
+static int bch2_copygc_get_buckets(struct moving_context *ctxt,
+ struct buckets_in_flight *buckets_in_flight,
+ move_buckets *buckets)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
+ size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
+ int ret;
+
+ move_buckets_wait(ctxt, buckets_in_flight, false);
+
+ ret = bch2_btree_write_buffer_flush(trans);
+ if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
+ __func__, bch2_err_str(ret)))
+ return ret;
+
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
+ lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0),
+ lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
+ 0, k, ({
+ struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
+ int ret2 = 0;
+
+ saw++;
+
+ if (!bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p)))
+ not_movable++;
+ else if (bucket_in_flight(buckets_in_flight, b.k))
+ in_flight++;
+ else {
+ ret2 = darray_push(buckets, b) ?: buckets->nr >= nr_to_get;
+ if (ret2 >= 0)
+ sectors += b.sectors;
+ }
+ ret2;
+ }));
+
+ pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
+ buckets_in_flight->nr, buckets_in_flight->sectors,
+ saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret);
+
+ return ret < 0 ? ret : 0;
+}
+
+noinline
+static int bch2_copygc(struct moving_context *ctxt,
+ struct buckets_in_flight *buckets_in_flight,
+ bool *did_work)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct data_update_opts data_opts = {
+ .btree_insert_flags = BCH_WATERMARK_copygc,
+ };
+ move_buckets buckets = { 0 };
+ struct move_bucket_in_flight *f;
+ struct move_bucket *i;
+ u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
+ int ret = 0;
+
+ ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets);
+ if (ret)
+ goto err;
+
+ darray_for_each(buckets, i) {
+ if (kthread_should_stop() || freezing(current))
+ break;
+
+ f = move_bucket_in_flight_add(buckets_in_flight, *i);
+ ret = PTR_ERR_OR_ZERO(f);
+ if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */
+ ret = 0;
+ continue;
+ }
+ if (ret == -ENOMEM) { /* flush IO, continue later */
+ ret = 0;
+ break;
+ }
+
+ ret = __bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
+ f->bucket.k.gen, data_opts);
+ if (ret)
+ goto err;
+
+ *did_work = true;
+ }
+err:
+ darray_exit(&buckets);
+
+ /* no entries in LRU btree found, or got to end: */
+ if (bch2_err_matches(ret, ENOENT))
+ ret = 0;
+
+ if (ret < 0 && !bch2_err_matches(ret, EROFS))
+ bch_err_msg(c, ret, "from bch2_move_data()");
+
+ moved = atomic64_read(&ctxt->stats->sectors_moved) - moved;
+ trace_and_count(c, copygc, c, moved, 0, 0, 0);
+ return ret;
+}
+
+/*
+ * Copygc runs when the amount of fragmented data is above some arbitrary
+ * threshold:
+ *
+ * The threshold at the limit - when the device is full - is the amount of space
+ * we reserved in bch2_recalc_capacity: if more space than that were stranded
+ * due to fragmentation, we could no longer store everything we have promised
+ * to store.
+ *
+ * But we don't want to be running copygc unnecessarily when the device still
+ * has plenty of free space - rather, we want copygc to smoothly run every so
+ * often and continually reduce the amount of fragmented space as the device
+ * fills up. So, we increase the threshold by half the current free space.
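+ *
+ * Illustrative example (hypothetical numbers): on a device with 1024
+ * available buckets of 2048 sectors, fragmented_allowed is
+ * (1024 * 2048) >> 1 = 1048576 sectors (512 MiB). If 204800 sectors (100 MiB)
+ * of movable data are currently fragmented, copygc waits until roughly
+ * 1048576 - 204800 = 843776 more sectors (~412 MiB) have been written before
+ * running again.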
+ */
+unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned dev_idx;
+ s64 wait = S64_MAX, fragmented_allowed, fragmented;
+ unsigned i;
+
+ for_each_rw_member(ca, c, dev_idx) {
+ struct bch_dev_usage usage = bch2_dev_usage_read(ca);
+
+ fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
+ ca->mi.bucket_size) >> 1);
+ fragmented = 0;
+
+ for (i = 0; i < BCH_DATA_NR; i++)
+ if (data_type_movable(i))
+ fragmented += usage.d[i].fragmented;
+
+ wait = min(wait, max(0LL, fragmented_allowed - fragmented));
+ }
+
+ return wait;
+}
+
+void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ prt_printf(out, "Currently waiting for: ");
+ prt_human_readable_u64(out, max(0LL, c->copygc_wait -
+ atomic64_read(&c->io_clock[WRITE].now)) << 9);
+ prt_newline(out);
+
+ prt_printf(out, "Currently waiting since: ");
+ prt_human_readable_u64(out, max(0LL,
+ atomic64_read(&c->io_clock[WRITE].now) -
+ c->copygc_wait_at) << 9);
+ prt_newline(out);
+
+ prt_printf(out, "Currently calculated wait: ");
+ prt_human_readable_u64(out, bch2_copygc_wait_amount(c));
+ prt_newline(out);
+}
+
+static int bch2_copygc_thread(void *arg)
+{
+ struct bch_fs *c = arg;
+ struct moving_context ctxt;
+ struct bch_move_stats move_stats;
+ struct io_clock *clock = &c->io_clock[WRITE];
+ struct buckets_in_flight *buckets;
+ u64 last, wait;
+ int ret = 0;
+
+ buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL);
+ if (!buckets)
+ return -ENOMEM;
+ ret = rhashtable_init(&buckets->table, &bch_move_bucket_params);
+ if (ret) {
+ kfree(buckets);
+ bch_err_msg(c, ret, "allocating copygc buckets in flight");
+ return ret;
+ }
+
+ set_freezable();
+
+ bch2_move_stats_init(&move_stats, "copygc");
+ bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
+ writepoint_ptr(&c->copygc_write_point),
+ false);
+
+ while (!ret && !kthread_should_stop()) {
+ bool did_work = false;
+
+ bch2_trans_unlock_long(ctxt.trans);
+ cond_resched();
+
+ if (!c->copy_gc_enabled) {
+ move_buckets_wait(&ctxt, buckets, true);
+ kthread_wait_freezable(c->copy_gc_enabled);
+ }
+
+ if (unlikely(freezing(current))) {
+ move_buckets_wait(&ctxt, buckets, true);
+ __refrigerator(false);
+ continue;
+ }
+
+ last = atomic64_read(&clock->now);
+ wait = bch2_copygc_wait_amount(c);
+
+ if (wait > clock->max_slop) {
+ c->copygc_wait_at = last;
+ c->copygc_wait = last + wait;
+ move_buckets_wait(&ctxt, buckets, true);
+ trace_and_count(c, copygc_wait, c, wait, last + wait);
+ bch2_kthread_io_clock_wait(clock, last + wait,
+ MAX_SCHEDULE_TIMEOUT);
+ continue;
+ }
+
+ c->copygc_wait = 0;
+
+ c->copygc_running = true;
+ ret = bch2_copygc(&ctxt, buckets, &did_work);
+ c->copygc_running = false;
+
+ wake_up(&c->copygc_running_wq);
+
+ if (!wait && !did_work) {
+ u64 min_member_capacity = bch2_min_rw_member_capacity(c);
+
+ if (min_member_capacity == U64_MAX)
+ min_member_capacity = 128 * 2048;
+
+ bch2_trans_unlock_long(ctxt.trans);
+ bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
+ MAX_SCHEDULE_TIMEOUT);
+ }
+ }
+
+ move_buckets_wait(&ctxt, buckets, true);
+
+ rhashtable_destroy(&buckets->table);
+ kfree(buckets);
+ bch2_moving_ctxt_exit(&ctxt);
+ bch2_move_stats_exit(&move_stats, c);
+
+ return 0;
+}
+
+void bch2_copygc_stop(struct bch_fs *c)
+{
+ if (c->copygc_thread) {
+ kthread_stop(c->copygc_thread);
+ put_task_struct(c->copygc_thread);
+ }
+ c->copygc_thread = NULL;
+}
+
+int bch2_copygc_start(struct bch_fs *c)
+{
+ struct task_struct *t;
+ int ret;
+
+ if (c->copygc_thread)
+ return 0;
+
+ if (c->opts.nochanges)
+ return 0;
+
+ if (bch2_fs_init_fault("copygc_start"))
+ return -ENOMEM;
+
+ t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
+ ret = PTR_ERR_OR_ZERO(t);
+ if (ret) {
+ bch_err_msg(c, ret, "creating copygc thread");
+ return ret;
+ }
+
+ get_task_struct(t);
+
+ c->copygc_thread = t;
+ wake_up_process(c->copygc_thread);
+
+ return 0;
+}
+
+void bch2_fs_copygc_init(struct bch_fs *c)
+{
+ init_waitqueue_head(&c->copygc_running_wq);
+ c->copygc_running = false;
+}
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
new file mode 100644
index 000000000000..ea181fef5bc9
--- /dev/null
+++ b/fs/bcachefs/movinggc.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_MOVINGGC_H
+#define _BCACHEFS_MOVINGGC_H
+
+unsigned long bch2_copygc_wait_amount(struct bch_fs *);
+void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_copygc_stop(struct bch_fs *);
+int bch2_copygc_start(struct bch_fs *);
+void bch2_fs_copygc_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_MOVINGGC_H */
diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c
new file mode 100644
index 000000000000..3c21981a4a1c
--- /dev/null
+++ b/fs/bcachefs/nocow_locking.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "nocow_locking.h"
+#include "util.h"
+
+#include <linux/closure.h>
+
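+/*
+ * Each nocow_lock_bucket tracks up to four buckets; l->l[i] is a signed
+ * reference count for the bucket in l->b[i]: holders that pass
+ * BUCKET_NOCOW_LOCK_UPDATE take it with +1, all other holders with -1.
+ * Holders of the same sign share the lock; opposite signs exclude each
+ * other (see __bch2_bucket_nocow_trylock()).
+ */
+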
+bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket)
+{
+ u64 dev_bucket = bucket_to_u64(bucket);
+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(l->b); i++)
+ if (l->b[i] == dev_bucket && atomic_read(&l->l[i]))
+ return true;
+ return false;
+}
+
+#define sign(v) ((v) < 0 ? -1 : (v) > 0 ? 1 : 0)
+
+void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags)
+{
+ u64 dev_bucket = bucket_to_u64(bucket);
+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+ int lock_val = flags ? 1 : -1;
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(l->b); i++)
+ if (l->b[i] == dev_bucket) {
+ int v = atomic_sub_return(lock_val, &l->l[i]);
+
+ BUG_ON(v && sign(v) != lock_val);
+ if (!v)
+ closure_wake_up(&l->wait);
+ return;
+ }
+
+ BUG();
+}
+
+bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l,
+ u64 dev_bucket, int flags)
+{
+ int v, lock_val = flags ? 1 : -1;
+ unsigned i;
+
+ spin_lock(&l->lock);
+
+ for (i = 0; i < ARRAY_SIZE(l->b); i++)
+ if (l->b[i] == dev_bucket)
+ goto got_entry;
+
+ for (i = 0; i < ARRAY_SIZE(l->b); i++)
+ if (!atomic_read(&l->l[i])) {
+ l->b[i] = dev_bucket;
+ goto take_lock;
+ }
+fail:
+ spin_unlock(&l->lock);
+ return false;
+got_entry:
+ v = atomic_read(&l->l[i]);
+ if (lock_val > 0 ? v < 0 : v > 0)
+ goto fail;
+take_lock:
+ v = atomic_read(&l->l[i]);
+ /* Overflow? */
+ if (v && sign(v + lock_val) != sign(v))
+ goto fail;
+
+ atomic_add(lock_val, &l->l[i]);
+ spin_unlock(&l->lock);
+ return true;
+}
+
+void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
+ struct nocow_lock_bucket *l,
+ u64 dev_bucket, int flags)
+{
+ if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) {
+ struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks);
+ u64 start_time = local_clock();
+
+ __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
+ bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
+ }
+}
+
+void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t)
+{
+ unsigned i, nr_zero = 0;
+ struct nocow_lock_bucket *l;
+
+ for (l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) {
+ unsigned v = 0;
+
+ for (i = 0; i < ARRAY_SIZE(l->l); i++)
+ v |= atomic_read(&l->l[i]);
+
+ if (!v) {
+ nr_zero++;
+ continue;
+ }
+
+ if (nr_zero)
+ prt_printf(out, "(%u empty entries)\n", nr_zero);
+ nr_zero = 0;
+
+ for (i = 0; i < ARRAY_SIZE(l->l); i++) {
+ int v = atomic_read(&l->l[i]);
+ if (v) {
+ bch2_bpos_to_text(out, u64_to_bucket(l->b[i]));
+ prt_printf(out, ": %s %u ", v < 0 ? "copy" : "update", abs(v));
+ }
+ }
+ prt_newline(out);
+ }
+
+ if (nr_zero)
+ prt_printf(out, "(%u empty entries)\n", nr_zero);
+}
+
+void bch2_fs_nocow_locking_exit(struct bch_fs *c)
+{
+ struct bucket_nocow_lock_table *t = &c->nocow_locks;
+
+ for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
+ for (unsigned j = 0; j < ARRAY_SIZE(l->l); j++)
+ BUG_ON(atomic_read(&l->l[j]));
+}
+
+int bch2_fs_nocow_locking_init(struct bch_fs *c)
+{
+ struct bucket_nocow_lock_table *t = &c->nocow_locks;
+
+ for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
+ spin_lock_init(&l->lock);
+
+ return 0;
+}
diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h
new file mode 100644
index 000000000000..f9d6a426a960
--- /dev/null
+++ b/fs/bcachefs/nocow_locking.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_NOCOW_LOCKING_H
+#define _BCACHEFS_NOCOW_LOCKING_H
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "nocow_locking_types.h"
+
+#include <linux/hash.h>
+
+static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lock_table *t,
+ u64 dev_bucket)
+{
+ unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS);
+
+ return t->l + (h & (BUCKET_NOCOW_LOCKS - 1));
+}
+
+#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0)
+
+bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos);
+void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int);
+bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int);
+void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *,
+ struct nocow_lock_bucket *, u64, int);
+
+static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
+ struct bpos bucket, int flags)
+{
+ u64 dev_bucket = bucket_to_u64(bucket);
+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+
+ __bch2_bucket_nocow_lock(t, l, dev_bucket, flags);
+}
+
+static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t,
+ struct bpos bucket, int flags)
+{
+ u64 dev_bucket = bucket_to_u64(bucket);
+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
+
+ return __bch2_bucket_nocow_trylock(l, dev_bucket, flags);
+}
+
+void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *);
+
+void bch2_fs_nocow_locking_exit(struct bch_fs *);
+int bch2_fs_nocow_locking_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_NOCOW_LOCKING_H */
diff --git a/fs/bcachefs/nocow_locking_types.h b/fs/bcachefs/nocow_locking_types.h
new file mode 100644
index 000000000000..bd12bf677924
--- /dev/null
+++ b/fs/bcachefs/nocow_locking_types.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_NOCOW_LOCKING_TYPES_H
+#define _BCACHEFS_NOCOW_LOCKING_TYPES_H
+
+#define BUCKET_NOCOW_LOCKS_BITS 10
+#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS)
+
+struct nocow_lock_bucket {
+ struct closure_waitlist wait;
+ spinlock_t lock;
+ u64 b[4];
+ atomic_t l[4];
+} __aligned(SMP_CACHE_BYTES);
+
+struct bucket_nocow_lock_table {
+ struct nocow_lock_bucket l[BUCKET_NOCOW_LOCKS];
+};
+
+#endif /* _BCACHEFS_NOCOW_LOCKING_TYPES_H */
+
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
new file mode 100644
index 000000000000..8dd4046cca41
--- /dev/null
+++ b/fs/bcachefs/opts.c
@@ -0,0 +1,602 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/kernel.h>
+
+#include "bcachefs.h"
+#include "compress.h"
+#include "disk_groups.h"
+#include "error.h"
+#include "opts.h"
+#include "super-io.h"
+#include "util.h"
+
+#define x(t, n, ...) [n] = #t,
+
+const char * const bch2_error_actions[] = {
+ BCH_ERROR_ACTIONS()
+ NULL
+};
+
+const char * const bch2_fsck_fix_opts[] = {
+ BCH_FIX_ERRORS_OPTS()
+ NULL
+};
+
+const char * const bch2_version_upgrade_opts[] = {
+ BCH_VERSION_UPGRADE_OPTS()
+ NULL
+};
+
+const char * const bch2_sb_features[] = {
+ BCH_SB_FEATURES()
+ NULL
+};
+
+const char * const bch2_sb_compat[] = {
+ BCH_SB_COMPAT()
+ NULL
+};
+
+const char * const __bch2_btree_ids[] = {
+ BCH_BTREE_IDS()
+ NULL
+};
+
+const char * const bch2_csum_types[] = {
+ BCH_CSUM_TYPES()
+ NULL
+};
+
+const char * const bch2_csum_opts[] = {
+ BCH_CSUM_OPTS()
+ NULL
+};
+
+const char * const bch2_compression_types[] = {
+ BCH_COMPRESSION_TYPES()
+ NULL
+};
+
+const char * const bch2_compression_opts[] = {
+ BCH_COMPRESSION_OPTS()
+ NULL
+};
+
+const char * const bch2_str_hash_types[] = {
+ BCH_STR_HASH_TYPES()
+ NULL
+};
+
+const char * const bch2_str_hash_opts[] = {
+ BCH_STR_HASH_OPTS()
+ NULL
+};
+
+const char * const bch2_data_types[] = {
+ BCH_DATA_TYPES()
+ NULL
+};
+
+const char * const bch2_member_states[] = {
+ BCH_MEMBER_STATES()
+ NULL
+};
+
+const char * const bch2_jset_entry_types[] = {
+ BCH_JSET_ENTRY_TYPES()
+ NULL
+};
+
+const char * const bch2_fs_usage_types[] = {
+ BCH_FS_USAGE_TYPES()
+ NULL
+};
+
+#undef x
+
+static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
+ struct printbuf *err)
+{
+ if (!val) {
+ *res = FSCK_FIX_yes;
+ } else {
+ int ret = match_string(bch2_fsck_fix_opts, -1, val);
+
+ if (ret < 0 && err)
+ prt_str(err, "fix_errors: invalid selection");
+ if (ret < 0)
+ return ret;
+ *res = ret;
+ }
+
+ return 0;
+}
+
+static void bch2_opt_fix_errors_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_sb *sb,
+ u64 v)
+{
+ prt_str(out, bch2_fsck_fix_opts[v]);
+}
+
+#define bch2_opt_fix_errors (struct bch_opt_fn) { \
+ .parse = bch2_opt_fix_errors_parse, \
+ .to_text = bch2_opt_fix_errors_to_text, \
+}
+
+const char * const bch2_d_types[BCH_DT_MAX] = {
+ [DT_UNKNOWN] = "unknown",
+ [DT_FIFO] = "fifo",
+ [DT_CHR] = "chr",
+ [DT_DIR] = "dir",
+ [DT_BLK] = "blk",
+ [DT_REG] = "reg",
+ [DT_LNK] = "lnk",
+ [DT_SOCK] = "sock",
+ [DT_WHT] = "whiteout",
+ [DT_SUBVOL] = "subvol",
+};
+
+u64 BCH2_NO_SB_OPT(const struct bch_sb *sb)
+{
+ BUG();
+}
+
+void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v)
+{
+ BUG();
+}
+
+void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
+{
+#define x(_name, ...) \
+ if (opt_defined(src, _name)) \
+ opt_set(*dst, _name, src._name);
+
+ BCH_OPTS()
+#undef x
+}
+
+bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
+{
+ switch (id) {
+#define x(_name, ...) \
+ case Opt_##_name: \
+ return opt_defined(*opts, _name);
+ BCH_OPTS()
+#undef x
+ default:
+ BUG();
+ }
+}
+
+u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
+{
+ switch (id) {
+#define x(_name, ...) \
+ case Opt_##_name: \
+ return opts->_name;
+ BCH_OPTS()
+#undef x
+ default:
+ BUG();
+ }
+}
+
+void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
+{
+ switch (id) {
+#define x(_name, ...) \
+ case Opt_##_name: \
+ opt_set(*opts, _name, v); \
+ break;
+ BCH_OPTS()
+#undef x
+ default:
+ BUG();
+ }
+}
+
+const struct bch_option bch2_opt_table[] = {
+#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2
+#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \
+ .min = _min, .max = _max
+#define OPT_STR(_choices) .type = BCH_OPT_STR, \
+ .min = 0, .max = ARRAY_SIZE(_choices), \
+ .choices = _choices
+#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn
+
+#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \
+ [Opt_##_name] = { \
+ .attr = { \
+ .name = #_name, \
+ .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \
+ }, \
+ .flags = _flags, \
+ .hint = _hint, \
+ .help = _help, \
+ .get_sb = _sb_opt, \
+ .set_sb = SET_##_sb_opt, \
+ _type \
+ },
+
+ BCH_OPTS()
+#undef x
+};
+
+int bch2_opt_lookup(const char *name)
+{
+ const struct bch_option *i;
+
+ for (i = bch2_opt_table;
+ i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table);
+ i++)
+ if (!strcmp(name, i->attr.name))
+ return i - bch2_opt_table;
+
+ return -1;
+}
+
+struct synonym {
+ const char *s1, *s2;
+};
+
+static const struct synonym bch_opt_synonyms[] = {
+ { "quota", "usrquota" },
+};
+
+static int bch2_mount_opt_lookup(const char *name)
+{
+ const struct synonym *i;
+
+ for (i = bch_opt_synonyms;
+ i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms);
+ i++)
+ if (!strcmp(name, i->s1))
+ name = i->s2;
+
+ return bch2_opt_lookup(name);
+}
+
+int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
+{
+ if (v < opt->min) {
+ if (err)
+ prt_printf(err, "%s: too small (min %llu)",
+ opt->attr.name, opt->min);
+ return -BCH_ERR_ERANGE_option_too_small;
+ }
+
+ if (opt->max && v >= opt->max) {
+ if (err)
+ prt_printf(err, "%s: too big (max %llu)",
+ opt->attr.name, opt->max);
+ return -BCH_ERR_ERANGE_option_too_big;
+ }
+
+ if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) {
+ if (err)
+ prt_printf(err, "%s: not a multiple of 512",
+ opt->attr.name);
+ return -EINVAL;
+ }
+
+ if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) {
+ if (err)
+ prt_printf(err, "%s: must be a power of two",
+ opt->attr.name);
+ return -EINVAL;
+ }
+
+ if (opt->fn.validate)
+ return opt->fn.validate(v, err);
+
+ return 0;
+}
+
+int bch2_opt_parse(struct bch_fs *c,
+ const struct bch_option *opt,
+ const char *val, u64 *res,
+ struct printbuf *err)
+{
+ ssize_t ret;
+
+ switch (opt->type) {
+ case BCH_OPT_BOOL:
+ if (val) {
+ ret = kstrtou64(val, 10, res);
+ } else {
+ ret = 0;
+ *res = 1;
+ }
+
+ if (ret < 0 || (*res != 0 && *res != 1)) {
+ if (err)
+ prt_printf(err, "%s: must be bool", opt->attr.name);
+ return ret < 0 ? ret : -EINVAL;
+ }
+ break;
+ case BCH_OPT_UINT:
+ if (!val) {
+ if (err)
+ prt_printf(err, "%s: required value",
+ opt->attr.name);
+ return -EINVAL;
+ }
+
+ ret = opt->flags & OPT_HUMAN_READABLE
+ ? bch2_strtou64_h(val, res)
+ : kstrtou64(val, 10, res);
+ if (ret < 0) {
+ if (err)
+ prt_printf(err, "%s: must be a number",
+ opt->attr.name);
+ return ret;
+ }
+ break;
+ case BCH_OPT_STR:
+ if (!val) {
+ if (err)
+ prt_printf(err, "%s: required value",
+ opt->attr.name);
+ return -EINVAL;
+ }
+
+ ret = match_string(opt->choices, -1, val);
+ if (ret < 0) {
+ if (err)
+ prt_printf(err, "%s: invalid selection",
+ opt->attr.name);
+ return ret;
+ }
+
+ *res = ret;
+ break;
+ case BCH_OPT_FN:
+ ret = opt->fn.parse(c, val, res, err);
+ if (ret < 0) {
+ if (err)
+ prt_printf(err, "%s: parse error",
+ opt->attr.name);
+ return ret;
+ }
+ }
+
+ return bch2_opt_validate(opt, *res, err);
+}
+
+void bch2_opt_to_text(struct printbuf *out,
+ struct bch_fs *c, struct bch_sb *sb,
+ const struct bch_option *opt, u64 v,
+ unsigned flags)
+{
+ if (flags & OPT_SHOW_MOUNT_STYLE) {
+ if (opt->type == BCH_OPT_BOOL) {
+ prt_printf(out, "%s%s",
+ v ? "" : "no",
+ opt->attr.name);
+ return;
+ }
+
+ prt_printf(out, "%s=", opt->attr.name);
+ }
+
+ switch (opt->type) {
+ case BCH_OPT_BOOL:
+ case BCH_OPT_UINT:
+ if (opt->flags & OPT_HUMAN_READABLE)
+ prt_human_readable_u64(out, v);
+ else
+ prt_printf(out, "%lli", v);
+ break;
+ case BCH_OPT_STR:
+ if (flags & OPT_SHOW_FULL_LIST)
+ prt_string_option(out, opt->choices, v);
+ else
+ prt_str(out, opt->choices[v]);
+ break;
+ case BCH_OPT_FN:
+ opt->fn.to_text(out, c, sb, v);
+ break;
+ default:
+ BUG();
+ }
+}
+
+int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
+{
+ int ret = 0;
+
+ switch (id) {
+ case Opt_compression:
+ case Opt_background_compression:
+ ret = bch2_check_set_has_compressed_data(c, v);
+ break;
+ case Opt_erasure_code:
+ if (v)
+ bch2_check_set_feature(c, BCH_FEATURE_ec);
+ break;
+ }
+
+ return ret;
+}
+
+int bch2_opts_check_may_set(struct bch_fs *c)
+{
+ unsigned i;
+ int ret;
+
+ for (i = 0; i < bch2_opts_nr; i++) {
+ ret = bch2_opt_check_may_set(c, i,
+ bch2_opt_get_by_id(&c->opts, i));
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
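+/*
+ * Example (illustrative, assuming POSIX ACL support is built in): the mount
+ * options string "degraded,noacl,metadata_replicas=2" sets degraded to 1,
+ * acl to 0 (a leading "no" negates a boolean option) and metadata_replicas
+ * to 2; unrecognized option names are silently ignored.
+ */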
+int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
+ char *options)
+{
+ char *copied_opts, *copied_opts_start;
+ char *opt, *name, *val;
+ int ret, id;
+ struct printbuf err = PRINTBUF;
+ u64 v;
+
+ if (!options)
+ return 0;
+
+ /*
+ * sys_fsconfig() is now occasionally providing us with option lists
+ * starting with a comma - weird.
+ */
+ if (*options == ',')
+ options++;
+
+ copied_opts = kstrdup(options, GFP_KERNEL);
+ if (!copied_opts)
+ return -1;
+ copied_opts_start = copied_opts;
+
+ while ((opt = strsep(&copied_opts, ",")) != NULL) {
+ name = strsep(&opt, "=");
+ val = opt;
+
+ id = bch2_mount_opt_lookup(name);
+
+ /* Check for the form "noopt", negation of a boolean opt: */
+ if (id < 0 &&
+ !val &&
+ !strncmp("no", name, 2)) {
+ id = bch2_mount_opt_lookup(name + 2);
+ val = "0";
+ }
+
+ /* Unknown options are ignored: */
+ if (id < 0)
+ continue;
+
+ if (!(bch2_opt_table[id].flags & OPT_MOUNT))
+ goto bad_opt;
+
+ if (id == Opt_acl &&
+ !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
+ goto bad_opt;
+
+ if ((id == Opt_usrquota ||
+ id == Opt_grpquota) &&
+ !IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
+ goto bad_opt;
+
+ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
+ if (ret < 0)
+ goto bad_val;
+
+ bch2_opt_set_by_id(opts, id, v);
+ }
+
+ ret = 0;
+ goto out;
+
+bad_opt:
+ pr_err("Bad mount option %s", name);
+ ret = -1;
+ goto out;
+bad_val:
+ pr_err("Invalid mount option %s", err.buf);
+ ret = -1;
+ goto out;
+out:
+ kfree(copied_opts_start);
+ printbuf_exit(&err);
+ return ret;
+}
+
+u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id)
+{
+ const struct bch_option *opt = bch2_opt_table + id;
+ u64 v;
+
+ v = opt->get_sb(sb);
+
+ if (opt->flags & OPT_SB_FIELD_ILOG2)
+ v = 1ULL << v;
+
+ if (opt->flags & OPT_SB_FIELD_SECTORS)
+ v <<= 9;
+
+ return v;
+}
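+
+/*
+ * Example (illustrative): encoded_extent_max is flagged
+ * OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2, so the superblock stores the ilog2
+ * of a sector count. With the default of 64k, __bch2_opt_set_sb() stores
+ * ilog2(65536 >> 9) = 7, and bch2_opt_from_sb() reads back
+ * (1 << 7) << 9 = 65536.
+ */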
+
+/*
+ * Initial options from superblock - here we don't want any options undefined,
+ * any options the superblock doesn't specify are set to 0:
+ */
+int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
+{
+ unsigned id;
+
+ for (id = 0; id < bch2_opts_nr; id++) {
+ const struct bch_option *opt = bch2_opt_table + id;
+
+ if (opt->get_sb == BCH2_NO_SB_OPT)
+ continue;
+
+ bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id));
+ }
+
+ return 0;
+}
+
+void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
+{
+ if (opt->set_sb == SET_BCH2_NO_SB_OPT)
+ return;
+
+ if (opt->flags & OPT_SB_FIELD_SECTORS)
+ v >>= 9;
+
+ if (opt->flags & OPT_SB_FIELD_ILOG2)
+ v = ilog2(v);
+
+ opt->set_sb(sb, v);
+}
+
+void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v)
+{
+ if (opt->set_sb == SET_BCH2_NO_SB_OPT)
+ return;
+
+ mutex_lock(&c->sb_lock);
+ __bch2_opt_set_sb(c->disk_sb.sb, opt, v);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+}
+
+/* io opts: */
+
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
+{
+ return (struct bch_io_opts) {
+#define x(_name, _bits) ._name = src._name,
+ BCH_INODE_OPTS()
+#undef x
+ };
+}
+
+bool bch2_opt_is_inode_opt(enum bch_opt_id id)
+{
+ static const enum bch_opt_id inode_opt_list[] = {
+#define x(_name, _bits) Opt_##_name,
+ BCH_INODE_OPTS()
+#undef x
+ };
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++)
+ if (inode_opt_list[i] == id)
+ return true;
+
+ return false;
+}
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
new file mode 100644
index 000000000000..8526f177450a
--- /dev/null
+++ b/fs/bcachefs/opts.h
@@ -0,0 +1,564 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_OPTS_H
+#define _BCACHEFS_OPTS_H
+
+#include <linux/bug.h>
+#include <linux/log2.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include "bcachefs_format.h"
+
+struct bch_fs;
+
+extern const char * const bch2_error_actions[];
+extern const char * const bch2_fsck_fix_opts[];
+extern const char * const bch2_version_upgrade_opts[];
+extern const char * const bch2_sb_features[];
+extern const char * const bch2_sb_compat[];
+extern const char * const __bch2_btree_ids[];
+extern const char * const bch2_csum_types[];
+extern const char * const bch2_csum_opts[];
+extern const char * const bch2_compression_types[];
+extern const char * const bch2_compression_opts[];
+extern const char * const bch2_str_hash_types[];
+extern const char * const bch2_str_hash_opts[];
+extern const char * const bch2_data_types[];
+extern const char * const bch2_member_states[];
+extern const char * const bch2_jset_entry_types[];
+extern const char * const bch2_fs_usage_types[];
+extern const char * const bch2_d_types[];
+
+static inline const char *bch2_d_type_str(unsigned d_type)
+{
+ return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)";
+}
+
+/*
+ * Mount options; we also store defaults in the superblock.
+ *
+ * Also exposed via sysfs: if an option is writeable, and it's also stored in
+ * the superblock, changing it via sysfs (currently? might change this) also
+ * updates the superblock.
+ *
+ * We store options as signed integers, where -1 means undefined. This means we
+ * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only
+ * apply the options from that struct that are defined.
+ */
+
+/* dummy option, for options that aren't stored in the superblock */
+u64 BCH2_NO_SB_OPT(const struct bch_sb *);
+void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64);
+
+/* When can be set: */
+enum opt_flags {
+ OPT_FS = (1 << 0), /* Filesystem option */
+ OPT_DEVICE = (1 << 1), /* Device option */
+ OPT_INODE = (1 << 2), /* Inode option */
+ OPT_FORMAT = (1 << 3), /* May be specified at format time */
+ OPT_MOUNT = (1 << 4), /* May be specified at mount time */
+ OPT_RUNTIME = (1 << 5), /* May be specified at runtime */
+ OPT_HUMAN_READABLE = (1 << 6),
+ OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */
+ OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */
+ OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */
+};
+
+enum opt_type {
+ BCH_OPT_BOOL,
+ BCH_OPT_UINT,
+ BCH_OPT_STR,
+ BCH_OPT_FN,
+};
+
+struct bch_opt_fn {
+ int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *);
+ void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+ int (*validate)(u64, struct printbuf *);
+};
+
+/**
+ * x(name, in-memory type, flags, option type, sb_opt, default, hint, help)
+ *
+ * @name - name of mount option, sysfs attribute, and struct bch_opts
+ * member
+ *
+ * @flags - when the option may be set (OPT_FS, OPT_FORMAT, OPT_MOUNT, ...)
+ *
+ * @sb_opt - name of corresponding superblock option, or BCH2_NO_SB_OPT if
+ * not stored in the superblock
+ *
+ * @option type - one of OPT_BOOL, OPT_UINT, OPT_STR, OPT_FN
+ */
+
+/*
+ * XXX: add fields for
+ * - default value
+ * - helptext
+ */
+
+#ifdef __KERNEL__
+#define RATELIMIT_ERRORS_DEFAULT true
+#else
+#define RATELIMIT_ERRORS_DEFAULT false
+#endif
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define BCACHEFS_VERBOSE_DEFAULT true
+#else
+#define BCACHEFS_VERBOSE_DEFAULT false
+#endif
+
+#define BCH_FIX_ERRORS_OPTS() \
+ x(exit, 0) \
+ x(yes, 1) \
+ x(no, 2) \
+ x(ask, 3)
+
+enum fsck_err_opts {
+#define x(t, n) FSCK_FIX_##t,
+ BCH_FIX_ERRORS_OPTS()
+#undef x
+};
+
+#define BCH_OPTS() \
+ x(block_size, u16, \
+ OPT_FS|OPT_FORMAT| \
+ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
+ OPT_UINT(512, 1U << 16), \
+ BCH_SB_BLOCK_SIZE, 8, \
+ "size", NULL) \
+ x(btree_node_size, u32, \
+ OPT_FS|OPT_FORMAT| \
+ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
+ OPT_UINT(512, 1U << 20), \
+ BCH_SB_BTREE_NODE_SIZE, 512, \
+ "size", "Btree node size, default 256k") \
+ x(errors, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_STR(bch2_error_actions), \
+ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \
+ NULL, "Action to take on filesystem error") \
+ x(metadata_replicas, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1, BCH_REPLICAS_MAX), \
+ BCH_SB_META_REPLICAS_WANT, 1, \
+ "#", "Number of metadata replicas") \
+ x(data_replicas, u8, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1, BCH_REPLICAS_MAX), \
+ BCH_SB_DATA_REPLICAS_WANT, 1, \
+ "#", "Number of data replicas") \
+ x(metadata_replicas_required, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_UINT(1, BCH_REPLICAS_MAX), \
+ BCH_SB_META_REPLICAS_REQ, 1, \
+ "#", NULL) \
+ x(data_replicas_required, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_UINT(1, BCH_REPLICAS_MAX), \
+ BCH_SB_DATA_REPLICAS_REQ, 1, \
+ "#", NULL) \
+ x(encoded_extent_max, u32, \
+ OPT_FS|OPT_FORMAT| \
+ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\
+ OPT_UINT(4096, 2U << 20), \
+ BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \
+ "size", "Maximum size of checksummed/compressed extents")\
+ x(metadata_checksum, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_STR(bch2_csum_opts), \
+ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
+ NULL, NULL) \
+ x(data_checksum, u8, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_STR(bch2_csum_opts), \
+ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
+ NULL, NULL) \
+ x(compression, u8, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FN(bch2_opt_compression), \
+ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \
+ NULL, NULL) \
+ x(background_compression, u8, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FN(bch2_opt_compression), \
+ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \
+ NULL, NULL) \
+ x(str_hash, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_STR(bch2_str_hash_opts), \
+ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \
+ NULL, "Hash function for directory entries and xattrs")\
+ x(metadata_target, u16, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FN(bch2_opt_target), \
+ BCH_SB_METADATA_TARGET, 0, \
+ "(target)", "Device or label for metadata writes") \
+ x(foreground_target, u16, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FN(bch2_opt_target), \
+ BCH_SB_FOREGROUND_TARGET, 0, \
+ "(target)", "Device or label for foreground writes") \
+ x(background_target, u16, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FN(bch2_opt_target), \
+ BCH_SB_BACKGROUND_TARGET, 0, \
+ "(target)", "Device or label to move data to in the background")\
+ x(promote_target, u16, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FN(bch2_opt_target), \
+ BCH_SB_PROMOTE_TARGET, 0, \
+ "(target)", "Device or label to promote data to on read") \
+ x(erasure_code, u16, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_ERASURE_CODE, false, \
+ NULL, "Enable erasure coding (DO NOT USE YET)") \
+ x(inodes_32bit, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_INODE_32BIT, true, \
+ NULL, "Constrain inode numbers to 32 bits") \
+ x(shard_inode_numbers, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_SHARD_INUMS, true, \
+ NULL, "Shard new inode numbers by CPU id") \
+ x(inodes_use_key_cache, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH_SB_INODES_USE_KEY_CACHE, true, \
+ NULL, "Use the btree key cache for the inodes btree") \
+ x(btree_node_mem_ptr_optimization, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Stash pointer to in memory btree node in btree ptr")\
+ x(btree_write_buffer_size, u32, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_UINT(16, (1U << 20) - 1), \
+ BCH2_NO_SB_OPT, 1U << 13, \
+ NULL, "Number of btree write buffer entries") \
+ x(gc_reserve_percent, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(5, 21), \
+ BCH_SB_GC_RESERVE, 8, \
+ "%", "Percentage of disk space to reserve for copygc")\
+ x(gc_reserve_bytes, u64, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \
+ OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \
+ OPT_UINT(0, U64_MAX), \
+ BCH_SB_GC_RESERVE_BYTES, 0, \
+ "%", "Amount of disk space to reserve for copygc\n" \
+ "Takes precedence over gc_reserve_percent if set")\
+ x(root_reserve_percent, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_UINT(0, 100), \
+ BCH_SB_ROOT_RESERVE, 0, \
+ "%", "Percentage of disk space to reserve for superuser")\
+ x(wide_macs, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_128_BIT_MACS, false, \
+ NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\
+ x(inline_data, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Enable inline data extents") \
+ x(acl, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH_SB_POSIX_ACL, true, \
+ NULL, "Enable POSIX acls") \
+ x(usrquota, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH_SB_USRQUOTA, false, \
+ NULL, "Enable user quotas") \
+ x(grpquota, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH_SB_GRPQUOTA, false, \
+ NULL, "Enable group quotas") \
+ x(prjquota, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH_SB_PRJQUOTA, false, \
+ NULL, "Enable project quotas") \
+ x(degraded, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Allow mounting in degraded mode") \
+ x(very_degraded, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Allow mounting in when data will be missing") \
+ x(discard, u8, \
+ OPT_FS|OPT_MOUNT|OPT_DEVICE, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Enable discard/TRIM support") \
+ x(verbose, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, BCACHEFS_VERBOSE_DEFAULT, \
+ NULL, "Extra debugging information during mount/recovery")\
+ x(journal_flush_delay, u32, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1, U32_MAX), \
+ BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \
+ NULL, "Delay in milliseconds before automatic journal commits")\
+ x(journal_flush_disabled, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_JOURNAL_FLUSH_DISABLED,false, \
+ NULL, "Disable journal flush on sync/fsync\n" \
+ "If enabled, writes can be lost, but only since the\n"\
+ "last journal write (default 1 second)") \
+ x(journal_reclaim_delay, u32, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(0, U32_MAX), \
+ BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \
+ NULL, "Delay in milliseconds before automatic journal reclaim")\
+ x(move_bytes_in_flight, u32, \
+ OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1024, U32_MAX), \
+ BCH2_NO_SB_OPT, 1U << 20, \
+ NULL, "Maximum Amount of IO to keep in flight by the move path")\
+ x(move_ios_in_flight, u32, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1, 1024), \
+ BCH2_NO_SB_OPT, 32, \
+ NULL, "Maximum number of IOs to keep in flight by the move path")\
+ x(fsck, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Run fsck on mount") \
+ x(fix_errors, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_FN(bch2_opt_fix_errors), \
+ BCH2_NO_SB_OPT, FSCK_FIX_exit, \
+ NULL, "Fix errors during fsck without asking") \
+ x(ratelimit_errors, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \
+ NULL, "Ratelimit error messages during fsck") \
+ x(nochanges, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Super read only mode - no writes at all will be issued,\n"\
+ "even if we have to replay the journal") \
+ x(norecovery, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Don't replay the journal") \
+ x(keep_journal, u8, \
+ 0, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Don't free journal entries/keys after startup")\
+ x(read_entire_journal, u8, \
+ 0, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Read all journal entries, not just dirty ones")\
+ x(read_journal_only, u8, \
+ 0, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Only read the journal, skip the rest of recovery")\
+ x(journal_transaction_names, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \
+ NULL, "Log transaction function names in journal") \
+ x(noexcl, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Don't open device in exclusive mode") \
+ x(direct_io, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Use O_DIRECT (userspace only)") \
+ x(sb, u64, \
+ OPT_MOUNT, \
+ OPT_UINT(0, S64_MAX), \
+ BCH2_NO_SB_OPT, BCH_SB_SECTOR, \
+ "offset", "Sector offset of superblock") \
+ x(read_only, u8, \
+ OPT_FS, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, NULL) \
+ x(nostart, u8, \
+ 0, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Don\'t start filesystem, only open devices") \
+ x(reconstruct_alloc, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Reconstruct alloc btree") \
+ x(version_upgrade, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_STR(bch2_version_upgrade_opts), \
+ BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \
+ NULL, "Set superblock to latest version,\n" \
+ "allowing any new features to be used") \
+ x(buckets_nouse, u8, \
+ 0, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Allocate the buckets_nouse bitmap") \
+ x(project, u8, \
+ OPT_INODE, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, NULL) \
+ x(nocow, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
+ OPT_BOOL(), \
+ BCH_SB_NOCOW, false, \
+ NULL, "Nocow mode: Writes will be done in place when possible.\n"\
+ "Snapshots and reflink will still caused writes to be COW\n"\
+ "Implicitly disables data checksumming, compression and encryption")\
+ x(nocow_enabled, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Enable nocow mode: enables runtime locking in\n"\
+ "data move path needed if nocow will ever be in use\n")\
+ x(no_data_io, u8, \
+ OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Skip submit_bio() for data reads and writes, " \
+ "for performance testing purposes") \
+ x(fs_size, u64, \
+ OPT_DEVICE, \
+ OPT_UINT(0, S64_MAX), \
+ BCH2_NO_SB_OPT, 0, \
+ "size", "Size of filesystem on device") \
+ x(bucket, u32, \
+ OPT_DEVICE, \
+ OPT_UINT(0, S64_MAX), \
+ BCH2_NO_SB_OPT, 0, \
+ "size", "Size of filesystem on device") \
+ x(durability, u8, \
+ OPT_DEVICE, \
+ OPT_UINT(0, BCH_REPLICAS_MAX), \
+ BCH2_NO_SB_OPT, 1, \
+ "n", "Data written to this device will be considered\n"\
+ "to have already been replicated n times")
+
+struct bch_opts {
+#define x(_name, _bits, ...) unsigned _name##_defined:1;
+ BCH_OPTS()
+#undef x
+
+#define x(_name, _bits, ...) _bits _name;
+ BCH_OPTS()
+#undef x
+};
+
+static const __maybe_unused struct bch_opts bch2_opts_default = {
+#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \
+ ._name##_defined = true, \
+ ._name = _default, \
+
+ BCH_OPTS()
+#undef x
+};
+
+#define opt_defined(_opts, _name) ((_opts)._name##_defined)
+
+#define opt_get(_opts, _name) \
+ (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name)
+
+#define opt_set(_opts, _name, _v) \
+do { \
+ (_opts)._name##_defined = true; \
+ (_opts)._name = _v; \
+} while (0)
+
+static inline struct bch_opts bch2_opts_empty(void)
+{
+ return (struct bch_opts) { 0 };
+}
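+
+/*
+ * Usage sketch for the helpers above (illustrative):
+ *
+ *	struct bch_opts opts = bch2_opts_empty();
+ *
+ *	opt_set(opts, metadata_replicas, 2);
+ *
+ * Afterwards opt_defined(opts, metadata_replicas) is true, while
+ * opt_defined(opts, data_replicas) is false and opt_get(opts, data_replicas)
+ * falls back to bch2_opts_default.
+ */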
+
+void bch2_opts_apply(struct bch_opts *, struct bch_opts);
+
+enum bch_opt_id {
+#define x(_name, ...) Opt_##_name,
+ BCH_OPTS()
+#undef x
+ bch2_opts_nr
+};
+
+struct bch_fs;
+struct printbuf;
+
+struct bch_option {
+ struct attribute attr;
+ u64 (*get_sb)(const struct bch_sb *);
+ void (*set_sb)(struct bch_sb *, u64);
+ enum opt_type type;
+ enum opt_flags flags;
+ u64 min, max;
+
+ const char * const *choices;
+
+ struct bch_opt_fn fn;
+
+ const char *hint;
+ const char *help;
+
+};
+
+extern const struct bch_option bch2_opt_table[];
+
+bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
+u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
+void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
+
+u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id);
+int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
+void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64);
+void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64);
+
+int bch2_opt_lookup(const char *);
+int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *);
+int bch2_opt_parse(struct bch_fs *, const struct bch_option *,
+ const char *, u64 *, struct printbuf *);
+
+#define OPT_SHOW_FULL_LIST (1 << 0)
+#define OPT_SHOW_MOUNT_STYLE (1 << 1)
+
+void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *,
+ const struct bch_option *, u64, unsigned);
+
+int bch2_opt_check_may_set(struct bch_fs *, int, u64);
+int bch2_opts_check_may_set(struct bch_fs *);
+int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *);
+
+/* inode opts: */
+
+struct bch_io_opts {
+#define x(_name, _bits) u##_bits _name;
+ BCH_INODE_OPTS()
+#undef x
+};
+
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
+bool bch2_opt_is_inode_opt(enum bch_opt_id);
+
+#endif /* _BCACHEFS_OPTS_H */
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
new file mode 100644
index 000000000000..accf246c3233
--- /dev/null
+++ b/fs/bcachefs/printbuf.c
@@ -0,0 +1,447 @@
+// SPDX-License-Identifier: LGPL-2.1+
+/* Copyright (C) 2022 Kent Overstreet */
+
+#include <linux/bitmap.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string_helpers.h>
+
+#include "printbuf.h"
+
+static inline unsigned printbuf_linelen(struct printbuf *buf)
+{
+ return buf->pos - buf->last_newline;
+}
+
+int bch2_printbuf_make_room(struct printbuf *out, unsigned extra)
+{
+ unsigned new_size;
+ char *buf;
+
+ if (!out->heap_allocated)
+ return 0;
+
+ /* Reserve space for the terminating nul: */
+ extra += 1;
+
+ if (out->pos + extra < out->size)
+ return 0;
+
+ new_size = roundup_pow_of_two(out->size + extra);
+
+ /*
+ * Note: output buffer must be freeable with kfree(), it's not required
+ * that the user use printbuf_exit().
+ */
+ buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT);
+
+ if (!buf) {
+ out->allocation_failure = true;
+ return -ENOMEM;
+ }
+
+ out->buf = buf;
+ out->size = new_size;
+ return 0;
+}
+
+void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
+{
+ int len;
+
+ do {
+ va_list args2;
+
+ va_copy(args2, args);
+ len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2);
+ } while (len + 1 >= printbuf_remaining(out) &&
+ !bch2_printbuf_make_room(out, len + 1));
+
+ len = min_t(size_t, len,
+ printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0);
+ out->pos += len;
+}
+
+void bch2_prt_printf(struct printbuf *out, const char *fmt, ...)
+{
+ va_list args;
+ int len;
+
+ do {
+ va_start(args, fmt);
+ len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args);
+ va_end(args);
+ } while (len + 1 >= printbuf_remaining(out) &&
+ !bch2_printbuf_make_room(out, len + 1));
+
+ len = min_t(size_t, len,
+ printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0);
+ out->pos += len;
+}
+
+/**
+ * bch2_printbuf_str() - returns printbuf's buf as a C string, guaranteed to be
+ * null terminated
+ * @buf: printbuf to terminate
+ * Returns: Printbuf contents, as a nul terminated C string
+ */
+const char *bch2_printbuf_str(const struct printbuf *buf)
+{
+ /*
+ * If we've written to a printbuf then it's guaranteed to be a null
+ * terminated string - but if we haven't, then we might not have
+ * allocated a buffer at all:
+ */
+ return buf->pos
+ ? buf->buf
+ : "";
+}
+
+/**
+ * bch2_printbuf_exit() - exit a printbuf, freeing memory it owns and poisoning it
+ * against accidental use.
+ * @buf: printbuf to exit
+ */
+void bch2_printbuf_exit(struct printbuf *buf)
+{
+ if (buf->heap_allocated) {
+ kfree(buf->buf);
+ buf->buf = ERR_PTR(-EINTR); /* poison value */
+ }
+}
+
+void bch2_printbuf_tabstops_reset(struct printbuf *buf)
+{
+ buf->nr_tabstops = 0;
+}
+
+void bch2_printbuf_tabstop_pop(struct printbuf *buf)
+{
+ if (buf->nr_tabstops)
+ --buf->nr_tabstops;
+}
+
+/*
+ * bch2_printbuf_tabstop_push() - add a tabstop, n spaces from the previous tabstop
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces from previous tabstop
+ *
+ * In the future this function may allocate memory if setting more than
+ * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start
+ * of line.
+ */
+int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces)
+{
+ unsigned prev_tabstop = buf->nr_tabstops
+ ? buf->_tabstops[buf->nr_tabstops - 1]
+ : 0;
+
+ if (WARN_ON(buf->nr_tabstops >= ARRAY_SIZE(buf->_tabstops)))
+ return -EINVAL;
+
+ buf->_tabstops[buf->nr_tabstops++] = prev_tabstop + spaces;
+ buf->has_indent_or_tabstops = true;
+ return 0;
+}
+
+/**
+ * bch2_printbuf_indent_add() - add to the current indent level
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces to add to the current indent level
+ *
+ * Subsequent lines, and the current line if the output position is at the start
+ * of the current line, will be indented by @spaces more spaces.
+ */
+void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces)
+{
+ if (WARN_ON_ONCE(buf->indent + spaces < buf->indent))
+ spaces = 0;
+
+ buf->indent += spaces;
+ prt_chars(buf, ' ', spaces);
+
+ buf->has_indent_or_tabstops = true;
+}
+
+/**
+ * bch2_printbuf_indent_sub() - subtract from the current indent level
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces to subtract from the current indent level
+ *
+ * Subsequent lines, and the current line if the output position is at the start
+ * of the current line, will be indented by @spaces less spaces.
+ */
+void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces)
+{
+ if (WARN_ON_ONCE(spaces > buf->indent))
+ spaces = buf->indent;
+
+ if (buf->last_newline + buf->indent == buf->pos) {
+ buf->pos -= spaces;
+ printbuf_nul_terminate(buf);
+ }
+ buf->indent -= spaces;
+
+ if (!buf->indent && !buf->nr_tabstops)
+ buf->has_indent_or_tabstops = false;
+}
+
+void bch2_prt_newline(struct printbuf *buf)
+{
+ unsigned i;
+
+ bch2_printbuf_make_room(buf, 1 + buf->indent);
+
+ __prt_char(buf, '\n');
+
+ buf->last_newline = buf->pos;
+
+ for (i = 0; i < buf->indent; i++)
+ __prt_char(buf, ' ');
+
+ printbuf_nul_terminate(buf);
+
+ buf->last_field = buf->pos;
+ buf->cur_tabstop = 0;
+}
+
+/*
+ * Returns spaces from start of line, if set, or 0 if unset:
+ */
+static inline unsigned cur_tabstop(struct printbuf *buf)
+{
+ return buf->cur_tabstop < buf->nr_tabstops
+ ? buf->_tabstops[buf->cur_tabstop]
+ : 0;
+}
+
+static void __prt_tab(struct printbuf *out)
+{
+ int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out));
+
+ prt_chars(out, ' ', spaces);
+
+ out->last_field = out->pos;
+ out->cur_tabstop++;
+}
+
+/**
+ * bch2_prt_tab() - Advance printbuf to the next tabstop
+ * @out: printbuf to control
+ *
+ * Advance output to the next tabstop by printing spaces.
+ */
+void bch2_prt_tab(struct printbuf *out)
+{
+ if (WARN_ON(!cur_tabstop(out)))
+ return;
+
+ __prt_tab(out);
+}
+
+static void __prt_tab_rjust(struct printbuf *buf)
+{
+ unsigned move = buf->pos - buf->last_field;
+ int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf);
+
+ if (pad > 0) {
+ bch2_printbuf_make_room(buf, pad);
+
+ if (buf->last_field + pad < buf->size)
+ memmove(buf->buf + buf->last_field + pad,
+ buf->buf + buf->last_field,
+ min(move, buf->size - 1 - buf->last_field - pad));
+
+ if (buf->last_field < buf->size)
+ memset(buf->buf + buf->last_field, ' ',
+ min((unsigned) pad, buf->size - buf->last_field));
+
+ buf->pos += pad;
+ printbuf_nul_terminate(buf);
+ }
+
+ buf->last_field = buf->pos;
+ buf->cur_tabstop++;
+}
+
+/**
+ * bch2_prt_tab_rjust() - Advance printbuf to the next tabstop, right
+ * justifying previous output
+ *
+ * @buf: printbuf to control
+ *
+ * Advance output to the next tabstop by inserting spaces immediately after the
+ * previous tabstop, right justifying previously output text.
+ */
+void bch2_prt_tab_rjust(struct printbuf *buf)
+{
+ if (WARN_ON(!cur_tabstop(buf)))
+ return;
+
+ __prt_tab_rjust(buf);
+}
+
+/**
+ * bch2_prt_bytes_indented() - Print an array of chars, handling embedded control characters
+ *
+ * @out: output printbuf
+ * @str: string to print
+ * @count: number of bytes to print
+ *
+ * The following control characters are handled as follows:
+ * \n: prt_newline newline that obeys current indent level
+ * \t: prt_tab advance to next tabstop
+ * \r: prt_tab_rjust advance to next tabstop, with right justification
+ */
+void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count)
+{
+ const char *unprinted_start = str;
+ const char *end = str + count;
+
+ if (!out->has_indent_or_tabstops || out->suppress_indent_tabstop_handling) {
+ prt_bytes(out, str, count);
+ return;
+ }
+
+ while (str != end) {
+ switch (*str) {
+ case '\n':
+ prt_bytes(out, unprinted_start, str - unprinted_start);
+ unprinted_start = str + 1;
+ bch2_prt_newline(out);
+ break;
+ case '\t':
+ if (likely(cur_tabstop(out))) {
+ prt_bytes(out, unprinted_start, str - unprinted_start);
+ unprinted_start = str + 1;
+ __prt_tab(out);
+ }
+ break;
+ case '\r':
+ if (likely(cur_tabstop(out))) {
+ prt_bytes(out, unprinted_start, str - unprinted_start);
+ unprinted_start = str + 1;
+ __prt_tab_rjust(out);
+ }
+ break;
+ }
+
+ str++;
+ }
+
+ prt_bytes(out, unprinted_start, str - unprinted_start);
+}
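+
+/*
+ * Example (illustrative sketch): with a tabstop pushed, a whole row can be
+ * emitted from one string, letting the control characters do the layout:
+ *
+ *	bch2_printbuf_tabstop_push(&buf, 24);
+ *	prt_str_indented(&buf, "compression:\tlz4\n");
+ */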
+
+/**
+ * bch2_prt_human_readable_u64() - Print out a u64 in human readable units
+ * @out: output printbuf
+ * @v: integer to print
+ *
+ * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
+ */
+void bch2_prt_human_readable_u64(struct printbuf *out, u64 v)
+{
+ bch2_printbuf_make_room(out, 10);
+ out->pos += string_get_size(v, 1, !out->si_units,
+ out->buf + out->pos,
+ printbuf_remaining_size(out));
+}
+
+/**
+ * bch2_prt_human_readable_s64() - Print out a s64 in human readable units
+ * @out: output printbuf
+ * @v: integer to print
+ *
+ * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
+ */
+void bch2_prt_human_readable_s64(struct printbuf *out, s64 v)
+{
+ if (v < 0)
+ prt_char(out, '-');
+ bch2_prt_human_readable_u64(out, abs(v));
+}
+
+/**
+ * bch2_prt_units_u64() - Print out a u64 according to printbuf unit options
+ * @out: output printbuf
+ * @v: integer to print
+ *
+ * Units are either raw (default), or human readable units (controlled via
+ * @buf->human_readable_units)
+ */
+void bch2_prt_units_u64(struct printbuf *out, u64 v)
+{
+ if (out->human_readable_units)
+ bch2_prt_human_readable_u64(out, v);
+ else
+ bch2_prt_printf(out, "%llu", v);
+}
+
+/**
+ * bch2_prt_units_s64() - Print out a s64 according to printbuf unit options
+ * @out: output printbuf
+ * @v: integer to print
+ *
+ * Units are either raw (default), or human readable units (controlled via
+ * @buf->human_readable_units)
+ */
+void bch2_prt_units_s64(struct printbuf *out, s64 v)
+{
+ if (v < 0)
+ prt_char(out, '-');
+ bch2_prt_units_u64(out, abs(v));
+}
+
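+/*
+ * Example (illustrative): for list = { "none", "lz4", "gzip", NULL } and
+ * selected == 1, this emits "none [lz4] gzip " (with a trailing space).
+ */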
+void bch2_prt_string_option(struct printbuf *out,
+ const char * const list[],
+ size_t selected)
+{
+ size_t i;
+
+ for (i = 0; list[i]; i++)
+ bch2_prt_printf(out, i == selected ? "[%s] " : "%s ", list[i]);
+}
+
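+/*
+ * Example (illustrative): for list = { "a", "b", "c", NULL } and flags == 0x5,
+ * this emits "a,c".
+ */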
+void bch2_prt_bitflags(struct printbuf *out,
+ const char * const list[], u64 flags)
+{
+ unsigned bit, nr = 0;
+ bool first = true;
+
+ while (list[nr])
+ nr++;
+
+ while (flags && (bit = __ffs64(flags)) < nr) {
+ if (!first)
+ bch2_prt_printf(out, ",");
+ first = false;
+ bch2_prt_printf(out, "%s", list[bit]);
+ flags ^= BIT_ULL(bit);
+ }
+}
+
+void bch2_prt_bitflags_vector(struct printbuf *out,
+ const char * const list[],
+ unsigned long *v, unsigned nr)
+{
+ bool first = true;
+ unsigned i;
+
+ for (i = 0; i < nr; i++)
+ if (!list[i]) {
+ nr = i;
+ break;
+ }
+
+ for_each_set_bit(i, v, nr) {
+ if (!first)
+ bch2_prt_printf(out, ",");
+ first = false;
+ bch2_prt_printf(out, "%s", list[i]);
+ }
+}
diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h
new file mode 100644
index 000000000000..9a4a56c40937
--- /dev/null
+++ b/fs/bcachefs/printbuf.h
@@ -0,0 +1,286 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+/* Copyright (C) 2022 Kent Overstreet */
+
+#ifndef _BCACHEFS_PRINTBUF_H
+#define _BCACHEFS_PRINTBUF_H
+
+/*
+ * Printbufs: Simple strings for printing to, with optional heap allocation
+ *
+ * This code has provisions for use in userspace, to aid in making other code
+ * portable between kernelspace and userspace.
+ *
+ * Basic example:
+ * struct printbuf buf = PRINTBUF;
+ *
+ * prt_printf(&buf, "foo=");
+ * foo_to_text(&buf, foo);
+ * printk("%s", buf.buf);
+ * printbuf_exit(&buf);
+ *
+ * Or
+ * struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size)
+ *
+ * We can now write pretty printers instead of writing code that dumps
+ * everything to the kernel log buffer, and then those pretty-printers can be
+ * used by other code that outputs to kernel log, sysfs, debugfs, etc.
+ *
+ * Memory allocation: Outputting to a printbuf may allocate memory. This
+ * allocation is done with GFP_KERNEL, by default: use the newer
+ * memalloc_*_(save|restore) functions as needed.
+ *
+ * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations
+ * will be done with GFP_NOWAIT if printbuf->atomic is nonzero.
+ *
+ * It's allowed to grab the output buffer and free it later with kfree() instead
+ * of using printbuf_exit(), if the user just needs a heap allocated string at
+ * the end.
+ *
+ * Memory allocation failures: We don't return errors directly, because on
+ * memory allocation failure we usually don't want to bail out and unwind - we
+ * want to print what we've got, on a best-effort basis. But code that does want
+ * to return -ENOMEM may check printbuf.allocation_failure.
+ *
+ * Indenting, tabstops:
+ *
+ * To aid in writing multi-line pretty printers spread across multiple
+ * functions, printbufs track the current indent level.
+ *
+ * printbuf_indent_add() and printbuf_indent_sub() increase and decrease the
+ * current indent level, respectively.
+ *
+ * To use tabstops, add them with printbuf_tabstop_push(); each is in units of
+ * spaces relative to the previous tabstop. Once set, prt_tab() will output
+ * spaces up to the next tabstop.
+ * prt_tab_rjust() will also advance the current line of text up to the next
+ * tabstop, but it does so by shifting text since the previous tabstop up to the
+ * next tabstop - right justifying it.
+ *
+ * Make sure you use prt_newline() instead of \n in the format string for indent
+ * level and tabstops to work correctly.
+ *
+ * Output units: printbuf->si_units and printbuf->human_readable_units tell
+ * pretty-printers how to output numbers: as a raw value (e.g. directly from a
+ * superblock field) or as human readable bytes. bch2_prt_units_u64() and
+ * bch2_prt_units_s64() obey them.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+enum printbuf_si {
+ PRINTBUF_UNITS_2, /* use binary powers of 2^10 */
+ PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */
+};
+
+#define PRINTBUF_INLINE_TABSTOPS 6
+
+struct printbuf {
+ char *buf;
+ unsigned size;
+ unsigned pos;
+ unsigned last_newline;
+ unsigned last_field;
+ unsigned indent;
+ /*
+ * If nonzero, allocations will be done with GFP_ATOMIC:
+ */
+ u8 atomic;
+ bool allocation_failure:1;
+ bool heap_allocated:1;
+ enum printbuf_si si_units:1;
+ bool human_readable_units:1;
+ bool has_indent_or_tabstops:1;
+ bool suppress_indent_tabstop_handling:1;
+ u8 nr_tabstops;
+
+ /*
+ * Do not modify directly: use printbuf_tabstop_add(),
+ * printbuf_tabstop_get()
+ */
+ u8 cur_tabstop;
+ u8 _tabstops[PRINTBUF_INLINE_TABSTOPS];
+};
+
+int bch2_printbuf_make_room(struct printbuf *, unsigned);
+__printf(2, 3) void bch2_prt_printf(struct printbuf *out, const char *fmt, ...);
+__printf(2, 0) void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list);
+const char *bch2_printbuf_str(const struct printbuf *);
+void bch2_printbuf_exit(struct printbuf *);
+
+void bch2_printbuf_tabstops_reset(struct printbuf *);
+void bch2_printbuf_tabstop_pop(struct printbuf *);
+int bch2_printbuf_tabstop_push(struct printbuf *, unsigned);
+
+void bch2_printbuf_indent_add(struct printbuf *, unsigned);
+void bch2_printbuf_indent_sub(struct printbuf *, unsigned);
+
+void bch2_prt_newline(struct printbuf *);
+void bch2_prt_tab(struct printbuf *);
+void bch2_prt_tab_rjust(struct printbuf *);
+
+void bch2_prt_bytes_indented(struct printbuf *, const char *, unsigned);
+void bch2_prt_human_readable_u64(struct printbuf *, u64);
+void bch2_prt_human_readable_s64(struct printbuf *, s64);
+void bch2_prt_units_u64(struct printbuf *, u64);
+void bch2_prt_units_s64(struct printbuf *, s64);
+void bch2_prt_string_option(struct printbuf *, const char * const[], size_t);
+void bch2_prt_bitflags(struct printbuf *, const char * const[], u64);
+void bch2_prt_bitflags_vector(struct printbuf *, const char * const[],
+ unsigned long *, unsigned);
+
+/* Initializer for a heap allocated printbuf: */
+#define PRINTBUF ((struct printbuf) { .heap_allocated = true })
+
+/* Initializer for a printbuf that points to an external buffer: */
+#define PRINTBUF_EXTERN(_buf, _size) \
+((struct printbuf) { \
+ .buf = _buf, \
+ .size = _size, \
+})
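+
+/*
+ * Example (illustrative sketch; nr_buckets is just a placeholder): printing
+ * into a fixed stack buffer - the buffer is not grown, and output that doesn't
+ * fit is truncated but still nul terminated:
+ *
+ *	char stack_buf[64];
+ *	struct printbuf buf = PRINTBUF_EXTERN(stack_buf, sizeof(stack_buf));
+ *
+ *	prt_printf(&buf, "%u buckets", nr_buckets);
+ */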
+
+/*
+ * Returns the number of bytes remaining in the output buffer, including space
+ * for the terminating nul:
+ */
+static inline unsigned printbuf_remaining_size(struct printbuf *out)
+{
+ return out->pos < out->size ? out->size - out->pos : 0;
+}
+
+/*
+ * Returns number of characters we can print to the output buffer - i.e.
+ * excluding the terminating nul:
+ */
+static inline unsigned printbuf_remaining(struct printbuf *out)
+{
+ return out->pos < out->size ? out->size - out->pos - 1 : 0;
+}
+
+static inline unsigned printbuf_written(struct printbuf *out)
+{
+ return out->size ? min(out->pos, out->size - 1) : 0;
+}
+
+/*
+ * Returns true if output was truncated:
+ */
+static inline bool printbuf_overflowed(struct printbuf *out)
+{
+ return out->pos >= out->size;
+}
+
+static inline void printbuf_nul_terminate(struct printbuf *out)
+{
+ bch2_printbuf_make_room(out, 1);
+
+ if (out->pos < out->size)
+ out->buf[out->pos] = 0;
+ else if (out->size)
+ out->buf[out->size - 1] = 0;
+}
+
+/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */
+static inline void __prt_char_reserved(struct printbuf *out, char c)
+{
+ if (printbuf_remaining(out))
+ out->buf[out->pos] = c;
+ out->pos++;
+}
+
+/* Doesn't nul terminate: */
+static inline void __prt_char(struct printbuf *out, char c)
+{
+ bch2_printbuf_make_room(out, 1);
+ __prt_char_reserved(out, c);
+}
+
+static inline void prt_char(struct printbuf *out, char c)
+{
+ __prt_char(out, c);
+ printbuf_nul_terminate(out);
+}
+
+static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n)
+{
+ unsigned i, can_print = min(n, printbuf_remaining(out));
+
+ for (i = 0; i < can_print; i++)
+ out->buf[out->pos++] = c;
+ out->pos += n - can_print;
+}
+
+static inline void prt_chars(struct printbuf *out, char c, unsigned n)
+{
+ bch2_printbuf_make_room(out, n);
+ __prt_chars_reserved(out, c, n);
+ printbuf_nul_terminate(out);
+}
+
+static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n)
+{
+ unsigned i, can_print;
+
+ bch2_printbuf_make_room(out, n);
+
+ can_print = min(n, printbuf_remaining(out));
+
+ for (i = 0; i < can_print; i++)
+ out->buf[out->pos++] = ((char *) b)[i];
+ out->pos += n - can_print;
+
+ printbuf_nul_terminate(out);
+}
+
+static inline void prt_str(struct printbuf *out, const char *str)
+{
+ prt_bytes(out, str, strlen(str));
+}
+
+static inline void prt_str_indented(struct printbuf *out, const char *str)
+{
+ bch2_prt_bytes_indented(out, str, strlen(str));
+}
+
+static inline void prt_hex_byte(struct printbuf *out, u8 byte)
+{
+ bch2_printbuf_make_room(out, 2);
+ __prt_char_reserved(out, hex_asc_hi(byte));
+ __prt_char_reserved(out, hex_asc_lo(byte));
+ printbuf_nul_terminate(out);
+}
+
+static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte)
+{
+ bch2_printbuf_make_room(out, 2);
+ __prt_char_reserved(out, hex_asc_upper_hi(byte));
+ __prt_char_reserved(out, hex_asc_upper_lo(byte));
+ printbuf_nul_terminate(out);
+}
+
+/**
+ * printbuf_reset() - re-use a printbuf without freeing and re-initializing it
+ */
+static inline void printbuf_reset(struct printbuf *buf)
+{
+ buf->pos = 0;
+ buf->allocation_failure = 0;
+ buf->indent = 0;
+ buf->nr_tabstops = 0;
+ buf->cur_tabstop = 0;
+}
+
+/**
+ * printbuf_atomic_inc - mark as entering an atomic section
+ */
+static inline void printbuf_atomic_inc(struct printbuf *buf)
+{
+ buf->atomic++;
+}
+
+/**
+ * printbuf_atomic_dec - mark as leaving an atomic section
+ */
+static inline void printbuf_atomic_dec(struct printbuf *buf)
+{
+ buf->atomic--;
+}
+
+#endif /* _BCACHEFS_PRINTBUF_H */
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
new file mode 100644
index 000000000000..a54647c36b85
--- /dev/null
+++ b/fs/bcachefs/quota.c
@@ -0,0 +1,979 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "errcode.h"
+#include "error.h"
+#include "inode.h"
+#include "quota.h"
+#include "snapshot.h"
+#include "super-io.h"
+
+static const char * const bch2_quota_types[] = {
+ "user",
+ "group",
+ "project",
+};
+
+static const char * const bch2_quota_counters[] = {
+ "space",
+ "inodes",
+};
+
+static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_quota *q = field_to_type(f, quota);
+
+ if (vstruct_bytes(&q->field) < sizeof(*q)) {
+ prt_printf(err, "wrong size (got %zu should be %zu)",
+ vstruct_bytes(&q->field), sizeof(*q));
+ return -BCH_ERR_invalid_sb_quota;
+ }
+
+ return 0;
+}
+
+static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_quota *q = field_to_type(f, quota);
+ unsigned qtyp, counter;
+
+ for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) {
+ prt_printf(out, "%s: flags %llx",
+ bch2_quota_types[qtyp],
+ le64_to_cpu(q->q[qtyp].flags));
+
+ for (counter = 0; counter < Q_COUNTERS; counter++)
+ prt_printf(out, " %s timelimit %u warnlimit %u",
+ bch2_quota_counters[counter],
+ le32_to_cpu(q->q[qtyp].c[counter].timelimit),
+ le32_to_cpu(q->q[qtyp].c[counter].warnlimit));
+
+ prt_newline(out);
+ }
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_quota = {
+ .validate = bch2_sb_quota_validate,
+ .to_text = bch2_sb_quota_to_text,
+};
+
+int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, c, err,
+ quota_type_invalid,
+ "invalid quota type (%llu >= %u)",
+ k.k->p.inode, QTYP_NR);
+fsck_err:
+ return ret;
+}
+
+void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_quota dq = bkey_s_c_to_quota(k);
+ unsigned i;
+
+ for (i = 0; i < Q_COUNTERS; i++)
+ prt_printf(out, "%s hardlimit %llu softlimit %llu",
+ bch2_quota_counters[i],
+ le64_to_cpu(dq.v->c[i].hardlimit),
+ le64_to_cpu(dq.v->c[i].softlimit));
+}
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+#include <linux/cred.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+
+static void qc_info_to_text(struct printbuf *out, struct qc_info *i)
+{
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 20);
+
+ prt_str(out, "i_fieldmask");
+ prt_tab(out);
+ prt_printf(out, "%x", i->i_fieldmask);
+ prt_newline(out);
+
+ prt_str(out, "i_flags");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_flags);
+ prt_newline(out);
+
+ prt_str(out, "i_spc_timelimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_spc_timelimit);
+ prt_newline(out);
+
+ prt_str(out, "i_ino_timelimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_ino_timelimit);
+ prt_newline(out);
+
+ prt_str(out, "i_rt_spc_timelimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_rt_spc_timelimit);
+ prt_newline(out);
+
+ prt_str(out, "i_spc_warnlimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_spc_warnlimit);
+ prt_newline(out);
+
+ prt_str(out, "i_ino_warnlimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_ino_warnlimit);
+ prt_newline(out);
+
+ prt_str(out, "i_rt_spc_warnlimit");
+ prt_tab(out);
+ prt_printf(out, "%u", i->i_rt_spc_warnlimit);
+ prt_newline(out);
+}
+
+static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q)
+{
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 20);
+
+ prt_str(out, "d_fieldmask");
+ prt_tab(out);
+ prt_printf(out, "%x", q->d_fieldmask);
+ prt_newline(out);
+
+ prt_str(out, "d_spc_hardlimit");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_spc_hardlimit);
+ prt_newline(out);
+
+ prt_str(out, "d_spc_softlimit");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_spc_softlimit);
+ prt_newline(out);
+
+ prt_str(out, "d_ino_hardlimit");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_ino_hardlimit);
+ prt_newline(out);
+
+ prt_str(out, "d_ino_softlimit");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_ino_softlimit);
+ prt_newline(out);
+
+ prt_str(out, "d_space");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_space);
+ prt_newline(out);
+
+ prt_str(out, "d_ino_count");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_ino_count);
+ prt_newline(out);
+
+ prt_str(out, "d_ino_timer");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_ino_timer);
+ prt_newline(out);
+
+ prt_str(out, "d_spc_timer");
+ prt_tab(out);
+ prt_printf(out, "%llu", q->d_spc_timer);
+ prt_newline(out);
+
+ prt_str(out, "d_ino_warns");
+ prt_tab(out);
+ prt_printf(out, "%i", q->d_ino_warns);
+ prt_newline(out);
+
+ prt_str(out, "d_spc_warns");
+ prt_tab(out);
+ prt_printf(out, "%i", q->d_spc_warns);
+ prt_newline(out);
+}
+
+static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
+{
+ qtypes >>= i;
+ return qtypes ? i + __ffs(qtypes) : QTYP_NR;
+}
+
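+/*
+ * Iterate over the quota types whose bits are set in @_qtypes, with @_i set to
+ * the quota type index and @_q pointing at the corresponding in-memory quota
+ * table:
+ */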
+#define for_each_set_qtype(_c, _i, _q, _qtypes) \
+ for (_i = 0; \
+ (_i = __next_qtype(_i, _qtypes), \
+ _q = &(_c)->quotas[_i], \
+ _i < QTYP_NR); \
+ _i++)
+
+static bool ignore_hardlimit(struct bch_memquota_type *q)
+{
+ if (capable(CAP_SYS_RESOURCE))
+ return true;
+#if 0
+ struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
+
+ return capable(CAP_SYS_RESOURCE) &&
+ (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
+ !(info->dqi_flags & DQF_ROOT_SQUASH));
+#endif
+ return false;
+}
+
+enum quota_msg {
+ SOFTWARN, /* Softlimit reached */
+ SOFTLONGWARN, /* Grace time expired */
+ HARDWARN, /* Hardlimit reached */
+
+ HARDBELOW, /* Usage got below inode hardlimit */
+ SOFTBELOW, /* Usage got below inode softlimit */
+};
+
+static int quota_nl[][Q_COUNTERS] = {
+ [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN,
+ [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN,
+ [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN,
+ [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW,
+ [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW,
+
+ [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN,
+ [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN,
+ [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN,
+ [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW,
+ [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW,
+};
+
+struct quota_msgs {
+ u8 nr;
+ struct {
+ u8 qtype;
+ u8 msg;
+ } m[QTYP_NR * Q_COUNTERS];
+};
+
+static void prepare_msg(unsigned qtype,
+ enum quota_counters counter,
+ struct quota_msgs *msgs,
+ enum quota_msg msg_type)
+{
+ BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m));
+
+ msgs->m[msgs->nr].qtype = qtype;
+ msgs->m[msgs->nr].msg = quota_nl[msg_type][counter];
+ msgs->nr++;
+}
+
+static void prepare_warning(struct memquota_counter *qc,
+ unsigned qtype,
+ enum quota_counters counter,
+ struct quota_msgs *msgs,
+ enum quota_msg msg_type)
+{
+ if (qc->warning_issued & (1 << msg_type))
+ return;
+
+ prepare_msg(qtype, counter, msgs, msg_type);
+}
+
+static void flush_warnings(struct bch_qid qid,
+ struct super_block *sb,
+ struct quota_msgs *msgs)
+{
+ unsigned i;
+
+ for (i = 0; i < msgs->nr; i++)
+ quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]),
+ sb->s_dev, msgs->m[i].msg);
+}
+
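+/*
+ * Check whether applying the delta @v to @counter would exceed this quota's
+ * limits: queues soft/hard limit warning messages as needed and returns
+ * -EDQUOT if the update must be rejected; decrements (v <= 0) never fail and
+ * may clear previously issued warnings.
+ */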
+static int bch2_quota_check_limit(struct bch_fs *c,
+ unsigned qtype,
+ struct bch_memquota *mq,
+ struct quota_msgs *msgs,
+ enum quota_counters counter,
+ s64 v,
+ enum quota_acct_mode mode)
+{
+ struct bch_memquota_type *q = &c->quotas[qtype];
+ struct memquota_counter *qc = &mq->c[counter];
+ u64 n = qc->v + v;
+
+ BUG_ON((s64) n < 0);
+
+ if (mode == KEY_TYPE_QUOTA_NOCHECK)
+ return 0;
+
+ if (v <= 0) {
+ if (n < qc->hardlimit &&
+ (qc->warning_issued & (1 << HARDWARN))) {
+ qc->warning_issued &= ~(1 << HARDWARN);
+ prepare_msg(qtype, counter, msgs, HARDBELOW);
+ }
+
+ if (n < qc->softlimit &&
+ (qc->warning_issued & (1 << SOFTWARN))) {
+ qc->warning_issued &= ~(1 << SOFTWARN);
+ prepare_msg(qtype, counter, msgs, SOFTBELOW);
+ }
+
+ qc->warning_issued = 0;
+ return 0;
+ }
+
+ if (qc->hardlimit &&
+ qc->hardlimit < n &&
+ !ignore_hardlimit(q)) {
+ prepare_warning(qc, qtype, counter, msgs, HARDWARN);
+ return -EDQUOT;
+ }
+
+ if (qc->softlimit &&
+ qc->softlimit < n) {
+ if (qc->timer == 0) {
+ qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit;
+ prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
+ } else if (ktime_get_real_seconds() >= qc->timer &&
+ !ignore_hardlimit(q)) {
+ prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
+ return -EDQUOT;
+ }
+ }
+
+ return 0;
+}
+
+int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
+ enum quota_counters counter, s64 v,
+ enum quota_acct_mode mode)
+{
+ unsigned qtypes = enabled_qtypes(c);
+ struct bch_memquota_type *q;
+ struct bch_memquota *mq[QTYP_NR];
+ struct quota_msgs msgs;
+ unsigned i;
+ int ret = 0;
+
+ memset(&msgs, 0, sizeof(msgs));
+
+ for_each_set_qtype(c, i, q, qtypes) {
+ mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_KERNEL);
+ if (!mq[i])
+ return -ENOMEM;
+ }
+
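+ /*
+ * Lock every affected quota type; the lockdep subclass is the qtype
+ * index, since the locks are always taken in increasing qtype order:
+ */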
+ for_each_set_qtype(c, i, q, qtypes)
+ mutex_lock_nested(&q->lock, i);
+
+ for_each_set_qtype(c, i, q, qtypes) {
+ ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode);
+ if (ret)
+ goto err;
+ }
+
+ for_each_set_qtype(c, i, q, qtypes)
+ mq[i]->c[counter].v += v;
+err:
+ for_each_set_qtype(c, i, q, qtypes)
+ mutex_unlock(&q->lock);
+
+ flush_warnings(qid, c->vfs_sb, &msgs);
+
+ return ret;
+}
+
+static void __bch2_quota_transfer(struct bch_memquota *src_q,
+ struct bch_memquota *dst_q,
+ enum quota_counters counter, s64 v)
+{
+ BUG_ON(v > src_q->c[counter].v);
+ BUG_ON(v + dst_q->c[counter].v < v);
+
+ src_q->c[counter].v -= v;
+ dst_q->c[counter].v += v;
+}
+
+int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
+ struct bch_qid dst,
+ struct bch_qid src, u64 space,
+ enum quota_acct_mode mode)
+{
+ struct bch_memquota_type *q;
+ struct bch_memquota *src_q[3], *dst_q[3];
+ struct quota_msgs msgs;
+ unsigned i;
+ int ret = 0;
+
+ qtypes &= enabled_qtypes(c);
+
+ memset(&msgs, 0, sizeof(msgs));
+
+ for_each_set_qtype(c, i, q, qtypes) {
+ src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_KERNEL);
+ dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_KERNEL);
+ if (!src_q[i] || !dst_q[i])
+ return -ENOMEM;
+ }
+
+ for_each_set_qtype(c, i, q, qtypes)
+ mutex_lock_nested(&q->lock, i);
+
+ for_each_set_qtype(c, i, q, qtypes) {
+ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC,
+ dst_q[i]->c[Q_SPC].v + space,
+ mode);
+ if (ret)
+ goto err;
+
+ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO,
+ dst_q[i]->c[Q_INO].v + 1,
+ mode);
+ if (ret)
+ goto err;
+ }
+
+ for_each_set_qtype(c, i, q, qtypes) {
+ __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space);
+ __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1);
+ }
+
+err:
+ for_each_set_qtype(c, i, q, qtypes)
+ mutex_unlock(&q->lock);
+
+ flush_warnings(dst, c->vfs_sb, &msgs);
+
+ return ret;
+}
+
+static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k,
+ struct qc_dqblk *qdq)
+{
+ struct bkey_s_c_quota dq;
+ struct bch_memquota_type *q;
+ struct bch_memquota *mq;
+ unsigned i;
+
+ BUG_ON(k.k->p.inode >= QTYP_NR);
+
+ if (!((1U << k.k->p.inode) & enabled_qtypes(c)))
+ return 0;
+
+ switch (k.k->type) {
+ case KEY_TYPE_quota:
+ dq = bkey_s_c_to_quota(k);
+ q = &c->quotas[k.k->p.inode];
+
+ mutex_lock(&q->lock);
+ mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL);
+ if (!mq) {
+ mutex_unlock(&q->lock);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < Q_COUNTERS; i++) {
+ mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit);
+ mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit);
+ }
+
+ if (qdq && qdq->d_fieldmask & QC_SPC_TIMER)
+ mq->c[Q_SPC].timer = qdq->d_spc_timer;
+ if (qdq && qdq->d_fieldmask & QC_SPC_WARNS)
+ mq->c[Q_SPC].warns = qdq->d_spc_warns;
+ if (qdq && qdq->d_fieldmask & QC_INO_TIMER)
+ mq->c[Q_INO].timer = qdq->d_ino_timer;
+ if (qdq && qdq->d_fieldmask & QC_INO_WARNS)
+ mq->c[Q_INO].warns = qdq->d_ino_warns;
+
+ mutex_unlock(&q->lock);
+ }
+
+ return 0;
+}
+
+void bch2_fs_quota_exit(struct bch_fs *c)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
+ genradix_free(&c->quotas[i].table);
+}
+
+void bch2_fs_quota_init(struct bch_fs *c)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
+ mutex_init(&c->quotas[i].lock);
+}
+
+static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb)
+{
+ struct bch_sb_field_quota *sb_quota = bch2_sb_field_get(sb->sb, quota);
+
+ if (sb_quota)
+ return sb_quota;
+
+ sb_quota = bch2_sb_field_resize(sb, quota, sizeof(*sb_quota) / sizeof(u64));
+ if (sb_quota) {
+ unsigned qtype, qc;
+
+ for (qtype = 0; qtype < QTYP_NR; qtype++)
+ for (qc = 0; qc < Q_COUNTERS; qc++)
+ sb_quota->q[qtype].c[qc].timelimit =
+ cpu_to_le32(7 * 24 * 60 * 60);
+ }
+
+ return sb_quota;
+}
+
+static void bch2_sb_quota_read(struct bch_fs *c)
+{
+ struct bch_sb_field_quota *sb_quota;
+ unsigned i, j;
+
+ sb_quota = bch2_sb_field_get(c->disk_sb.sb, quota);
+ if (!sb_quota)
+ return;
+
+ for (i = 0; i < QTYP_NR; i++) {
+ struct bch_memquota_type *q = &c->quotas[i];
+
+ for (j = 0; j < Q_COUNTERS; j++) {
+ q->limits[j].timelimit =
+ le32_to_cpu(sb_quota->q[i].c[j].timelimit);
+ q->limits[j].warnlimit =
+ le32_to_cpu(sb_quota->q[i].c[j].warnlimit);
+ }
+ }
+}
+
+static int bch2_fs_quota_read_inode(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked u;
+ struct bch_snapshot_tree s_t;
+ int ret;
+
+ ret = bch2_snapshot_tree_lookup(trans,
+ bch2_snapshot_tree(c, k.k->p.snapshot), &s_t);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+ "%s: snapshot tree %u not found", __func__,
+ snapshot_t(c, k.k->p.snapshot)->tree);
+ if (ret)
+ return ret;
+
+ if (!s_t.master_subvol)
+ goto advance;
+
+ ret = bch2_inode_find_by_inum_nowarn_trans(trans,
+ (subvol_inum) {
+ le32_to_cpu(s_t.master_subvol),
+ k.k->p.offset,
+ }, &u);
+ /*
+ * Inode might be deleted in this snapshot - the easiest way to handle
+ * that is to just skip it here:
+ */
+ if (bch2_err_matches(ret, ENOENT))
+ goto advance;
+
+ if (ret)
+ return ret;
+
+ bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
+ KEY_TYPE_QUOTA_NOCHECK);
+ bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
+ KEY_TYPE_QUOTA_NOCHECK);
+advance:
+ bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
+ return 0;
+}
+
+int bch2_fs_quota_read(struct bch_fs *c)
+{
+ struct bch_sb_field_quota *sb_quota;
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ mutex_lock(&c->sb_lock);
+ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+ if (!sb_quota) {
+ mutex_unlock(&c->sb_lock);
+ return -BCH_ERR_ENOSPC_sb_quota;
+ }
+
+ bch2_sb_quota_read(c);
+ mutex_unlock(&c->sb_lock);
+
+ trans = bch2_trans_get(c);
+
+ ret = for_each_btree_key2(trans, iter, BTREE_ID_quotas,
+ POS_MIN, BTREE_ITER_PREFETCH, k,
+ __bch2_quota_set(c, k, NULL)) ?:
+ for_each_btree_key2(trans, iter, BTREE_ID_inodes,
+ POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ bch2_fs_quota_read_inode(trans, &iter, k));
+
+ bch2_trans_put(trans);
+
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+/* Enable/disable/delete quotas for an entire filesystem: */
+
+static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct bch_sb_field_quota *sb_quota;
+ int ret = 0;
+
+ if (sb->s_flags & SB_RDONLY)
+ return -EROFS;
+
+ /* Accounting must be enabled at mount time: */
+ if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT))
+ return -EINVAL;
+
+ /* Can't enable enforcement without accounting: */
+ if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota)
+ return -EINVAL;
+
+ if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota)
+ return -EINVAL;
+
+ if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota)
+ return -EINVAL;
+
+ mutex_lock(&c->sb_lock);
+ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+ if (!sb_quota) {
+ ret = -BCH_ERR_ENOSPC_sb_quota;
+ goto unlock;
+ }
+
+ if (uflags & FS_QUOTA_UDQ_ENFD)
+ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
+
+ if (uflags & FS_QUOTA_GDQ_ENFD)
+ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true);
+
+ if (uflags & FS_QUOTA_PDQ_ENFD)
+ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
+
+ bch2_write_super(c);
+unlock:
+ mutex_unlock(&c->sb_lock);
+
+ return bch2_err_class(ret);
+}
+
+static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
+{
+ struct bch_fs *c = sb->s_fs_info;
+
+ if (sb->s_flags & SB_RDONLY)
+ return -EROFS;
+
+ mutex_lock(&c->sb_lock);
+ if (uflags & FS_QUOTA_UDQ_ENFD)
+ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false);
+
+ if (uflags & FS_QUOTA_GDQ_ENFD)
+ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false);
+
+ if (uflags & FS_QUOTA_PDQ_ENFD)
+ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false);
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+}
+
+static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ int ret;
+
+ if (sb->s_flags & SB_RDONLY)
+ return -EROFS;
+
+ if (uflags & FS_USER_QUOTA) {
+ if (c->opts.usrquota)
+ return -EINVAL;
+
+ ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
+ POS(QTYP_USR, 0),
+ POS(QTYP_USR, U64_MAX),
+ 0, NULL);
+ if (ret)
+ return ret;
+ }
+
+ if (uflags & FS_GROUP_QUOTA) {
+ if (c->opts.grpquota)
+ return -EINVAL;
+
+ ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
+ POS(QTYP_GRP, 0),
+ POS(QTYP_GRP, U64_MAX),
+ 0, NULL);
+ if (ret)
+ return ret;
+ }
+
+ if (uflags & FS_PROJ_QUOTA) {
+ if (c->opts.prjquota)
+ return -EINVAL;
+
+ ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
+ POS(QTYP_PRJ, 0),
+ POS(QTYP_PRJ, U64_MAX),
+ 0, NULL);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * Return quota status information, such as enforcements, quota file inode
+ * numbers etc.
+ */
+static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ unsigned qtypes = enabled_qtypes(c);
+ unsigned i;
+
+ memset(state, 0, sizeof(*state));
+
+ for (i = 0; i < QTYP_NR; i++) {
+ state->s_state[i].flags |= QCI_SYSFILE;
+
+ if (!(qtypes & (1 << i)))
+ continue;
+
+ state->s_state[i].flags |= QCI_ACCT_ENABLED;
+
+ state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit;
+ state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit;
+
+ state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit;
+ state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit;
+ }
+
+ return 0;
+}
+
+/*
+ * Adjust quota timers & warnings
+ */
+static int bch2_quota_set_info(struct super_block *sb, int type,
+ struct qc_info *info)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct bch_sb_field_quota *sb_quota;
+ int ret = 0;
+
+ if (0) {
+ struct printbuf buf = PRINTBUF;
+
+ qc_info_to_text(&buf, info);
+ pr_info("setting:\n%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ if (sb->s_flags & SB_RDONLY)
+ return -EROFS;
+
+ if (type >= QTYP_NR)
+ return -EINVAL;
+
+ if (!((1 << type) & enabled_qtypes(c)))
+ return -ESRCH;
+
+ if (info->i_fieldmask &
+ ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS))
+ return -EINVAL;
+
+ mutex_lock(&c->sb_lock);
+ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+ if (!sb_quota) {
+ ret = -BCH_ERR_ENOSPC_sb_quota;
+ goto unlock;
+ }
+
+ if (info->i_fieldmask & QC_SPC_TIMER)
+ sb_quota->q[type].c[Q_SPC].timelimit =
+ cpu_to_le32(info->i_spc_timelimit);
+
+ if (info->i_fieldmask & QC_SPC_WARNS)
+ sb_quota->q[type].c[Q_SPC].warnlimit =
+ cpu_to_le32(info->i_spc_warnlimit);
+
+ if (info->i_fieldmask & QC_INO_TIMER)
+ sb_quota->q[type].c[Q_INO].timelimit =
+ cpu_to_le32(info->i_ino_timelimit);
+
+ if (info->i_fieldmask & QC_INO_WARNS)
+ sb_quota->q[type].c[Q_INO].warnlimit =
+ cpu_to_le32(info->i_ino_warnlimit);
+
+ bch2_sb_quota_read(c);
+
+ bch2_write_super(c);
+unlock:
+ mutex_unlock(&c->sb_lock);
+
+ return bch2_err_class(ret);
+}
+
+/* Get/set individual quotas: */
+
+static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src)
+{
+ dst->d_space = src->c[Q_SPC].v << 9;
+ dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9;
+ dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9;
+ dst->d_spc_timer = src->c[Q_SPC].timer;
+ dst->d_spc_warns = src->c[Q_SPC].warns;
+
+ dst->d_ino_count = src->c[Q_INO].v;
+ dst->d_ino_hardlimit = src->c[Q_INO].hardlimit;
+ dst->d_ino_softlimit = src->c[Q_INO].softlimit;
+ dst->d_ino_timer = src->c[Q_INO].timer;
+ dst->d_ino_warns = src->c[Q_INO].warns;
+}
+
+static int bch2_get_quota(struct super_block *sb, struct kqid kqid,
+ struct qc_dqblk *qdq)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct bch_memquota_type *q = &c->quotas[kqid.type];
+ qid_t qid = from_kqid(&init_user_ns, kqid);
+ struct bch_memquota *mq;
+
+ memset(qdq, 0, sizeof(*qdq));
+
+ mutex_lock(&q->lock);
+ mq = genradix_ptr(&q->table, qid);
+ if (mq)
+ __bch2_quota_get(qdq, mq);
+ mutex_unlock(&q->lock);
+
+ return 0;
+}
+
+static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid,
+ struct qc_dqblk *qdq)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct bch_memquota_type *q = &c->quotas[kqid->type];
+ qid_t qid = from_kqid(&init_user_ns, *kqid);
+ struct genradix_iter iter;
+ struct bch_memquota *mq;
+ int ret = 0;
+
+ mutex_lock(&q->lock);
+
+ genradix_for_each_from(&q->table, iter, mq, qid)
+ if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) {
+ __bch2_quota_get(qdq, mq);
+ *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos);
+ goto found;
+ }
+
+ ret = -ENOENT;
+found:
+ mutex_unlock(&q->lock);
+ return bch2_err_class(ret);
+}
+
+static int bch2_set_quota_trans(struct btree_trans *trans,
+ struct bkey_i_quota *new_quota,
+ struct qc_dqblk *qdq)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ ret = bkey_err(k);
+ if (unlikely(ret))
+ return ret;
+
+ if (k.k->type == KEY_TYPE_quota)
+ new_quota->v = *bkey_s_c_to_quota(k).v;
+
+ if (qdq->d_fieldmask & QC_SPC_SOFT)
+ new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9);
+ if (qdq->d_fieldmask & QC_SPC_HARD)
+ new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9);
+
+ if (qdq->d_fieldmask & QC_INO_SOFT)
+ new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
+ if (qdq->d_fieldmask & QC_INO_HARD)
+ new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
+
+ ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int bch2_set_quota(struct super_block *sb, struct kqid qid,
+ struct qc_dqblk *qdq)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct bkey_i_quota new_quota;
+ int ret;
+
+ if (0) {
+ struct printbuf buf = PRINTBUF;
+
+ qc_dqblk_to_text(&buf, qdq);
+ pr_info("setting:\n%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ if (sb->s_flags & SB_RDONLY)
+ return -EROFS;
+
+ bkey_quota_init(&new_quota.k_i);
+ new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
+
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_set_quota_trans(trans, &new_quota, qdq)) ?:
+ __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq);
+
+ return bch2_err_class(ret);
+}
+
+const struct quotactl_ops bch2_quotactl_operations = {
+ .quota_enable = bch2_quota_enable,
+ .quota_disable = bch2_quota_disable,
+ .rm_xquota = bch2_quota_remove,
+
+ .get_state = bch2_quota_get_state,
+ .set_info = bch2_quota_set_info,
+
+ .get_dqblk = bch2_get_quota,
+ .get_nextdqblk = bch2_get_next_quota,
+ .set_dqblk = bch2_set_quota,
+};
+
+#endif /* CONFIG_BCACHEFS_QUOTA */
diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h
new file mode 100644
index 000000000000..884f601f41c4
--- /dev/null
+++ b/fs/bcachefs/quota.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_QUOTA_H
+#define _BCACHEFS_QUOTA_H
+
+#include "inode.h"
+#include "quota_types.h"
+
+enum bkey_invalid_flags;
+extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
+
+int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_quota ((struct bkey_ops) { \
+ .key_invalid = bch2_quota_invalid, \
+ .val_to_text = bch2_quota_to_text, \
+ .min_val_size = 32, \
+})
+
+static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u)
+{
+ return (struct bch_qid) {
+ .q[QTYP_USR] = u->bi_uid,
+ .q[QTYP_GRP] = u->bi_gid,
+ .q[QTYP_PRJ] = u->bi_project ? u->bi_project - 1 : 0,
+ };
+}
+
+static inline unsigned enabled_qtypes(struct bch_fs *c)
+{
+ return ((c->opts.usrquota << QTYP_USR)|
+ (c->opts.grpquota << QTYP_GRP)|
+ (c->opts.prjquota << QTYP_PRJ));
+}
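+
+/*
+ * Example (illustrative): with usrquota and prjquota enabled but not grpquota,
+ * enabled_qtypes() returns (1 << QTYP_USR)|(1 << QTYP_PRJ).
+ */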
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters,
+ s64, enum quota_acct_mode);
+
+int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid,
+ struct bch_qid, u64, enum quota_acct_mode);
+
+void bch2_fs_quota_exit(struct bch_fs *);
+void bch2_fs_quota_init(struct bch_fs *);
+int bch2_fs_quota_read(struct bch_fs *);
+
+extern const struct quotactl_ops bch2_quotactl_operations;
+
+#else
+
+static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
+ enum quota_counters counter, s64 v,
+ enum quota_acct_mode mode)
+{
+ return 0;
+}
+
+static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
+ struct bch_qid dst,
+ struct bch_qid src, u64 space,
+ enum quota_acct_mode mode)
+{
+ return 0;
+}
+
+static inline void bch2_fs_quota_exit(struct bch_fs *c) {}
+static inline void bch2_fs_quota_init(struct bch_fs *c) {}
+static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; }
+
+#endif
+
+#endif /* _BCACHEFS_QUOTA_H */
diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h
new file mode 100644
index 000000000000..6a136083d389
--- /dev/null
+++ b/fs/bcachefs/quota_types.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_QUOTA_TYPES_H
+#define _BCACHEFS_QUOTA_TYPES_H
+
+#include <linux/generic-radix-tree.h>
+
+struct bch_qid {
+ u32 q[QTYP_NR];
+};
+
+enum quota_acct_mode {
+ KEY_TYPE_QUOTA_PREALLOC,
+ KEY_TYPE_QUOTA_WARN,
+ KEY_TYPE_QUOTA_NOCHECK,
+};
+
+struct memquota_counter {
+ u64 v;
+ u64 hardlimit;
+ u64 softlimit;
+ s64 timer;
+ int warns;
+ int warning_issued;
+};
+
+struct bch_memquota {
+ struct memquota_counter c[Q_COUNTERS];
+};
+
+typedef GENRADIX(struct bch_memquota) bch_memquota_table;
+
+struct quota_limit {
+ u32 timelimit;
+ u32 warnlimit;
+};
+
+struct bch_memquota_type {
+ struct quota_limit limits[Q_COUNTERS];
+ bch_memquota_table table;
+ struct mutex lock;
+};
+
+#endif /* _BCACHEFS_QUOTA_TYPES_H */
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
new file mode 100644
index 000000000000..3319190b8d9c
--- /dev/null
+++ b/fs/bcachefs/rebalance.c
@@ -0,0 +1,464 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "clock.h"
+#include "compress.h"
+#include "disk_groups.h"
+#include "errcode.h"
+#include "error.h"
+#include "inode.h"
+#include "move.h"
+#include "rebalance.h"
+#include "subvolume.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/sched/cputime.h>
+
+#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
+
+static const char * const bch2_rebalance_state_strs[] = {
+#define x(t) #t,
+ BCH_REBALANCE_STATES()
+ NULL
+#undef x
+};
+
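+/*
+ * Request a rebalance scan of inode @inum (or of the whole filesystem if @inum
+ * is 0) by bumping a cookie stored at a reserved offset in the rebalance_work
+ * btree; the cookie is deleted again once the scan it requested has completed,
+ * unless it was bumped again in the meantime.
+ */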
+static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_cookie *cookie;
+ u64 v;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
+ SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ v = k.k->type == KEY_TYPE_cookie
+ ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
+ : 0;
+
+ cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
+ ret = PTR_ERR_OR_ZERO(cookie);
+ if (ret)
+ goto err;
+
+ bkey_cookie_init(&cookie->k_i);
+ cookie->k.p = iter.pos;
+ cookie->v.cookie = cpu_to_le64(v + 1);
+
+ ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
+{
+ int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ __bch2_set_rebalance_needs_scan(trans, inum));
+ rebalance_wakeup(c);
+ return ret;
+}
+
+int bch2_set_fs_needs_rebalance(struct bch_fs *c)
+{
+ return bch2_set_rebalance_needs_scan(c, 0);
+}
+
+static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 v;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
+ SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ v = k.k->type == KEY_TYPE_cookie
+ ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
+ : 0;
+
+ if (v == cookie)
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
+ struct btree_iter *work_iter)
+{
+ return !kthread_should_stop()
+ ? bch2_btree_iter_peek(work_iter)
+ : bkey_s_c_null;
+}
+
+static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ extent_entry_drop(bkey_i_to_s(n),
+ (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
+ return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+}
+
+static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
+ struct bpos work_pos,
+ struct btree_iter *extent_iter,
+ struct data_update_opts *data_opts)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+
+ bch2_trans_iter_exit(trans, extent_iter);
+ bch2_trans_iter_init(trans, extent_iter,
+ work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
+ work_pos,
+ BTREE_ITER_ALL_SNAPSHOTS);
+ k = bch2_btree_iter_peek_slot(extent_iter);
+ if (bkey_err(k))
+ return k;
+
+ const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL;
+ if (!r) {
+ /* raced due to btree write buffer, nothing to do */
+ return bkey_s_c_null;
+ }
+
+ memset(data_opts, 0, sizeof(*data_opts));
+
+ data_opts->rewrite_ptrs =
+ bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression);
+ data_opts->target = r->target;
+
+ if (!data_opts->rewrite_ptrs) {
+ /*
+ * device we would want to write to offline? devices in target
+ * changed?
+ *
+ * We'll now need a full scan before this extent is picked up
+ * again:
+ */
+ int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
+ if (ret)
+ return bkey_s_c_err(ret);
+ return bkey_s_c_null;
+ }
+
+ return k;
+}
+
+noinline_for_stack
+static int do_rebalance_extent(struct moving_context *ctxt,
+ struct bpos work_pos,
+ struct btree_iter *extent_iter)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct bch_fs_rebalance *r = &trans->c->rebalance;
+ struct data_update_opts data_opts;
+ struct bch_io_opts io_opts;
+ struct bkey_s_c k;
+ struct bkey_buf sk;
+ int ret;
+
+ ctxt->stats = &r->work_stats;
+ r->state = BCH_REBALANCE_working;
+
+ bch2_bkey_buf_init(&sk);
+
+ ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
+ extent_iter, &data_opts));
+ if (ret || !k.k)
+ goto out;
+
+ ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
+ if (ret)
+ goto out;
+
+ atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+
+ /*
+ * The iterator gets unlocked by __bch2_read_extent - need to
+ * save a copy of @k elsewhere:
+ */
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
+
+ ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
+ if (ret) {
+ if (bch2_err_matches(ret, ENOMEM)) {
+ /* memory allocation failure, wait for some IO to finish */
+ bch2_move_ctxt_wait_for_io(ctxt);
+ ret = -BCH_ERR_transaction_restart_nested;
+ }
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto out;
+
+ /* skip it and continue, XXX signal failure */
+ ret = 0;
+ }
+out:
+ bch2_bkey_buf_exit(&sk, c);
+ return ret;
+}
+
+static bool rebalance_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ unsigned target, compression;
+
+ if (k.k->p.inode) {
+ target = io_opts->background_target;
+ compression = io_opts->background_compression ?: io_opts->compression;
+ } else {
+ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+
+ target = r ? r->target : io_opts->background_target;
+ compression = r ? r->compression :
+ (io_opts->background_compression ?: io_opts->compression);
+ }
+
+ data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
+ data_opts->target = target;
+ return data_opts->rewrite_ptrs != 0;
+}
+
+static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs_rebalance *r = &trans->c->rebalance;
+ int ret;
+
+ bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
+ ctxt->stats = &r->scan_stats;
+
+ if (!inum) {
+ r->scan_start = BBPOS_MIN;
+ r->scan_end = BBPOS_MAX;
+ } else {
+ r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0));
+ r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
+ }
+
+ r->state = BCH_REBALANCE_scanning;
+
+ ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
+ commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_clear_rebalance_needs_scan(trans, inum, cookie));
+
+ bch2_move_stats_exit(&r->scan_stats, trans->c);
+ return ret;
+}
+
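+/*
+ * No rebalance work: wait on the write io clock until roughly 1/64th of the
+ * smallest rw member device's capacity has been written, then check for new
+ * work.
+ */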
+static void rebalance_wait(struct bch_fs *c)
+{
+ struct bch_fs_rebalance *r = &c->rebalance;
+ struct io_clock *clock = &c->io_clock[WRITE];
+ u64 now = atomic64_read(&clock->now);
+ u64 min_member_capacity = bch2_min_rw_member_capacity(c);
+
+ if (min_member_capacity == U64_MAX)
+ min_member_capacity = 128 * 2048;
+
+ r->wait_iotime_end = now + (min_member_capacity >> 6);
+
+ if (r->state != BCH_REBALANCE_waiting) {
+ r->wait_iotime_start = now;
+ r->wait_wallclock_start = ktime_get_real_ns();
+ r->state = BCH_REBALANCE_waiting;
+ }
+
+ bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
+}
+
+static int do_rebalance(struct moving_context *ctxt)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct bch_fs_rebalance *r = &c->rebalance;
+ struct btree_iter rebalance_work_iter, extent_iter = { NULL };
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch2_move_stats_init(&r->work_stats, "rebalance_work");
+ bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
+
+ bch2_trans_iter_init(trans, &rebalance_work_iter,
+ BTREE_ID_rebalance_work, POS_MIN,
+ BTREE_ITER_ALL_SNAPSHOTS);
+
+ while (!bch2_move_ratelimit(ctxt) &&
+ !kthread_wait_freezable(r->enabled)) {
+ bch2_trans_begin(trans);
+
+ ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret || !k.k)
+ break;
+
+ ret = k.k->type == KEY_TYPE_cookie
+ ? do_rebalance_scan(ctxt, k.k->p.inode,
+ le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
+ : do_rebalance_extent(ctxt, k.k->p, &extent_iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+
+ bch2_btree_iter_advance(&rebalance_work_iter);
+ }
+
+ bch2_trans_iter_exit(trans, &extent_iter);
+ bch2_trans_iter_exit(trans, &rebalance_work_iter);
+ bch2_move_stats_exit(&r->scan_stats, c);
+
+ if (!ret &&
+ !kthread_should_stop() &&
+ !atomic64_read(&r->work_stats.sectors_seen) &&
+ !atomic64_read(&r->scan_stats.sectors_seen)) {
+ bch2_trans_unlock_long(trans);
+ rebalance_wait(c);
+ }
+
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int bch2_rebalance_thread(void *arg)
+{
+ struct bch_fs *c = arg;
+ struct bch_fs_rebalance *r = &c->rebalance;
+ struct moving_context ctxt;
+ int ret;
+
+ set_freezable();
+
+ bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
+ writepoint_ptr(&c->rebalance_write_point),
+ true);
+
+ while (!kthread_should_stop() &&
+ !(ret = do_rebalance(&ctxt)))
+ ;
+
+ bch2_moving_ctxt_exit(&ctxt);
+
+ return 0;
+}
+
+void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct bch_fs_rebalance *r = &c->rebalance;
+
+ prt_str(out, bch2_rebalance_state_strs[r->state]);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ switch (r->state) {
+ case BCH_REBALANCE_waiting: {
+ u64 now = atomic64_read(&c->io_clock[WRITE].now);
+
+ prt_str(out, "io wait duration: ");
+ bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start);
+ prt_newline(out);
+
+ prt_str(out, "io wait remaining: ");
+ bch2_prt_human_readable_s64(out, r->wait_iotime_end - now);
+ prt_newline(out);
+
+ prt_str(out, "duration waited: ");
+ bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
+ prt_newline(out);
+ break;
+ }
+ case BCH_REBALANCE_working:
+ bch2_move_stats_to_text(out, &r->work_stats);
+ break;
+ case BCH_REBALANCE_scanning:
+ bch2_move_stats_to_text(out, &r->scan_stats);
+ break;
+ }
+ prt_newline(out);
+ printbuf_indent_sub(out, 2);
+}
+
+void bch2_rebalance_stop(struct bch_fs *c)
+{
+ struct task_struct *p;
+
+ c->rebalance.pd.rate.rate = UINT_MAX;
+ bch2_ratelimit_reset(&c->rebalance.pd.rate);
+
+ p = rcu_dereference_protected(c->rebalance.thread, 1);
+ c->rebalance.thread = NULL;
+
+ if (p) {
+ /* for synchronizing with rebalance_wakeup() */
+ synchronize_rcu();
+
+ kthread_stop(p);
+ put_task_struct(p);
+ }
+}
+
+int bch2_rebalance_start(struct bch_fs *c)
+{
+ struct task_struct *p;
+ int ret;
+
+ if (c->rebalance.thread)
+ return 0;
+
+ if (c->opts.nochanges)
+ return 0;
+
+ p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
+ ret = PTR_ERR_OR_ZERO(p);
+ if (ret) {
+ bch_err_msg(c, ret, "creating rebalance thread");
+ return ret;
+ }
+
+ get_task_struct(p);
+ rcu_assign_pointer(c->rebalance.thread, p);
+ wake_up_process(p);
+ return 0;
+}
+
+void bch2_fs_rebalance_init(struct bch_fs *c)
+{
+ bch2_pd_controller_init(&c->rebalance.pd);
+}
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
new file mode 100644
index 000000000000..28a52638f16c
--- /dev/null
+++ b/fs/bcachefs/rebalance.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REBALANCE_H
+#define _BCACHEFS_REBALANCE_H
+
+#include "rebalance_types.h"
+
+int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
+int bch2_set_fs_needs_rebalance(struct bch_fs *);
+
+static inline void rebalance_wakeup(struct bch_fs *c)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = rcu_dereference(c->rebalance.thread);
+ if (p)
+ wake_up_process(p);
+ rcu_read_unlock();
+}
+
+void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
+
+void bch2_rebalance_stop(struct bch_fs *);
+int bch2_rebalance_start(struct bch_fs *);
+void bch2_fs_rebalance_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_REBALANCE_H */
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
new file mode 100644
index 000000000000..0fffb536c1d0
--- /dev/null
+++ b/fs/bcachefs/rebalance_types.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REBALANCE_TYPES_H
+#define _BCACHEFS_REBALANCE_TYPES_H
+
+#include "bbpos_types.h"
+#include "move_types.h"
+
+#define BCH_REBALANCE_STATES() \
+ x(waiting) \
+ x(working) \
+ x(scanning)
+
+enum bch_rebalance_states {
+#define x(t) BCH_REBALANCE_##t,
+ BCH_REBALANCE_STATES()
+#undef x
+};
+
+struct bch_fs_rebalance {
+ struct task_struct __rcu *thread;
+ struct bch_pd_controller pd;
+
+ enum bch_rebalance_states state;
+ u64 wait_iotime_start;
+ u64 wait_iotime_end;
+ u64 wait_wallclock_start;
+
+ struct bch_move_stats work_stats;
+
+ struct bbpos scan_start;
+ struct bbpos scan_end;
+ struct bch_move_stats scan_stats;
+
+ unsigned enabled:1;
+};
+
+#endif /* _BCACHEFS_REBALANCE_TYPES_H */
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
new file mode 100644
index 000000000000..5cf7d0532002
--- /dev/null
+++ b/fs/bcachefs/recovery.c
@@ -0,0 +1,1157 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "backpointers.h"
+#include "bkey_buf.h"
+#include "alloc_background.h"
+#include "btree_gc.h"
+#include "btree_journal_iter.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "buckets.h"
+#include "dirent.h"
+#include "ec.h"
+#include "errcode.h"
+#include "error.h"
+#include "fs-common.h"
+#include "fsck.h"
+#include "journal_io.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "lru.h"
+#include "logged_ops.h"
+#include "move.h"
+#include "quota.h"
+#include "rebalance.h"
+#include "recovery.h"
+#include "replicas.h"
+#include "sb-clean.h"
+#include "sb-downgrade.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "super-io.h"
+
+#include <linux/sort.h>
+#include <linux/stat.h>
+
+#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+
+static bool btree_id_is_alloc(enum btree_id id)
+{
+ switch (id) {
+ case BTREE_ID_alloc:
+ case BTREE_ID_backpointers:
+ case BTREE_ID_need_discard:
+ case BTREE_ID_freespace:
+ case BTREE_ID_bucket_gens:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/* for -o reconstruct_alloc: */
+static void drop_alloc_keys(struct journal_keys *keys)
+{
+ size_t src, dst;
+
+ for (src = 0, dst = 0; src < keys->nr; src++)
+ if (!btree_id_is_alloc(keys->d[src].btree_id))
+ keys->d[dst++] = keys->d[src];
+
+ keys->nr = dst;
+}
+
+/*
+ * Btree node pointers have a field to stack a pointer to the in memory btree
+ * node; we need to zero out this field when reading in btree nodes, or when
+ * reading in keys from the journal:
+ */
+static void zero_out_btree_mem_ptr(struct journal_keys *keys)
+{
+ struct journal_key *i;
+
+ for (i = keys->d; i < keys->d + keys->nr; i++)
+ if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
+ bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
+}
+
+/* journal replay: */
+
+static void replay_now_at(struct journal *j, u64 seq)
+{
+ BUG_ON(seq < j->replay_journal_seq);
+
+ seq = min(seq, j->replay_journal_seq_end);
+
+ while (j->replay_journal_seq < seq)
+ bch2_journal_pin_put(j, j->replay_journal_seq++);
+}
+
+static int bch2_journal_replay_key(struct btree_trans *trans,
+ struct journal_key *k)
+{
+ struct btree_iter iter;
+ unsigned iter_flags =
+ BTREE_ITER_INTENT|
+ BTREE_ITER_NOT_EXTENTS;
+ unsigned update_flags = BTREE_TRIGGER_NORUN;
+ int ret;
+
+ /*
+ * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to
+ * keep the key cache coherent with the underlying btree. Nothing
+ * besides the allocator is doing updates yet so we don't need key cache
+ * coherency for non-alloc btrees, and key cache fills for snapshots
+ * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until
+ * the snapshots recovery pass runs.
+ */
+ if (!k->level && k->btree_id == BTREE_ID_alloc)
+ iter_flags |= BTREE_ITER_CACHED;
+ else
+ update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM;
+
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ BTREE_MAX_DEPTH, k->level,
+ iter_flags);
+ ret = bch2_btree_iter_traverse(&iter);
+ if (ret)
+ goto out;
+
+ /* Must be checked with btree locked: */
+ if (k->overwritten)
+ goto out;
+
+ ret = bch2_trans_update(trans, &iter, k->k, update_flags);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int journal_sort_seq_cmp(const void *_l, const void *_r)
+{
+ const struct journal_key *l = *((const struct journal_key **)_l);
+ const struct journal_key *r = *((const struct journal_key **)_r);
+
+ return cmp_int(l->journal_seq, r->journal_seq);
+}
+
+static int bch2_journal_replay(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_key **keys_sorted, *k;
+ struct journal *j = &c->journal;
+ u64 start_seq = c->journal_replay_seq_start;
+ u64 end_seq = c->journal_replay_seq_end;
+ size_t i;
+ int ret = 0;
+
+ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+ keys->gap = keys->nr;
+
+ keys_sorted = kvmalloc_array(keys->nr, sizeof(*keys_sorted), GFP_KERNEL);
+ if (!keys_sorted)
+ return -BCH_ERR_ENOMEM_journal_replay;
+
+ for (i = 0; i < keys->nr; i++)
+ keys_sorted[i] = &keys->d[i];
+
+ sort(keys_sorted, keys->nr,
+ sizeof(keys_sorted[0]),
+ journal_sort_seq_cmp, NULL);
+
+ if (keys->nr) {
+ ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
+ keys->nr, start_seq, end_seq);
+ if (ret)
+ goto err;
+ }
+
+ BUG_ON(!atomic_read(&keys->ref));
+
+ for (i = 0; i < keys->nr; i++) {
+ k = keys_sorted[i];
+
+ cond_resched();
+
+ replay_now_at(j, k->journal_seq);
+
+ ret = bch2_trans_do(c, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL|
+ (!k->allocated
+ ? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim
+ : 0),
+ bch2_journal_replay_key(trans, k));
+ if (ret) {
+ bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s",
+ bch2_btree_id_str(k->btree_id), k->level, bch2_err_str(ret));
+ goto err;
+ }
+ }
+
+ if (!c->opts.keep_journal)
+ bch2_journal_keys_put_initial(c);
+
+ replay_now_at(j, j->replay_journal_seq_end);
+ j->replay_journal_seq = 0;
+
+ bch2_journal_set_replay_done(j);
+ bch2_journal_flush_all_pins(j);
+ ret = bch2_journal_error(j);
+
+ if (keys->nr && !ret)
+ bch2_journal_log_msg(c, "journal replay finished");
+err:
+ kvfree(keys_sorted);
+
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
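
The function above sorts an array of pointers to the journal keys by the sequence number of the journal entry each key came from, then replays them in that order so replay_now_at() can release journal pins as replay advances. The sketch below shows only that sort-then-replay shape in plain userspace C, assuming nothing beyond the standard library; the struct fields, names and the printf stand-in for the btree update are illustrative, not the kernel API.

    /* Standalone sketch: replay journal keys in sequence order (illustrative only). */
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    struct demo_journal_key {
        uint64_t journal_seq;   /* sequence number of the entry the key came from */
        const char *what;       /* stand-in for the actual btree update */
    };

    static int seq_cmp(const void *l, const void *r)
    {
        const struct demo_journal_key *a = l, *b = r;

        return (a->journal_seq > b->journal_seq) - (a->journal_seq < b->journal_seq);
    }

    int main(void)
    {
        struct demo_journal_key keys[] = {
            { 12, "update inode 42" },
            { 10, "create inode 42" },
            { 11, "alloc bucket 7"  },
        };
        size_t nr = sizeof(keys) / sizeof(keys[0]);

        /* Sort by journal_seq, then "replay" in that order: */
        qsort(keys, nr, sizeof(keys[0]), seq_cmp);

        for (size_t i = 0; i < nr; i++)
            printf("replaying seq %llu: %s\n",
                   (unsigned long long) keys[i].journal_seq, keys[i].what);
        return 0;
    }

Replaying in sequence order is what allows the pin on each journal entry to be dropped as soon as the last key from that entry has been applied.
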
+
+/* journal replay early: */
+
+static int journal_replay_entry_early(struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ int ret = 0;
+
+ switch (entry->type) {
+ case BCH_JSET_ENTRY_btree_root: {
+ struct btree_root *r;
+
+ while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) {
+ ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL });
+ if (ret)
+ return ret;
+ }
+
+ r = bch2_btree_id_root(c, entry->btree_id);
+
+ if (entry->u64s) {
+ r->level = entry->level;
+ bkey_copy(&r->key, (struct bkey_i *) entry->start);
+ r->error = 0;
+ } else {
+ r->error = -EIO;
+ }
+ r->alive = true;
+ break;
+ }
+ case BCH_JSET_ENTRY_usage: {
+ struct jset_entry_usage *u =
+ container_of(entry, struct jset_entry_usage, entry);
+
+ switch (entry->btree_id) {
+ case BCH_FS_USAGE_reserved:
+ if (entry->level < BCH_REPLICAS_MAX)
+ c->usage_base->persistent_reserved[entry->level] =
+ le64_to_cpu(u->v);
+ break;
+ case BCH_FS_USAGE_inodes:
+ c->usage_base->nr_inodes = le64_to_cpu(u->v);
+ break;
+ case BCH_FS_USAGE_key_version:
+ atomic64_set(&c->key_version,
+ le64_to_cpu(u->v));
+ break;
+ }
+
+ break;
+ }
+ case BCH_JSET_ENTRY_data_usage: {
+ struct jset_entry_data_usage *u =
+ container_of(entry, struct jset_entry_data_usage, entry);
+
+ ret = bch2_replicas_set_usage(c, &u->r,
+ le64_to_cpu(u->v));
+ break;
+ }
+ case BCH_JSET_ENTRY_dev_usage: {
+ struct jset_entry_dev_usage *u =
+ container_of(entry, struct jset_entry_dev_usage, entry);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
+ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
+
+ ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
+
+ for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
+ ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
+ ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
+ ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
+ }
+
+ break;
+ }
+ case BCH_JSET_ENTRY_blacklist: {
+ struct jset_entry_blacklist *bl_entry =
+ container_of(entry, struct jset_entry_blacklist, entry);
+
+ ret = bch2_journal_seq_blacklist_add(c,
+ le64_to_cpu(bl_entry->seq),
+ le64_to_cpu(bl_entry->seq) + 1);
+ break;
+ }
+ case BCH_JSET_ENTRY_blacklist_v2: {
+ struct jset_entry_blacklist_v2 *bl_entry =
+ container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+ ret = bch2_journal_seq_blacklist_add(c,
+ le64_to_cpu(bl_entry->start),
+ le64_to_cpu(bl_entry->end) + 1);
+ break;
+ }
+ case BCH_JSET_ENTRY_clock: {
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+
+ atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
+ }
+ }
+
+ return ret;
+}
+
+static int journal_replay_early(struct bch_fs *c,
+ struct bch_sb_field_clean *clean)
+{
+ struct jset_entry *entry;
+ int ret;
+
+ if (clean) {
+ for (entry = clean->start;
+ entry != vstruct_end(&clean->field);
+ entry = vstruct_next(entry)) {
+ ret = journal_replay_entry_early(c, entry);
+ if (ret)
+ return ret;
+ }
+ } else {
+ struct genradix_iter iter;
+ struct journal_replay *i, **_i;
+
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ vstruct_for_each(&i->j, entry) {
+ ret = journal_replay_entry_early(c, entry);
+ if (ret)
+ return ret;
+ }
+ }
+ }
+
+ bch2_fs_usage_initialize(c);
+
+ return 0;
+}
+
+/* sb clean section: */
+
+static int read_btree_roots(struct bch_fs *c)
+{
+ unsigned i;
+ int ret = 0;
+
+ for (i = 0; i < btree_id_nr_alive(c); i++) {
+ struct btree_root *r = bch2_btree_id_root(c, i);
+
+ if (!r->alive)
+ continue;
+
+ if (btree_id_is_alloc(i) &&
+ c->opts.reconstruct_alloc) {
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+ continue;
+ }
+
+ if (r->error) {
+ __fsck_err(c,
+ btree_id_is_alloc(i)
+ ? FSCK_CAN_IGNORE : 0,
+ btree_root_bkey_invalid,
+ "invalid btree root %s",
+ bch2_btree_id_str(i));
+ if (i == BTREE_ID_alloc)
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+ }
+
+ ret = bch2_btree_root_read(c, i, &r->key, r->level);
+ if (ret) {
+ fsck_err(c,
+ btree_root_read_error,
+ "error reading btree root %s",
+ bch2_btree_id_str(i));
+ if (btree_id_is_alloc(i))
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+ ret = 0;
+ }
+ }
+
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ struct btree_root *r = bch2_btree_id_root(c, i);
+
+ if (!r->b) {
+ r->alive = false;
+ r->level = 0;
+ bch2_btree_root_alloc(c, i);
+ }
+ }
+fsck_err:
+ return ret;
+}
+
+static int bch2_initialize_subvolumes(struct bch_fs *c)
+{
+ struct bkey_i_snapshot_tree root_tree;
+ struct bkey_i_snapshot root_snapshot;
+ struct bkey_i_subvolume root_volume;
+ int ret;
+
+ bkey_snapshot_tree_init(&root_tree.k_i);
+ root_tree.k.p.offset = 1;
+ root_tree.v.master_subvol = cpu_to_le32(1);
+ root_tree.v.root_snapshot = cpu_to_le32(U32_MAX);
+
+ bkey_snapshot_init(&root_snapshot.k_i);
+ root_snapshot.k.p.offset = U32_MAX;
+ root_snapshot.v.flags = 0;
+ root_snapshot.v.parent = 0;
+ root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
+ root_snapshot.v.tree = cpu_to_le32(1);
+ SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
+
+ bkey_subvolume_init(&root_volume.k_i);
+ root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_volume.v.flags = 0;
+ root_volume.v.snapshot = cpu_to_le32(U32_MAX);
+ root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
+
+ ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?:
+ bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?:
+ bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0);
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_inode_unpacked inode;
+ int ret;
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (!bkey_is_inode(k.k)) {
+ bch_err(trans->c, "root inode not found");
+ ret = -BCH_ERR_ENOENT_inode;
+ goto err;
+ }
+
+ ret = bch2_inode_unpack(k, &inode);
+ BUG_ON(ret);
+
+ inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
+
+ ret = bch2_inode_write(trans, &iter, &inode);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/* set bi_subvol on root inode */
+noinline_for_stack
+static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
+{
+ int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
+ __bch2_fs_upgrade_for_subvolumes(trans));
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+const char * const bch2_recovery_passes[] = {
+#define x(_fn, ...) #_fn,
+ BCH_RECOVERY_PASSES()
+#undef x
+ NULL
+};
+
+static int bch2_check_allocations(struct bch_fs *c)
+{
+ return bch2_gc(c, true, c->opts.norecovery);
+}
+
+static int bch2_set_may_go_rw(struct bch_fs *c)
+{
+ set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+ return 0;
+}
+
+struct recovery_pass_fn {
+ int (*fn)(struct bch_fs *);
+ unsigned when;
+};
+
+static struct recovery_pass_fn recovery_pass_fns[] = {
+#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when },
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+u64 bch2_recovery_passes_to_stable(u64 v)
+{
+ static const u8 map[] = {
+#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+ };
+
+ u64 ret = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
+ if (v & BIT_ULL(i))
+ ret |= BIT_ULL(map[i]);
+ return ret;
+}
+
+u64 bch2_recovery_passes_from_stable(u64 v)
+{
+ static const u8 map[] = {
+#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+ };
+
+ u64 ret = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
+ if (v & BIT_ULL(i))
+ ret |= BIT_ULL(map[i]);
+ return ret;
+}
+
+static bool check_version_upgrade(struct bch_fs *c)
+{
+ unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version);
+ unsigned latest_version = bcachefs_metadata_version_current;
+ unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
+ unsigned new_version = 0;
+
+ if (old_version < bcachefs_metadata_required_upgrade_below) {
+ if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible ||
+ latest_compatible < bcachefs_metadata_required_upgrade_below)
+ new_version = latest_version;
+ else
+ new_version = latest_compatible;
+ } else {
+ switch (c->opts.version_upgrade) {
+ case BCH_VERSION_UPGRADE_compatible:
+ new_version = latest_compatible;
+ break;
+ case BCH_VERSION_UPGRADE_incompatible:
+ new_version = latest_version;
+ break;
+ case BCH_VERSION_UPGRADE_none:
+ new_version = old_version;
+ break;
+ }
+ }
+
+ if (new_version > old_version) {
+ struct printbuf buf = PRINTBUF;
+
+ if (old_version < bcachefs_metadata_required_upgrade_below)
+ prt_str(&buf, "Version upgrade required:\n");
+
+ if (old_version != c->sb.version) {
+ prt_str(&buf, "Version upgrade from ");
+ bch2_version_to_text(&buf, c->sb.version_upgrade_complete);
+ prt_str(&buf, " to ");
+ bch2_version_to_text(&buf, c->sb.version);
+ prt_str(&buf, " incomplete\n");
+ }
+
+ prt_printf(&buf, "Doing %s version upgrade from ",
+ BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version)
+ ? "incompatible" : "compatible");
+ bch2_version_to_text(&buf, old_version);
+ prt_str(&buf, " to ");
+ bch2_version_to_text(&buf, new_version);
+ prt_newline(&buf);
+
+ u64 recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version);
+ if (recovery_passes) {
+ if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK)
+ prt_str(&buf, "fsck required");
+ else {
+ prt_str(&buf, "running recovery passes: ");
+ prt_bitflags(&buf, bch2_recovery_passes, recovery_passes);
+ }
+
+ c->recovery_passes_explicit |= recovery_passes;
+ c->opts.fix_errors = FSCK_FIX_yes;
+ }
+
+ bch_info(c, "%s", buf.buf);
+
+ bch2_sb_upgrade(c, new_version);
+
+ printbuf_exit(&buf);
+ return true;
+ }
+
+ return false;
+}
+
+u64 bch2_fsck_recovery_passes(void)
+{
+ u64 ret = 0;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
+ if (recovery_pass_fns[i].when & PASS_FSCK)
+ ret |= BIT_ULL(i);
+ return ret;
+}
+
+static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+ struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass;
+
+ if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read)
+ return false;
+ if (c->recovery_passes_explicit & BIT_ULL(pass))
+ return true;
+ if ((p->when & PASS_FSCK) && c->opts.fsck)
+ return true;
+ if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
+ return true;
+ if (p->when & PASS_ALWAYS)
+ return true;
+ return false;
+}
+
+static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+ int ret;
+
+ c->curr_recovery_pass = pass;
+
+ if (should_run_recovery_pass(c, pass)) {
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
+
+ if (!(p->when & PASS_SILENT))
+ printk(KERN_INFO bch2_log_msg(c, "%s..."),
+ bch2_recovery_passes[pass]);
+ ret = p->fn(c);
+ if (ret)
+ return ret;
+ if (!(p->when & PASS_SILENT))
+ printk(KERN_CONT " done\n");
+
+ c->recovery_passes_complete |= BIT_ULL(pass);
+ }
+
+ return 0;
+}
+
+static int bch2_run_recovery_passes(struct bch_fs *c)
+{
+ int ret = 0;
+
+ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
+ ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
+ if (bch2_err_matches(ret, BCH_ERR_restart_recovery))
+ continue;
+ if (ret)
+ break;
+ c->curr_recovery_pass++;
+ }
+
+ return ret;
+}
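
The two functions above form the pass scheduler: should_run_recovery_pass() gates each pass on its PASS_* flags and the explicit-passes bitmask, and bch2_run_recovery_passes() walks the table, re-running the current pass when it returns a restart_recovery error (bch2_run_explicit_recovery_pass() in recovery.h may also have rewound curr_recovery_pass by then). Below is a compact standalone sketch of that control flow with invented pass names and flag values; it only models re-running the current slot, not the full rewind.

    /* Illustrative sketch of a gated, restartable pass runner; not the kernel code. */
    #include <stdio.h>
    #include <stdbool.h>
    #include <stddef.h>

    #define DEMO_PASS_ALWAYS  (1u << 0)
    #define DEMO_PASS_FSCK    (1u << 1)
    #define DEMO_RESTART      (-1000)   /* stand-in for -BCH_ERR_restart_recovery */

    struct demo_pass {
        const char *name;
        unsigned when;
        int (*fn)(void);
    };

    static int demo_ok(void) { return 0; }

    static int demo_restart_once(void)
    {
        static bool restarted;

        if (!restarted) {
            restarted = true;
            return DEMO_RESTART;        /* ask the runner to run this pass again */
        }
        return 0;
    }

    static const struct demo_pass demo_passes[] = {
        { "alloc_read",        DEMO_PASS_ALWAYS, demo_ok },
        { "check_allocations", DEMO_PASS_FSCK,   demo_restart_once },
        { "journal_replay",    DEMO_PASS_ALWAYS, demo_ok },
    };

    int main(void)
    {
        bool fsck = true;               /* pretend fsck was requested */
        size_t curr = 0;

        while (curr < sizeof(demo_passes) / sizeof(demo_passes[0])) {
            const struct demo_pass *p = &demo_passes[curr];
            bool run = (p->when & DEMO_PASS_ALWAYS) ||
                       ((p->when & DEMO_PASS_FSCK) && fsck);

            if (run) {
                printf("%s...\n", p->name);
                int ret = p->fn();
                if (ret == DEMO_RESTART)
                    continue;           /* rerun without advancing, like the kernel loop */
                if (ret)
                    return 1;
                printf(" done\n");
            }
            curr++;
        }
        return 0;
    }
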
+
+int bch2_fs_recovery(struct bch_fs *c)
+{
+ struct bch_sb_field_clean *clean = NULL;
+ struct jset *last_journal_entry = NULL;
+ u64 last_seq = 0, blacklist_seq, journal_seq;
+ int ret = 0;
+
+ if (c->sb.clean) {
+ clean = bch2_read_superblock_clean(c);
+ ret = PTR_ERR_OR_ZERO(clean);
+ if (ret)
+ goto err;
+
+ bch_info(c, "recovering from clean shutdown, journal seq %llu",
+ le64_to_cpu(clean->journal_seq));
+ } else {
+ bch_info(c, "recovering from unclean shutdown");
+ }
+
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
+ bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (!c->sb.clean &&
+ !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
+ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (c->opts.fsck && c->opts.norecovery) {
+ bch_err(c, "cannot select both norecovery and fsck");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (!(c->opts.nochanges && c->opts.norecovery)) {
+ mutex_lock(&c->sb_lock);
+ bool write_sb = false;
+
+ struct bch_sb_field_ext *ext =
+ bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
+ if (!ext) {
+ ret = -BCH_ERR_ENOSPC_sb;
+ mutex_unlock(&c->sb_lock);
+ goto err;
+ }
+
+ if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
+ ext->recovery_passes_required[0] |=
+ cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
+ write_sb = true;
+ }
+
+ u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ if (sb_passes) {
+ struct printbuf buf = PRINTBUF;
+ prt_str(&buf, "superblock requires following recovery passes to be run:\n ");
+ prt_bitflags(&buf, bch2_recovery_passes, sb_passes);
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ if (bch2_check_version_downgrade(c)) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "Version downgrade required:\n");
+
+ __le64 passes = ext->recovery_passes_required[0];
+ bch2_sb_set_downgrade(c,
+ BCH_VERSION_MINOR(bcachefs_metadata_version_current),
+ BCH_VERSION_MINOR(c->sb.version));
+ passes = ext->recovery_passes_required[0] & ~passes;
+ if (passes) {
+ prt_str(&buf, " running recovery passes: ");
+ prt_bitflags(&buf, bch2_recovery_passes,
+ bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
+ }
+
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ write_sb = true;
+ }
+
+ if (check_version_upgrade(c))
+ write_sb = true;
+
+ if (write_sb)
+ bch2_write_super(c);
+
+ c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ mutex_unlock(&c->sb_lock);
+ }
+
+ if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
+
+ ret = bch2_blacklist_table_initialize(c);
+ if (ret) {
+ bch_err(c, "error initializing blacklist table");
+ goto err;
+ }
+
+ if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
+ struct genradix_iter iter;
+ struct journal_replay **i;
+
+ bch_verbose(c, "starting journal read");
+ ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq);
+ if (ret)
+ goto err;
+
+ /*
+ * note: cmd_list_journal needs the blacklist table fully up to date so
+ * it can asterisk ignored journal entries:
+ */
+ if (c->opts.read_journal_only)
+ goto out;
+
+ genradix_for_each_reverse(&c->journal_entries, iter, i)
+ if (*i && !(*i)->ignore) {
+ last_journal_entry = &(*i)->j;
+ break;
+ }
+
+ if (mustfix_fsck_err_on(c->sb.clean &&
+ last_journal_entry &&
+ !journal_entry_empty(last_journal_entry), c,
+ clean_but_journal_not_empty,
+ "filesystem marked clean but journal not empty")) {
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+ c->sb.clean = false;
+ }
+
+ if (!last_journal_entry) {
+ fsck_err_on(!c->sb.clean, c,
+ dirty_but_no_journal_entries,
+ "no journal entries found");
+ if (clean)
+ goto use_clean;
+
+ genradix_for_each_reverse(&c->journal_entries, iter, i)
+ if (*i) {
+ last_journal_entry = &(*i)->j;
+ (*i)->ignore = false;
+ /*
+ * This was probably a NO_FLUSH entry,
+ * so last_seq was garbage - but we know
+ * we're only using a single journal
+ * entry, set it here:
+ */
+ (*i)->j.last_seq = (*i)->j.seq;
+ break;
+ }
+ }
+
+ ret = bch2_journal_keys_sort(c);
+ if (ret)
+ goto err;
+
+ if (c->sb.clean && last_journal_entry) {
+ ret = bch2_verify_superblock_clean(c, &clean,
+ last_journal_entry);
+ if (ret)
+ goto err;
+ }
+ } else {
+use_clean:
+ if (!clean) {
+ bch_err(c, "no superblock clean section found");
+ ret = -BCH_ERR_fsck_repair_impossible;
+ goto err;
+ }
+ blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
+ }
+
+ c->journal_replay_seq_start = last_seq;
+ c->journal_replay_seq_end = blacklist_seq - 1;
+
+ if (c->opts.reconstruct_alloc) {
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+ drop_alloc_keys(&c->journal_keys);
+ }
+
+ zero_out_btree_mem_ptr(&c->journal_keys);
+
+ ret = journal_replay_early(c, clean);
+ if (ret)
+ goto err;
+
+ /*
+ * After an unclean shutdown, skip the next few journal sequence
+ * numbers as they may have been referenced by btree writes that
+ * happened before their corresponding journal writes - those btree
+ * writes need to be ignored, by skipping and blacklisting the next few
+ * journal sequence numbers:
+ */
+ if (!c->sb.clean)
+ journal_seq += 8;
+
+ if (blacklist_seq != journal_seq) {
+ ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
+ blacklist_seq, journal_seq) ?:
+ bch2_journal_seq_blacklist_add(c,
+ blacklist_seq, journal_seq);
+ if (ret) {
+ bch_err(c, "error creating new journal seq blacklist entry");
+ goto err;
+ }
+ }
+
+ ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
+ journal_seq, last_seq, blacklist_seq - 1) ?:
+ bch2_fs_journal_start(&c->journal, journal_seq);
+ if (ret)
+ goto err;
+
+ if (c->opts.reconstruct_alloc)
+ bch2_journal_log_msg(c, "dropping alloc info");
+
+ /*
+ * Skip past versions that might have been used (as nonces),
+ * but hadn't had their pointers written:
+ */
+ if (c->sb.encryption_type && !c->sb.clean)
+ atomic64_add(1 << 16, &c->key_version);
+
+ ret = read_btree_roots(c);
+ if (ret)
+ goto err;
+
+ ret = bch2_run_recovery_passes(c);
+ if (ret)
+ goto err;
+
+ /* If we fixed errors, verify that fs is actually clean now: */
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ test_bit(BCH_FS_ERRORS_FIXED, &c->flags) &&
+ !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags) &&
+ !test_bit(BCH_FS_ERROR, &c->flags)) {
+ bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
+ clear_bit(BCH_FS_ERRORS_FIXED, &c->flags);
+
+ c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
+
+ ret = bch2_run_recovery_passes(c);
+ if (ret)
+ goto err;
+
+ if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags) ||
+ test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) {
+ bch_err(c, "Second fsck run was not clean");
+ set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags);
+ }
+
+ set_bit(BCH_FS_ERRORS_FIXED, &c->flags);
+ }
+
+ if (enabled_qtypes(c)) {
+ bch_verbose(c, "reading quotas");
+ ret = bch2_fs_quota_read(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "quotas done");
+ }
+
+ mutex_lock(&c->sb_lock);
+ bool write_sb = false;
+
+ if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
+ SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version));
+ write_sb = true;
+ }
+
+ if (!test_bit(BCH_FS_ERROR, &c->flags) &&
+ !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) {
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
+ write_sb = true;
+ }
+
+ if (!test_bit(BCH_FS_ERROR, &c->flags)) {
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ if (ext &&
+ (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) ||
+ !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) {
+ memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required));
+ memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
+ write_sb = true;
+ }
+ }
+
+ if (c->opts.fsck &&
+ !test_bit(BCH_FS_ERROR, &c->flags) &&
+ !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) {
+ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
+ SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
+ write_sb = true;
+ }
+
+ if (write_sb)
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
+ c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) {
+ struct bch_move_stats stats;
+
+ bch2_move_stats_init(&stats, "recovery");
+
+ bch_info(c, "scanning for old btree nodes");
+ ret = bch2_fs_read_write(c) ?:
+ bch2_scan_old_btree_nodes(c, &stats);
+ if (ret)
+ goto err;
+ bch_info(c, "scanning for old btree nodes done");
+ }
+
+ if (c->journal_seq_blacklist_table &&
+ c->journal_seq_blacklist_table->nr > 128)
+ queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
+
+ ret = 0;
+out:
+ set_bit(BCH_FS_FSCK_DONE, &c->flags);
+ bch2_flush_fsck_errs(c);
+
+ if (!c->opts.keep_journal &&
+ test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+ bch2_journal_keys_put_initial(c);
+ kfree(clean);
+
+ if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) {
+ bch2_fs_read_write_early(c);
+ bch2_delete_dead_snapshots_async(c);
+ }
+
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+err:
+fsck_err:
+ bch2_fs_emergency_read_only(c);
+ goto out;
+}
+
+int bch2_fs_initialize(struct bch_fs *c)
+{
+ struct bch_inode_unpacked root_inode, lostfound_inode;
+ struct bkey_inode_buf packed_inode;
+ struct qstr lostfound = QSTR("lost+found");
+ struct bch_dev *ca;
+ unsigned i;
+ int ret;
+
+ bch_notice(c, "initializing new filesystem");
+
+ mutex_lock(&c->sb_lock);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
+
+ bch2_check_version_downgrade(c);
+
+ if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) {
+ bch2_sb_upgrade(c, bcachefs_metadata_version_current);
+ SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+
+ c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns);
+ set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+ set_bit(BCH_FS_FSCK_DONE, &c->flags);
+
+ for (i = 0; i < BTREE_ID_NR; i++)
+ bch2_btree_root_alloc(c, i);
+
+ for_each_member_device(ca, c, i)
+ bch2_dev_usage_init(ca);
+
+ ret = bch2_fs_journal_alloc(c);
+ if (ret)
+ goto err;
+
+ /*
+ * journal_res_get() will crash if called before this has
+ * set up the journal.pin FIFO and journal.cur pointer:
+ */
+ bch2_fs_journal_start(&c->journal, 1);
+ bch2_journal_set_replay_done(&c->journal);
+
+ ret = bch2_fs_read_write_early(c);
+ if (ret)
+ goto err;
+
+ /*
+ * Write out the superblock and journal buckets, now that we can do
+ * btree updates
+ */
+ bch_verbose(c, "marking superblocks");
+ ret = bch2_trans_mark_dev_sbs(c);
+ bch_err_msg(c, ret, "marking superblocks");
+ if (ret)
+ goto err;
+
+ for_each_online_member(ca, c, i)
+ ca->new_fs_bucket_idx = 0;
+
+ ret = bch2_fs_freespace_init(c);
+ if (ret)
+ goto err;
+
+ ret = bch2_initialize_subvolumes(c);
+ if (ret)
+ goto err;
+
+ bch_verbose(c, "reading snapshots table");
+ ret = bch2_snapshots_read(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "reading snapshots done");
+
+ bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL);
+ root_inode.bi_inum = BCACHEFS_ROOT_INO;
+ root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
+ bch2_inode_pack(&packed_inode, &root_inode);
+ packed_inode.inode.k.p.snapshot = U32_MAX;
+
+ ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0);
+ if (ret) {
+ bch_err_msg(c, ret, "creating root directory");
+ goto err;
+ }
+
+ bch2_inode_init_early(c, &lostfound_inode);
+
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_create_trans(trans,
+ BCACHEFS_ROOT_SUBVOL_INUM,
+ &root_inode, &lostfound_inode,
+ &lostfound,
+ 0, 0, S_IFDIR|0700, 0,
+ NULL, NULL, (subvol_inum) { 0 }, 0));
+ if (ret) {
+ bch_err_msg(c, ret, "creating lost+found");
+ goto err;
+ }
+
+ if (enabled_qtypes(c)) {
+ ret = bch2_fs_quota_read(c);
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_journal_flush(&c->journal);
+ if (ret) {
+ bch_err_msg(c, ret, "writing first journal entry");
+ goto err;
+ }
+
+ mutex_lock(&c->sb_lock);
+ SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+err:
+ bch_err_fn(c, ret);
+ return ret;
+}
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
new file mode 100644
index 000000000000..3a554b0751d0
--- /dev/null
+++ b/fs/bcachefs/recovery.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_RECOVERY_H
+#define _BCACHEFS_RECOVERY_H
+
+extern const char * const bch2_recovery_passes[];
+
+u64 bch2_recovery_passes_to_stable(u64 v);
+u64 bch2_recovery_passes_from_stable(u64 v);
+
+/*
+ * For when we need to rewind recovery passes and run a pass we skipped:
+ */
+static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ if (c->recovery_passes_explicit & BIT_ULL(pass))
+ return 0;
+
+ bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
+ bch2_recovery_passes[pass], pass,
+ bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
+
+ c->recovery_passes_explicit |= BIT_ULL(pass);
+
+ if (c->curr_recovery_pass >= pass) {
+ c->curr_recovery_pass = pass;
+ c->recovery_passes_complete &= (1ULL << pass) >> 1;
+ return -BCH_ERR_restart_recovery;
+ } else {
+ return 0;
+ }
+}
+
+u64 bch2_fsck_recovery_passes(void);
+
+int bch2_fs_recovery(struct bch_fs *);
+int bch2_fs_initialize(struct bch_fs *);
+
+#endif /* _BCACHEFS_RECOVERY_H */
diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h
new file mode 100644
index 000000000000..d37c6fd30e38
--- /dev/null
+++ b/fs/bcachefs/recovery_types.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_RECOVERY_TYPES_H
+#define _BCACHEFS_RECOVERY_TYPES_H
+
+#define PASS_SILENT BIT(0)
+#define PASS_FSCK BIT(1)
+#define PASS_UNCLEAN BIT(2)
+#define PASS_ALWAYS BIT(3)
+
+/*
+ * Passes may be reordered, but the second field is a persistent identifier and
+ * must never change:
+ */
+#define BCH_RECOVERY_PASSES() \
+ x(alloc_read, 0, PASS_ALWAYS) \
+ x(stripes_read, 1, PASS_ALWAYS) \
+ x(initialize_subvolumes, 2, 0) \
+ x(snapshots_read, 3, PASS_ALWAYS) \
+ x(check_topology, 4, 0) \
+ x(check_allocations, 5, PASS_FSCK) \
+ x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \
+ x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \
+ x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \
+ x(journal_replay, 9, PASS_ALWAYS) \
+ x(check_alloc_info, 10, PASS_FSCK) \
+ x(check_lrus, 11, PASS_FSCK) \
+ x(check_btree_backpointers, 12, PASS_FSCK) \
+ x(check_backpointers_to_extents, 13, PASS_FSCK) \
+ x(check_extents_to_backpointers, 14, PASS_FSCK) \
+ x(check_alloc_to_lru_refs, 15, PASS_FSCK) \
+ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
+ x(bucket_gens_init, 17, 0) \
+ x(check_snapshot_trees, 18, PASS_FSCK) \
+ x(check_snapshots, 19, PASS_FSCK) \
+ x(check_subvols, 20, PASS_FSCK) \
+ x(delete_dead_snapshots, 21, PASS_FSCK) \
+ x(fs_upgrade_for_subvolumes, 22, 0) \
+ x(resume_logged_ops, 23, PASS_ALWAYS) \
+ x(check_inodes, 24, PASS_FSCK) \
+ x(check_extents, 25, PASS_FSCK) \
+ x(check_indirect_extents, 26, PASS_FSCK) \
+ x(check_dirents, 27, PASS_FSCK) \
+ x(check_xattrs, 28, PASS_FSCK) \
+ x(check_root, 29, PASS_FSCK) \
+ x(check_directory_structure, 30, PASS_FSCK) \
+ x(check_nlinks, 31, PASS_FSCK) \
+ x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \
+ x(fix_reflink_p, 33, 0) \
+ x(set_fs_needs_rebalance, 34, 0) \
+
+/* We normally enumerate recovery passes in the order we run them: */
+enum bch_recovery_pass {
+#define x(n, id, when) BCH_RECOVERY_PASS_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+/* But we also need stable identifiers that can be used in the superblock */
+enum bch_recovery_pass_stable {
+#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id,
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+#endif /* _BCACHEFS_RECOVERY_TYPES_H */
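
BCH_RECOVERY_PASSES() above is an x-macro list that is expanded twice: once to generate the run-order enum, and once to generate the stable on-disk identifiers from the second column; bch2_recovery_passes_to_stable()/_from_stable() in recovery.c then remap bitmasks between the two numberings. The following is a small self-contained illustration of the same technique with a made-up pass list; the DEMO_* names and values are not from the kernel.

    /* X-macro pattern: one list, two numberings (illustrative names only). */
    #include <stdio.h>
    #include <stdint.h>

    #define DEMO_PASSES()       \
        x(alpha,  0)            \
        x(beta,   7)            \
        x(gamma,  3)

    /* Run-order enum: values follow list order and may change if passes are reordered. */
    enum demo_pass {
    #define x(n, id) DEMO_PASS_##n,
        DEMO_PASSES()
    #undef x
    };

    /* Stable enum: values come from the second column and must never change. */
    enum demo_pass_stable {
    #define x(n, id) DEMO_PASS_STABLE_##n = id,
        DEMO_PASSES()
    #undef x
    };

    /* Remap a run-order bitmask to the stable numbering, like *_to_stable(): */
    static uint64_t demo_passes_to_stable(uint64_t v)
    {
        static const uint8_t map[] = {
    #define x(n, id) [DEMO_PASS_##n] = DEMO_PASS_STABLE_##n,
            DEMO_PASSES()
    #undef x
        };
        uint64_t ret = 0;

        for (unsigned i = 0; i < sizeof(map); i++)
            if (v & ((uint64_t) 1 << i))
                ret |= (uint64_t) 1 << map[i];
        return ret;
    }

    int main(void)
    {
        uint64_t v = (1 << DEMO_PASS_beta) | (1 << DEMO_PASS_gamma);

        printf("run-order mask 0x%llx -> stable mask 0x%llx\n",
               (unsigned long long) v,
               (unsigned long long) demo_passes_to_stable(v));
        return 0;
    }

The point of the second numbering is that the superblock's recovery_passes_required field stores stable IDs, so passes can be reordered in the source list without invalidating what an existing superblock recorded.
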
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
new file mode 100644
index 000000000000..37d16e04e671
--- /dev/null
+++ b/fs/bcachefs/reflink.c
@@ -0,0 +1,414 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "extents.h"
+#include "inode.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "rebalance.h"
+#include "reflink.h"
+#include "subvolume.h"
+#include "super-io.h"
+
+#include <linux/sched/signal.h>
+
+static inline unsigned bkey_type_to_indirect(const struct bkey *k)
+{
+ switch (k->type) {
+ case KEY_TYPE_extent:
+ return KEY_TYPE_reflink_v;
+ case KEY_TYPE_inline_data:
+ return KEY_TYPE_indirect_inline_data;
+ default:
+ return 0;
+ }
+}
+
+/* reflink pointers */
+
+int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+
+ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix &&
+ le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) {
+ prt_printf(err, "idx < front_pad (%llu < %u)",
+ le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+
+ prt_printf(out, "idx %llu front_pad %u back_pad %u",
+ le64_to_cpu(p.v->idx),
+ le32_to_cpu(p.v->front_pad),
+ le32_to_cpu(p.v->back_pad));
+}
+
+bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
+{
+ struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l);
+ struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r);
+
+ /*
+ * Disabled for now, the triggers code needs to be reworked for merging
+ * of reflink pointers to work:
+ */
+ return false;
+
+ if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx))
+ return false;
+
+ bch2_key_resize(l.k, l.k->size + r.k->size);
+ return true;
+}
+
+/* indirect extents */
+
+int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ return bch2_bkey_ptrs_invalid(c, k, flags, err);
+}
+
+void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+
+ prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
+
+ bch2_bkey_ptrs_to_text(out, c, k);
+}
+
+#if 0
+Currently disabled, needs to be debugged:
+
+bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
+{
+ struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l);
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r);
+
+ return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
+}
+#endif
+
+static inline void check_indirect_extent_deleting(struct bkey_i *new, unsigned *flags)
+{
+ if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) {
+ new->k.type = KEY_TYPE_deleted;
+ new->k.size = 0;
+ set_bkey_val_u64s(&new->k, 0);
+ *flags &= ~BTREE_TRIGGER_INSERT;
+ }
+}
+
+int bch2_trans_mark_reflink_v(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ check_indirect_extent_deleting(new, &flags);
+
+ if (old.k->type == KEY_TYPE_reflink_v &&
+ new->k.type == KEY_TYPE_reflink_v &&
+ old.k->u64s == new->k.u64s &&
+ !memcmp(bkey_s_c_to_reflink_v(old).v->start,
+ bkey_i_to_reflink_v(new)->v.start,
+ bkey_val_bytes(&new->k) - 8))
+ return 0;
+
+ return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags);
+}
+
+/* indirect inline data */
+
+int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ return 0;
+}
+
+void bch2_indirect_inline_data_to_text(struct printbuf *out,
+ struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
+ unsigned datalen = bkey_inline_data_bytes(k.k);
+
+ prt_printf(out, "refcount %llu datalen %u: %*phN",
+ le64_to_cpu(d.v->refcount), datalen,
+ min(datalen, 32U), d.v->data);
+}
+
+int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ check_indirect_extent_deleting(new, &flags);
+
+ return 0;
+}
+
+static int bch2_make_extent_indirect(struct btree_trans *trans,
+ struct btree_iter *extent_iter,
+ struct bkey_i *orig)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter reflink_iter = { NULL };
+ struct bkey_s_c k;
+ struct bkey_i *r_v;
+ struct bkey_i_reflink_p *r_p;
+ __le64 *refcount;
+ int ret;
+
+ if (orig->k.type == KEY_TYPE_inline_data)
+ bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data);
+
+ bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX,
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_prev(&reflink_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k));
+ ret = PTR_ERR_OR_ZERO(r_v);
+ if (ret)
+ goto err;
+
+ bkey_init(&r_v->k);
+ r_v->k.type = bkey_type_to_indirect(&orig->k);
+ r_v->k.p = reflink_iter.pos;
+ bch2_key_resize(&r_v->k, orig->k.size);
+ r_v->k.version = orig->k.version;
+
+ set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
+
+ refcount = bkey_refcount(r_v);
+ *refcount = 0;
+ memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));
+
+ ret = bch2_trans_update(trans, &reflink_iter, r_v, 0);
+ if (ret)
+ goto err;
+
+ /*
+ * orig is in a bkey_buf, which statically allocates five u64s for the val,
+ * so we know it will be big enough:
+ */
+ orig->k.type = KEY_TYPE_reflink_p;
+ r_p = bkey_i_to_reflink_p(orig);
+ set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
+
+ /* FORTIFY_SOURCE is broken here, and doesn't provide unsafe_memset() */
+#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE)
+ __underlying_memset(&r_p->v, 0, sizeof(r_p->v));
+#else
+ memset(&r_p->v, 0, sizeof(r_p->v));
+#endif
+
+ r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
+
+ ret = bch2_trans_update(trans, extent_iter, &r_p->k_i,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+err:
+ bch2_trans_iter_exit(trans, &reflink_iter);
+
+ return ret;
+}
+
+static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
+{
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) {
+ if (bkey_extent_is_unwritten(k))
+ continue;
+
+ if (bkey_extent_is_data(k.k))
+ return k;
+ }
+
+ if (bkey_ge(iter->pos, end))
+ bch2_btree_iter_set_pos(iter, end);
+ return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
+}
+
+s64 bch2_remap_range(struct bch_fs *c,
+ subvol_inum dst_inum, u64 dst_offset,
+ subvol_inum src_inum, u64 src_offset,
+ u64 remap_sectors,
+ u64 new_i_size, s64 *i_sectors_delta)
+{
+ struct btree_trans *trans;
+ struct btree_iter dst_iter, src_iter;
+ struct bkey_s_c src_k;
+ struct bkey_buf new_dst, new_src;
+ struct bpos dst_start = POS(dst_inum.inum, dst_offset);
+ struct bpos src_start = POS(src_inum.inum, src_offset);
+ struct bpos dst_end = dst_start, src_end = src_start;
+ struct bch_io_opts opts;
+ struct bpos src_want;
+ u64 dst_done = 0;
+ u32 dst_snapshot, src_snapshot;
+ int ret = 0, ret2 = 0;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink))
+ return -BCH_ERR_erofs_no_writes;
+
+ bch2_check_set_feature(c, BCH_FEATURE_reflink);
+
+ dst_end.offset += remap_sectors;
+ src_end.offset += remap_sectors;
+
+ bch2_bkey_buf_init(&new_dst);
+ bch2_bkey_buf_init(&new_src);
+ trans = bch2_trans_get(c);
+
+ ret = bch2_inum_opts_get(trans, src_inum, &opts);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
+ BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
+ BTREE_ITER_INTENT);
+
+ while ((ret == 0 ||
+ bch2_err_matches(ret, BCH_ERR_transaction_restart)) &&
+ bkey_lt(dst_iter.pos, dst_end)) {
+ struct disk_reservation disk_res = { 0 };
+
+ bch2_trans_begin(trans);
+
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ ret = bch2_subvolume_get_snapshot(trans, src_inum.subvol,
+ &src_snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(&src_iter, src_snapshot);
+
+ ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol,
+ &dst_snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
+
+ dst_done = dst_iter.pos.offset - dst_start.offset;
+ src_want = POS(src_start.inode, src_start.offset + dst_done);
+ bch2_btree_iter_set_pos(&src_iter, src_want);
+
+ src_k = get_next_src(&src_iter, src_end);
+ ret = bkey_err(src_k);
+ if (ret)
+ continue;
+
+ if (bkey_lt(src_want, src_iter.pos)) {
+ ret = bch2_fpunch_at(trans, &dst_iter, dst_inum,
+ min(dst_end.offset,
+ dst_iter.pos.offset +
+ src_iter.pos.offset - src_want.offset),
+ i_sectors_delta);
+ continue;
+ }
+
+ if (src_k.k->type != KEY_TYPE_reflink_p) {
+ bch2_btree_iter_set_pos_to_extent_start(&src_iter);
+
+ bch2_bkey_buf_reassemble(&new_src, c, src_k);
+ src_k = bkey_i_to_s_c(new_src.k);
+
+ ret = bch2_make_extent_indirect(trans, &src_iter,
+ new_src.k);
+ if (ret)
+ continue;
+
+ BUG_ON(src_k.k->type != KEY_TYPE_reflink_p);
+ }
+
+ if (src_k.k->type == KEY_TYPE_reflink_p) {
+ struct bkey_s_c_reflink_p src_p =
+ bkey_s_c_to_reflink_p(src_k);
+ struct bkey_i_reflink_p *dst_p =
+ bkey_reflink_p_init(new_dst.k);
+
+ u64 offset = le64_to_cpu(src_p.v->idx) +
+ (src_want.offset -
+ bkey_start_offset(src_k.k));
+
+ dst_p->v.idx = cpu_to_le64(offset);
+ } else {
+ BUG();
+ }
+
+ new_dst.k->k.p = dst_iter.pos;
+ bch2_key_resize(&new_dst.k->k,
+ min(src_k.k->p.offset - src_want.offset,
+ dst_end.offset - dst_iter.pos.offset));
+
+ ret = bch2_bkey_set_needs_rebalance(c, new_dst.k,
+ opts.background_target,
+ opts.background_compression) ?:
+ bch2_extent_update(trans, dst_inum, &dst_iter,
+ new_dst.k, &disk_res,
+ new_i_size, i_sectors_delta,
+ true);
+ bch2_disk_reservation_put(c, &disk_res);
+ }
+ bch2_trans_iter_exit(trans, &dst_iter);
+ bch2_trans_iter_exit(trans, &src_iter);
+
+ BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end));
+ BUG_ON(bkey_gt(dst_iter.pos, dst_end));
+
+ dst_done = dst_iter.pos.offset - dst_start.offset;
+ new_i_size = min(dst_iter.pos.offset << 9, new_i_size);
+
+ do {
+ struct bch_inode_unpacked inode_u;
+ struct btree_iter inode_iter = { NULL };
+
+ bch2_trans_begin(trans);
+
+ ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u,
+ dst_inum, BTREE_ITER_INTENT);
+
+ if (!ret2 &&
+ inode_u.bi_size < new_i_size) {
+ inode_u.bi_size = new_i_size;
+ ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+ }
+
+ bch2_trans_iter_exit(trans, &inode_iter);
+ } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
+err:
+ bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&new_src, c);
+ bch2_bkey_buf_exit(&new_dst, c);
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_reflink);
+
+ return dst_done ?: ret ?: ret2;
+}
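
In bch2_remap_range() above, once the source extent has been made indirect, the destination simply gets a reflink_p key whose idx is the source pointer's idx plus how far into the source key the remap starts. The following is a tiny arithmetic sketch of that offset calculation, using simplified made-up types and omitting the front_pad/back_pad fields of the real bch_reflink_p.

    /* Sketch of reflink pointer offset arithmetic (simplified; no pad fields). */
    #include <stdio.h>
    #include <stdint.h>

    struct demo_reflink_p {
        uint64_t idx;   /* start offset of the referenced indirect extent */
        uint64_t size;  /* sectors covered by this pointer */
    };

    /*
     * The source file already points at an indirect extent starting at src.idx.
     * If the remap wants to start `delta` sectors into the source extent, the
     * new pointer for the destination just starts that much further into the
     * indirect extent:
     */
    static struct demo_reflink_p remap_pointer(struct demo_reflink_p src,
                                               uint64_t delta, uint64_t len)
    {
        struct demo_reflink_p dst = {
            .idx  = src.idx + delta,
            .size = len,
        };
        return dst;
    }

    int main(void)
    {
        struct demo_reflink_p src = { .idx = 1000, .size = 128 };

        /* Remap the middle 32 sectors of the source extent: */
        struct demo_reflink_p dst = remap_pointer(src, 48, 32);

        printf("dst points at indirect extent [%llu, %llu)\n",
               (unsigned long long) dst.idx,
               (unsigned long long) (dst.idx + dst.size));
        return 0;
    }

Partial remaps fall out of this naturally: both files end up pointing into the same indirect extent at different starting offsets, while the refcount on the indirect extent tracks how many pointers reference it.
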
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
new file mode 100644
index 000000000000..8ccf3f9c4939
--- /dev/null
+++ b/fs/bcachefs/reflink.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REFLINK_H
+#define _BCACHEFS_REFLINK_H
+
+enum bkey_invalid_flags;
+
+int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+
+#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \
+ .key_invalid = bch2_reflink_p_invalid, \
+ .val_to_text = bch2_reflink_p_to_text, \
+ .key_merge = bch2_reflink_p_merge, \
+ .trans_trigger = bch2_trans_mark_reflink_p, \
+ .atomic_trigger = bch2_mark_reflink_p, \
+ .min_val_size = 16, \
+})
+
+int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *, unsigned);
+
+#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \
+ .key_invalid = bch2_reflink_v_invalid, \
+ .val_to_text = bch2_reflink_v_to_text, \
+ .swab = bch2_ptr_swab, \
+ .trans_trigger = bch2_trans_mark_reflink_v, \
+ .atomic_trigger = bch2_mark_extent, \
+ .min_val_size = 8, \
+})
+
+int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_indirect_inline_data_to_text(struct printbuf *,
+ struct bch_fs *, struct bkey_s_c);
+int bch2_trans_mark_indirect_inline_data(struct btree_trans *,
+ enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_i *,
+ unsigned);
+
+#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \
+ .key_invalid = bch2_indirect_inline_data_invalid, \
+ .val_to_text = bch2_indirect_inline_data_to_text, \
+ .trans_trigger = bch2_trans_mark_indirect_inline_data, \
+ .min_val_size = 8, \
+})
+
+static inline const __le64 *bkey_refcount_c(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_reflink_v:
+ return &bkey_s_c_to_reflink_v(k).v->refcount;
+ case KEY_TYPE_indirect_inline_data:
+ return &bkey_s_c_to_indirect_inline_data(k).v->refcount;
+ default:
+ return NULL;
+ }
+}
+
+static inline __le64 *bkey_refcount(struct bkey_i *k)
+{
+ switch (k->k.type) {
+ case KEY_TYPE_reflink_v:
+ return &bkey_i_to_reflink_v(k)->v.refcount;
+ case KEY_TYPE_indirect_inline_data:
+ return &bkey_i_to_indirect_inline_data(k)->v.refcount;
+ default:
+ return NULL;
+ }
+}
+
+s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64,
+ subvol_inum, u64, u64, u64, s64 *);
+
+#endif /* _BCACHEFS_REFLINK_H */
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
new file mode 100644
index 000000000000..2008fe8bf706
--- /dev/null
+++ b/fs/bcachefs/replicas.c
@@ -0,0 +1,1059 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "buckets.h"
+#include "journal.h"
+#include "replicas.h"
+#include "super-io.h"
+
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
+ struct bch_replicas_cpu *);
+
+/* Replicas tracking - in memory: */
+
+static void verify_replicas_entry(struct bch_replicas_entry *e)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ unsigned i;
+
+ BUG_ON(e->data_type >= BCH_DATA_NR);
+ BUG_ON(!e->nr_devs);
+ BUG_ON(e->nr_required > 1 &&
+ e->nr_required >= e->nr_devs);
+
+ for (i = 0; i + 1 < e->nr_devs; i++)
+ BUG_ON(e->devs[i] >= e->devs[i + 1]);
+#endif
+}
+
+void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
+{
+ bubble_sort(e->devs, e->nr_devs, u8_cmp);
+}
+
+static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
+{
+ eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
+}
+
+static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
+ struct bch_replicas_entry_v0 *e)
+{
+ unsigned i;
+
+ if (e->data_type < BCH_DATA_NR)
+ prt_printf(out, "%s", bch2_data_types[e->data_type]);
+ else
+ prt_printf(out, "(invalid data type %u)", e->data_type);
+
+ prt_printf(out, ": %u [", e->nr_devs);
+ for (i = 0; i < e->nr_devs; i++)
+ prt_printf(out, i ? " %u" : "%u", e->devs[i]);
+ prt_printf(out, "]");
+}
+
+void bch2_replicas_entry_to_text(struct printbuf *out,
+ struct bch_replicas_entry *e)
+{
+ unsigned i;
+
+ if (e->data_type < BCH_DATA_NR)
+ prt_printf(out, "%s", bch2_data_types[e->data_type]);
+ else
+ prt_printf(out, "(invalid data type %u)", e->data_type);
+
+ prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
+ for (i = 0; i < e->nr_devs; i++)
+ prt_printf(out, i ? " %u" : "%u", e->devs[i]);
+ prt_printf(out, "]");
+}
+
+int bch2_replicas_entry_validate(struct bch_replicas_entry *r,
+ struct bch_sb *sb,
+ struct printbuf *err)
+{
+ if (!r->nr_devs) {
+ prt_printf(err, "no devices in entry ");
+ goto bad;
+ }
+
+ if (r->nr_required > 1 &&
+ r->nr_required >= r->nr_devs) {
+ prt_printf(err, "bad nr_required in entry ");
+ goto bad;
+ }
+
+ for (unsigned i = 0; i < r->nr_devs; i++)
+ if (!bch2_dev_exists(sb, r->devs[i])) {
+ prt_printf(err, "invalid device %u in entry ", r->devs[i]);
+ goto bad;
+ }
+
+ return 0;
+bad:
+ bch2_replicas_entry_to_text(err, r);
+ return -BCH_ERR_invalid_replicas_entry;
+}
+
+void bch2_cpu_replicas_to_text(struct printbuf *out,
+ struct bch_replicas_cpu *r)
+{
+ struct bch_replicas_entry *e;
+ bool first = true;
+
+ for_each_cpu_replicas_entry(r, e) {
+ if (!first)
+ prt_printf(out, " ");
+ first = false;
+
+ bch2_replicas_entry_to_text(out, e);
+ }
+}
+
+static void extent_to_replicas(struct bkey_s_c k,
+ struct bch_replicas_entry *r)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ r->nr_required = 1;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.ptr.cached)
+ continue;
+
+ if (!p.has_ec)
+ r->devs[r->nr_devs++] = p.ptr.dev;
+ else
+ r->nr_required = 0;
+ }
+}
+
+static void stripe_to_replicas(struct bkey_s_c k,
+ struct bch_replicas_entry *r)
+{
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+ const struct bch_extent_ptr *ptr;
+
+ r->nr_required = s.v->nr_blocks - s.v->nr_redundant;
+
+ for (ptr = s.v->ptrs;
+ ptr < s.v->ptrs + s.v->nr_blocks;
+ ptr++)
+ r->devs[r->nr_devs++] = ptr->dev;
+}
+
+void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
+ struct bkey_s_c k)
+{
+ e->nr_devs = 0;
+
+ switch (k.k->type) {
+ case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
+ e->data_type = BCH_DATA_btree;
+ extent_to_replicas(k, e);
+ break;
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
+ e->data_type = BCH_DATA_user;
+ extent_to_replicas(k, e);
+ break;
+ case KEY_TYPE_stripe:
+ e->data_type = BCH_DATA_parity;
+ stripe_to_replicas(k, e);
+ break;
+ }
+
+ bch2_replicas_entry_sort(e);
+}
+
+void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
+ enum bch_data_type data_type,
+ struct bch_devs_list devs)
+{
+ unsigned i;
+
+ BUG_ON(!data_type ||
+ data_type == BCH_DATA_sb ||
+ data_type >= BCH_DATA_NR);
+
+ e->data_type = data_type;
+ e->nr_devs = 0;
+ e->nr_required = 1;
+
+ for (i = 0; i < devs.nr; i++)
+ e->devs[e->nr_devs++] = devs.devs[i];
+
+ bch2_replicas_entry_sort(e);
+}
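
Both helpers above finish with bch2_replicas_entry_sort(), which keeps every entry's device list in a canonical sorted order. That matters because the table lookups that follow compare whole entries with memcmp(). The sketch below, using a plain linear search and invented demo types rather than the kernel structures, shows why an unsorted query entry would fail to match.

    /* Why replicas entries are sorted before lookup: memcmp needs a canonical form. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <stdint.h>
    #include <stdbool.h>

    struct demo_entry {
        uint8_t data_type;
        uint8_t nr_devs;
        uint8_t devs[4];
    };

    static int u8_cmp(const void *l, const void *r)
    {
        return *(const uint8_t *) l - *(const uint8_t *) r;
    }

    static void demo_entry_sort(struct demo_entry *e)
    {
        qsort(e->devs, e->nr_devs, 1, u8_cmp);
    }

    static bool demo_table_has(const struct demo_entry *table, size_t nr,
                               const struct demo_entry *search)
    {
        for (size_t i = 0; i < nr; i++)
            if (!memcmp(&table[i], search, sizeof(*search)))
                return true;
        return false;
    }

    int main(void)
    {
        /* Table entries are stored in canonical (sorted-devices) form: */
        struct demo_entry table[] = {
            { .data_type = 1, .nr_devs = 2, .devs = { 0, 3 } },
            { .data_type = 2, .nr_devs = 3, .devs = { 1, 2, 5 } },
        };

        /* A caller naturally builds the device list in I/O order: */
        struct demo_entry search = { .data_type = 2, .nr_devs = 3, .devs = { 5, 1, 2 } };

        printf("before sort: %s\n",
               demo_table_has(table, 2, &search) ? "found" : "not found");

        demo_entry_sort(&search);
        printf("after sort:  %s\n",
               demo_table_has(table, 2, &search) ? "found" : "not found");
        return 0;
    }
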
+
+static struct bch_replicas_cpu
+cpu_replicas_add_entry(struct bch_fs *c,
+ struct bch_replicas_cpu *old,
+ struct bch_replicas_entry *new_entry)
+{
+ unsigned i;
+ struct bch_replicas_cpu new = {
+ .nr = old->nr + 1,
+ .entry_size = max_t(unsigned, old->entry_size,
+ replicas_entry_bytes(new_entry)),
+ };
+
+ for (i = 0; i < new_entry->nr_devs; i++)
+ BUG_ON(!bch2_dev_exists2(c, new_entry->devs[i]));
+
+ BUG_ON(!new_entry->data_type);
+ verify_replicas_entry(new_entry);
+
+ new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
+ if (!new.entries)
+ return new;
+
+ for (i = 0; i < old->nr; i++)
+ memcpy(cpu_replicas_entry(&new, i),
+ cpu_replicas_entry(old, i),
+ old->entry_size);
+
+ memcpy(cpu_replicas_entry(&new, old->nr),
+ new_entry,
+ replicas_entry_bytes(new_entry));
+
+ bch2_cpu_replicas_sort(&new);
+ return new;
+}
+
+static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
+ struct bch_replicas_entry *search)
+{
+ int idx, entry_size = replicas_entry_bytes(search);
+
+ if (unlikely(entry_size > r->entry_size))
+ return -1;
+
+ verify_replicas_entry(search);
+
+#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size)
+ idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
+ entry_cmp, search);
+#undef entry_cmp
+
+ return idx < r->nr ? idx : -1;
+}
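
__replicas_entry_idx() above searches the table with eytzinger0_find(): the sorted entries are stored in "eytzinger" (breadth-first tree) order rather than plain sorted order, which keeps the hottest comparisons near the front of the array. Below is a generic, self-contained sketch of the layout and the descent-style search on ints; the indexing conventions of the kernel's eytzinger0 helpers may differ in detail.

    /* Generic eytzinger (BFS-order) layout and search sketch; not the bcachefs helpers. */
    #include <stdio.h>
    #include <stddef.h>

    /* Fill eytz[] by an in-order walk of the implicit tree rooted at node i: */
    static size_t eytzinger_build(const int *sorted, int *eytz, size_t n,
                                  size_t i, size_t pos)
    {
        if (i >= n)
            return pos;
        pos = eytzinger_build(sorted, eytz, n, 2 * i + 1, pos);  /* left subtree */
        eytz[i] = sorted[pos++];                                 /* this node */
        return eytzinger_build(sorted, eytz, n, 2 * i + 2, pos); /* right subtree */
    }

    /* Search: walk down the implicit tree, going right when the probe is bigger. */
    static int eytzinger_find(const int *eytz, size_t n, int x)
    {
        size_t i = 0;

        while (i < n) {
            if (eytz[i] == x)
                return (int) i;
            i = 2 * i + 1 + (x > eytz[i]);
        }
        return -1;
    }

    int main(void)
    {
        int sorted[] = { 1, 3, 4, 7, 9, 12, 15 };
        int eytz[7];

        eytzinger_build(sorted, eytz, 7, 0, 0);
        /* eytz is now { 7, 3, 12, 1, 4, 9, 15 } */

        printf("index of 9: %d, index of 8: %d\n",
               eytzinger_find(eytz, 7, 9),
               eytzinger_find(eytz, 7, 8));
        return 0;
    }

The same idea applies to the replicas table, except that its entries are fixed-size byte strings compared with memcmp() rather than ints.
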
+
+int bch2_replicas_entry_idx(struct bch_fs *c,
+ struct bch_replicas_entry *search)
+{
+ bch2_replicas_entry_sort(search);
+
+ return __replicas_entry_idx(&c->replicas, search);
+}
+
+static bool __replicas_has_entry(struct bch_replicas_cpu *r,
+ struct bch_replicas_entry *search)
+{
+ return __replicas_entry_idx(r, search) >= 0;
+}
+
+bool bch2_replicas_marked(struct bch_fs *c,
+ struct bch_replicas_entry *search)
+{
+ bool marked;
+
+ if (!search->nr_devs)
+ return true;
+
+ verify_replicas_entry(search);
+
+ percpu_down_read(&c->mark_lock);
+ marked = __replicas_has_entry(&c->replicas, search) &&
+ (likely(!c->replicas_gc.entries) ||
+ __replicas_has_entry(&c->replicas_gc, search));
+ percpu_up_read(&c->mark_lock);
+
+ return marked;
+}
+
+static void __replicas_table_update(struct bch_fs_usage *dst,
+ struct bch_replicas_cpu *dst_r,
+ struct bch_fs_usage *src,
+ struct bch_replicas_cpu *src_r)
+{
+ int src_idx, dst_idx;
+
+ *dst = *src;
+
+ for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
+ if (!src->replicas[src_idx])
+ continue;
+
+ dst_idx = __replicas_entry_idx(dst_r,
+ cpu_replicas_entry(src_r, src_idx));
+ BUG_ON(dst_idx < 0);
+
+ dst->replicas[dst_idx] = src->replicas[src_idx];
+ }
+}
+
+static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
+ struct bch_replicas_cpu *dst_r,
+ struct bch_fs_usage __percpu *src_p,
+ struct bch_replicas_cpu *src_r)
+{
+ unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
+ struct bch_fs_usage *dst, *src = (void *)
+ bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr);
+
+ preempt_disable();
+ dst = this_cpu_ptr(dst_p);
+ preempt_enable();
+
+ __replicas_table_update(dst, dst_r, src, src_r);
+}
+
+/*
+ * Resize filesystem accounting:
+ */
+static int replicas_table_update(struct bch_fs *c,
+ struct bch_replicas_cpu *new_r)
+{
+ struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
+ struct bch_fs_usage_online *new_scratch = NULL;
+ struct bch_fs_usage __percpu *new_gc = NULL;
+ struct bch_fs_usage *new_base = NULL;
+ unsigned i, bytes = sizeof(struct bch_fs_usage) +
+ sizeof(u64) * new_r->nr;
+ unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) +
+ sizeof(u64) * new_r->nr;
+ int ret = 0;
+
+ memset(new_usage, 0, sizeof(new_usage));
+
+ for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+ if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
+ sizeof(u64), GFP_KERNEL)))
+ goto err;
+
+ if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
+ !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) ||
+ (c->usage_gc &&
+ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
+ goto err;
+
+ for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+ if (c->usage[i])
+ __replicas_table_update_pcpu(new_usage[i], new_r,
+ c->usage[i], &c->replicas);
+ if (c->usage_base)
+ __replicas_table_update(new_base, new_r,
+ c->usage_base, &c->replicas);
+ if (c->usage_gc)
+ __replicas_table_update_pcpu(new_gc, new_r,
+ c->usage_gc, &c->replicas);
+
+ for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+ swap(c->usage[i], new_usage[i]);
+ swap(c->usage_base, new_base);
+ swap(c->usage_scratch, new_scratch);
+ swap(c->usage_gc, new_gc);
+ swap(c->replicas, *new_r);
+out:
+ free_percpu(new_gc);
+ kfree(new_scratch);
+ for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+ free_percpu(new_usage[i]);
+ kfree(new_base);
+ return ret;
+err:
+ bch_err(c, "error updating replicas table: memory allocation failure");
+ ret = -BCH_ERR_ENOMEM_replicas_table;
+ goto out;
+}
+
+static unsigned reserve_journal_replicas(struct bch_fs *c,
+ struct bch_replicas_cpu *r)
+{
+ struct bch_replicas_entry *e;
+ unsigned journal_res_u64s = 0;
+
+ /* nr_inodes: */
+ journal_res_u64s +=
+ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));
+
+ /* key_version: */
+ journal_res_u64s +=
+ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));
+
+ /* persistent_reserved: */
+ journal_res_u64s +=
+ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) *
+ BCH_REPLICAS_MAX;
+
+ for_each_cpu_replicas_entry(r, e)
+ journal_res_u64s +=
+ DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) +
+ e->nr_devs, sizeof(u64));
+ return journal_res_u64s;
+}
+
+noinline
+static int bch2_mark_replicas_slowpath(struct bch_fs *c,
+ struct bch_replicas_entry *new_entry)
+{
+ struct bch_replicas_cpu new_r, new_gc;
+ int ret = 0;
+
+ verify_replicas_entry(new_entry);
+
+ memset(&new_r, 0, sizeof(new_r));
+ memset(&new_gc, 0, sizeof(new_gc));
+
+ mutex_lock(&c->sb_lock);
+
+ if (c->replicas_gc.entries &&
+ !__replicas_has_entry(&c->replicas_gc, new_entry)) {
+ new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
+ if (!new_gc.entries) {
+ ret = -BCH_ERR_ENOMEM_cpu_replicas;
+ goto err;
+ }
+ }
+
+ if (!__replicas_has_entry(&c->replicas, new_entry)) {
+ new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
+ if (!new_r.entries) {
+ ret = -BCH_ERR_ENOMEM_cpu_replicas;
+ goto err;
+ }
+
+ ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
+ if (ret)
+ goto err;
+
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->replicas_journal_res,
+ reserve_journal_replicas(c, &new_r));
+ }
+
+ if (!new_r.entries &&
+ !new_gc.entries)
+ goto out;
+
+ /* allocations done, now commit: */
+
+ if (new_r.entries)
+ bch2_write_super(c);
+
+ /* don't update in memory replicas until changes are persistent */
+ percpu_down_write(&c->mark_lock);
+ if (new_r.entries)
+ ret = replicas_table_update(c, &new_r);
+ if (new_gc.entries)
+ swap(new_gc, c->replicas_gc);
+ percpu_up_write(&c->mark_lock);
+out:
+ mutex_unlock(&c->sb_lock);
+
+ kfree(new_r.entries);
+ kfree(new_gc.entries);
+
+ return ret;
+err:
+ bch_err_msg(c, ret, "adding replicas entry");
+ goto out;
+}
+
+int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
+{
+ return likely(bch2_replicas_marked(c, r))
+ ? 0 : bch2_mark_replicas_slowpath(c, r);
+}
+
+/* replicas delta list: */
+
+int bch2_replicas_delta_list_mark(struct bch_fs *c,
+ struct replicas_delta_list *r)
+{
+ struct replicas_delta *d = r->d;
+ struct replicas_delta *top = (void *) r->d + r->used;
+ int ret = 0;
+
+ for (d = r->d; !ret && d != top; d = replicas_delta_next(d))
+ ret = bch2_mark_replicas(c, &d->r);
+ return ret;
+}
+
+/*
+ * Old replicas_gc mechanism: only used for journal replicas entries now, should
+ * die at some point:
+ */
+
+int bch2_replicas_gc_end(struct bch_fs *c, int ret)
+{
+ lockdep_assert_held(&c->replicas_gc_lock);
+
+ mutex_lock(&c->sb_lock);
+ percpu_down_write(&c->mark_lock);
+
+ ret = ret ?:
+ bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?:
+ replicas_table_update(c, &c->replicas_gc);
+
+ kfree(c->replicas_gc.entries);
+ c->replicas_gc.entries = NULL;
+
+ percpu_up_write(&c->mark_lock);
+
+ if (!ret)
+ bch2_write_super(c);
+
+ mutex_unlock(&c->sb_lock);
+
+ return ret;
+}
+
+int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
+{
+ struct bch_replicas_entry *e;
+ unsigned i = 0;
+
+ lockdep_assert_held(&c->replicas_gc_lock);
+
+ mutex_lock(&c->sb_lock);
+ BUG_ON(c->replicas_gc.entries);
+
+ c->replicas_gc.nr = 0;
+ c->replicas_gc.entry_size = 0;
+
+ for_each_cpu_replicas_entry(&c->replicas, e)
+ if (!((1 << e->data_type) & typemask)) {
+ c->replicas_gc.nr++;
+ c->replicas_gc.entry_size =
+ max_t(unsigned, c->replicas_gc.entry_size,
+ replicas_entry_bytes(e));
+ }
+
+ c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
+ c->replicas_gc.entry_size,
+ GFP_KERNEL);
+ if (!c->replicas_gc.entries) {
+ mutex_unlock(&c->sb_lock);
+ bch_err(c, "error allocating c->replicas_gc");
+ return -BCH_ERR_ENOMEM_replicas_gc;
+ }
+
+ for_each_cpu_replicas_entry(&c->replicas, e)
+ if (!((1 << e->data_type) & typemask))
+ memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
+ e, c->replicas_gc.entry_size);
+
+ bch2_cpu_replicas_sort(&c->replicas_gc);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+}
+
+/*
+ * New much simpler mechanism for clearing out unneeded replicas entries - drop
+ * replicas entries that have 0 sectors used.
+ *
+ * However, we don't track sector counts for journal usage, so this doesn't drop
+ * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism
+ * is retained for that.
+ */
+int bch2_replicas_gc2(struct bch_fs *c)
+{
+ struct bch_replicas_cpu new = { 0 };
+ unsigned i, nr;
+ int ret = 0;
+
+ bch2_journal_meta(&c->journal);
+retry:
+ nr = READ_ONCE(c->replicas.nr);
+ new.entry_size = READ_ONCE(c->replicas.entry_size);
+ new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL);
+ if (!new.entries) {
+ bch_err(c, "error allocating c->replicas_gc");
+ return -BCH_ERR_ENOMEM_replicas_gc;
+ }
+
+ mutex_lock(&c->sb_lock);
+ percpu_down_write(&c->mark_lock);
+
+ if (nr != c->replicas.nr ||
+ new.entry_size != c->replicas.entry_size) {
+ percpu_up_write(&c->mark_lock);
+ mutex_unlock(&c->sb_lock);
+ kfree(new.entries);
+ goto retry;
+ }
+
+ for (i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry *e =
+ cpu_replicas_entry(&c->replicas, i);
+
+ if (e->data_type == BCH_DATA_journal ||
+ c->usage_base->replicas[i] ||
+ percpu_u64_get(&c->usage[0]->replicas[i]) ||
+ percpu_u64_get(&c->usage[1]->replicas[i]) ||
+ percpu_u64_get(&c->usage[2]->replicas[i]) ||
+ percpu_u64_get(&c->usage[3]->replicas[i]))
+ memcpy(cpu_replicas_entry(&new, new.nr++),
+ e, new.entry_size);
+ }
+
+ bch2_cpu_replicas_sort(&new);
+
+ ret = bch2_cpu_replicas_to_sb_replicas(c, &new) ?:
+ replicas_table_update(c, &new);
+
+ kfree(new.entries);
+
+ percpu_up_write(&c->mark_lock);
+
+ if (!ret)
+ bch2_write_super(c);
+
+ mutex_unlock(&c->sb_lock);
+
+ return ret;
+}
+
+int bch2_replicas_set_usage(struct bch_fs *c,
+ struct bch_replicas_entry *r,
+ u64 sectors)
+{
+ int ret, idx = bch2_replicas_entry_idx(c, r);
+
+ if (idx < 0) {
+ struct bch_replicas_cpu n;
+
+ n = cpu_replicas_add_entry(c, &c->replicas, r);
+ if (!n.entries)
+ return -BCH_ERR_ENOMEM_cpu_replicas;
+
+ ret = replicas_table_update(c, &n);
+ if (ret)
+ return ret;
+
+ kfree(n.entries);
+
+ idx = bch2_replicas_entry_idx(c, r);
+ BUG_ON(idx < 0);
+ }
+
+ c->usage_base->replicas[idx] = sectors;
+
+ return 0;
+}
+
+/* Replicas tracking - superblock: */
+
+static int
+__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
+ struct bch_replicas_cpu *cpu_r)
+{
+ struct bch_replicas_entry *e, *dst;
+ unsigned nr = 0, entry_size = 0, idx = 0;
+
+ for_each_replicas_entry(sb_r, e) {
+ entry_size = max_t(unsigned, entry_size,
+ replicas_entry_bytes(e));
+ nr++;
+ }
+
+ cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
+ if (!cpu_r->entries)
+ return -BCH_ERR_ENOMEM_cpu_replicas;
+
+ cpu_r->nr = nr;
+ cpu_r->entry_size = entry_size;
+
+ for_each_replicas_entry(sb_r, e) {
+ dst = cpu_replicas_entry(cpu_r, idx++);
+ memcpy(dst, e, replicas_entry_bytes(e));
+ bch2_replicas_entry_sort(dst);
+ }
+
+ return 0;
+}
+
+static int
+__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
+ struct bch_replicas_cpu *cpu_r)
+{
+ struct bch_replicas_entry_v0 *e;
+ unsigned nr = 0, entry_size = 0, idx = 0;
+
+ for_each_replicas_entry(sb_r, e) {
+ entry_size = max_t(unsigned, entry_size,
+ replicas_entry_bytes(e));
+ nr++;
+ }
+
+ entry_size += sizeof(struct bch_replicas_entry) -
+ sizeof(struct bch_replicas_entry_v0);
+
+ cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
+ if (!cpu_r->entries)
+ return -BCH_ERR_ENOMEM_cpu_replicas;
+
+ cpu_r->nr = nr;
+ cpu_r->entry_size = entry_size;
+
+ for_each_replicas_entry(sb_r, e) {
+ struct bch_replicas_entry *dst =
+ cpu_replicas_entry(cpu_r, idx++);
+
+ dst->data_type = e->data_type;
+ dst->nr_devs = e->nr_devs;
+ dst->nr_required = 1;
+ memcpy(dst->devs, e->devs, e->nr_devs);
+ bch2_replicas_entry_sort(dst);
+ }
+
+ return 0;
+}
+
+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
+{
+ struct bch_sb_field_replicas *sb_v1;
+ struct bch_sb_field_replicas_v0 *sb_v0;
+ struct bch_replicas_cpu new_r = { 0, 0, NULL };
+ int ret = 0;
+
+ if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
+ ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
+ else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0)))
+ ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
+ if (ret)
+ return ret;
+
+ bch2_cpu_replicas_sort(&new_r);
+
+ percpu_down_write(&c->mark_lock);
+
+ ret = replicas_table_update(c, &new_r);
+ percpu_up_write(&c->mark_lock);
+
+ kfree(new_r.entries);
+
+ return ret;
+}
+
+static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
+ struct bch_replicas_cpu *r)
+{
+ struct bch_sb_field_replicas_v0 *sb_r;
+ struct bch_replicas_entry_v0 *dst;
+ struct bch_replicas_entry *src;
+ size_t bytes;
+
+ bytes = sizeof(struct bch_sb_field_replicas);
+
+ for_each_cpu_replicas_entry(r, src)
+ bytes += replicas_entry_bytes(src) - 1;
+
+ sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
+ DIV_ROUND_UP(bytes, sizeof(u64)));
+ if (!sb_r)
+ return -BCH_ERR_ENOSPC_sb_replicas;
+
+ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
+ sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);
+
+ memset(&sb_r->entries, 0,
+ vstruct_end(&sb_r->field) -
+ (void *) &sb_r->entries);
+
+ dst = sb_r->entries;
+ for_each_cpu_replicas_entry(r, src) {
+ dst->data_type = src->data_type;
+ dst->nr_devs = src->nr_devs;
+ memcpy(dst->devs, src->devs, src->nr_devs);
+
+ dst = replicas_entry_next(dst);
+
+ BUG_ON((void *) dst > vstruct_end(&sb_r->field));
+ }
+
+ return 0;
+}
+
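+/*
+ * Write the cpu replicas table to the superblock, using the older v0 format
+ * (which has no nr_required field) when every entry has nr_required == 1:
+ */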
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
+ struct bch_replicas_cpu *r)
+{
+ struct bch_sb_field_replicas *sb_r;
+ struct bch_replicas_entry *dst, *src;
+ bool need_v1 = false;
+ size_t bytes;
+
+ bytes = sizeof(struct bch_sb_field_replicas);
+
+ for_each_cpu_replicas_entry(r, src) {
+ bytes += replicas_entry_bytes(src);
+ if (src->nr_required != 1)
+ need_v1 = true;
+ }
+
+ if (!need_v1)
+ return bch2_cpu_replicas_to_sb_replicas_v0(c, r);
+
+ sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
+ DIV_ROUND_UP(bytes, sizeof(u64)));
+ if (!sb_r)
+ return -BCH_ERR_ENOSPC_sb_replicas;
+
+ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
+ sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);
+
+ memset(&sb_r->entries, 0,
+ vstruct_end(&sb_r->field) -
+ (void *) &sb_r->entries);
+
+ dst = sb_r->entries;
+ for_each_cpu_replicas_entry(r, src) {
+ memcpy(dst, src, replicas_entry_bytes(src));
+
+ dst = replicas_entry_next(dst);
+
+ BUG_ON((void *) dst > vstruct_end(&sb_r->field));
+ }
+
+ return 0;
+}
+
+static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
+ struct bch_sb *sb,
+ struct printbuf *err)
+{
+ unsigned i;
+
+ sort_cmp_size(cpu_r->entries,
+ cpu_r->nr,
+ cpu_r->entry_size,
+ memcmp, NULL);
+
+ for (i = 0; i < cpu_r->nr; i++) {
+ struct bch_replicas_entry *e =
+ cpu_replicas_entry(cpu_r, i);
+
+ int ret = bch2_replicas_entry_validate(e, sb, err);
+ if (ret)
+ return ret;
+
+ if (i + 1 < cpu_r->nr) {
+ struct bch_replicas_entry *n =
+ cpu_replicas_entry(cpu_r, i + 1);
+
+ BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
+
+ if (!memcmp(e, n, cpu_r->entry_size)) {
+ prt_printf(err, "duplicate replicas entry ");
+ bch2_replicas_entry_to_text(err, e);
+ return -BCH_ERR_invalid_sb_replicas;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
+ struct bch_replicas_cpu cpu_r;
+ int ret;
+
+ ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r);
+ if (ret)
+ return ret;
+
+ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
+ kfree(cpu_r.entries);
+ return ret;
+}
+
+static void bch2_sb_replicas_to_text(struct printbuf *out,
+ struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_replicas *r = field_to_type(f, replicas);
+ struct bch_replicas_entry *e;
+ bool first = true;
+
+ for_each_replicas_entry(r, e) {
+ if (!first)
+ prt_printf(out, " ");
+ first = false;
+
+ bch2_replicas_entry_to_text(out, e);
+ }
+ prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
+ .validate = bch2_sb_replicas_validate,
+ .to_text = bch2_sb_replicas_to_text,
+};
+
+static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
+ struct bch_replicas_cpu cpu_r;
+ int ret;
+
+ ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r);
+ if (ret)
+ return ret;
+
+ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
+ kfree(cpu_r.entries);
+ return ret;
+}
+
+static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
+ struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
+ struct bch_replicas_entry_v0 *e;
+ bool first = true;
+
+ for_each_replicas_entry(sb_r, e) {
+ if (!first)
+ prt_printf(out, " ");
+ first = false;
+
+ bch2_replicas_entry_v0_to_text(out, e);
+ }
+ prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
+ .validate = bch2_sb_replicas_v0_validate,
+ .to_text = bch2_sb_replicas_v0_to_text,
+};
+
+/* Query replicas: */
+
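+/*
+ * Returns true if, with only the devices in @devs online, every replicas entry
+ * still has at least nr_required replicas available - or if the caller
+ * explicitly allowed degraded/lost data via the matching BCH_FORCE_IF_* flags:
+ */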
+bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
+ unsigned flags, bool print)
+{
+ struct bch_replicas_entry *e;
+ bool ret = true;
+
+ percpu_down_read(&c->mark_lock);
+ for_each_cpu_replicas_entry(&c->replicas, e) {
+ unsigned i, nr_online = 0, nr_failed = 0, dflags = 0;
+ bool metadata = e->data_type < BCH_DATA_user;
+
+ if (e->data_type == BCH_DATA_cached)
+ continue;
+
+ for (i = 0; i < e->nr_devs; i++) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]);
+
+ nr_online += test_bit(e->devs[i], devs.d);
+ nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed;
+ }
+
+ if (nr_failed == e->nr_devs)
+ continue;
+
+ if (nr_online < e->nr_required)
+ dflags |= metadata
+ ? BCH_FORCE_IF_METADATA_LOST
+ : BCH_FORCE_IF_DATA_LOST;
+
+ if (nr_online < e->nr_devs)
+ dflags |= metadata
+ ? BCH_FORCE_IF_METADATA_DEGRADED
+ : BCH_FORCE_IF_DATA_DEGRADED;
+
+ if (dflags & ~flags) {
+ if (print) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_replicas_entry_to_text(&buf, e);
+ bch_err(c, "insufficient devices online (%u) for replicas entry %s",
+ nr_online, buf.buf);
+ printbuf_exit(&buf);
+ }
+ ret = false;
+ break;
+ }
+
+ }
+ percpu_up_read(&c->mark_lock);
+
+ return ret;
+}
+
+unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
+{
+ struct bch_sb_field_replicas *replicas;
+ struct bch_sb_field_replicas_v0 *replicas_v0;
+ unsigned i, data_has = 0;
+
+ replicas = bch2_sb_field_get(sb, replicas);
+ replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
+
+ if (replicas) {
+ struct bch_replicas_entry *r;
+
+ for_each_replicas_entry(replicas, r)
+ for (i = 0; i < r->nr_devs; i++)
+ if (r->devs[i] == dev)
+ data_has |= 1 << r->data_type;
+ } else if (replicas_v0) {
+ struct bch_replicas_entry_v0 *r;
+
+ for_each_replicas_entry_v0(replicas_v0, r)
+ for (i = 0; i < r->nr_devs; i++)
+ if (r->devs[i] == dev)
+ data_has |= 1 << r->data_type;
+ }
+
+ return data_has;
+}
+
+unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
+{
+ unsigned ret;
+
+ mutex_lock(&c->sb_lock);
+ ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
+ mutex_unlock(&c->sb_lock);
+
+ return ret;
+}
+
+void bch2_fs_replicas_exit(struct bch_fs *c)
+{
+ unsigned i;
+
+ kfree(c->usage_scratch);
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ free_percpu(c->usage[i]);
+ kfree(c->usage_base);
+ kfree(c->replicas.entries);
+ kfree(c->replicas_gc.entries);
+
+ mempool_exit(&c->replicas_delta_pool);
+}
+
+int bch2_fs_replicas_init(struct bch_fs *c)
+{
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->replicas_journal_res,
+ reserve_journal_replicas(c, &c->replicas));
+
+ return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1,
+ REPLICAS_DELTA_LIST_MAX) ?:
+ replicas_table_update(c, &c->replicas);
+}
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
new file mode 100644
index 000000000000..f70a642775d1
--- /dev/null
+++ b/fs/bcachefs/replicas.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REPLICAS_H
+#define _BCACHEFS_REPLICAS_H
+
+#include "bkey.h"
+#include "eytzinger.h"
+#include "replicas_types.h"
+
+void bch2_replicas_entry_sort(struct bch_replicas_entry *);
+void bch2_replicas_entry_to_text(struct printbuf *,
+ struct bch_replicas_entry *);
+int bch2_replicas_entry_validate(struct bch_replicas_entry *,
+ struct bch_sb *, struct printbuf *);
+void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
+
+static inline struct bch_replicas_entry *
+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
+{
+ return (void *) r->entries + r->entry_size * i;
+}
+
+int bch2_replicas_entry_idx(struct bch_fs *,
+ struct bch_replicas_entry *);
+
+void bch2_devlist_to_replicas(struct bch_replicas_entry *,
+ enum bch_data_type,
+ struct bch_devs_list);
+bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *);
+int bch2_mark_replicas(struct bch_fs *,
+ struct bch_replicas_entry *);
+
+static inline struct replicas_delta *
+replicas_delta_next(struct replicas_delta *d)
+{
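+ /* 8 == sizeof(d->delta); struct replicas_delta is __packed, so no padding */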
+ return (void *) d + replicas_entry_bytes(&d->r) + 8;
+}
+
+int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
+
+void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
+
+static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
+ unsigned dev)
+{
+ e->data_type = BCH_DATA_cached;
+ e->nr_devs = 1;
+ e->nr_required = 1;
+ e->devs[0] = dev;
+}
+
+bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask,
+ unsigned, bool);
+
+unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned);
+unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
+
+int bch2_replicas_gc_end(struct bch_fs *, int);
+int bch2_replicas_gc_start(struct bch_fs *, unsigned);
+int bch2_replicas_gc2(struct bch_fs *);
+
+int bch2_replicas_set_usage(struct bch_fs *,
+ struct bch_replicas_entry *,
+ u64);
+
+#define for_each_cpu_replicas_entry(_r, _i) \
+ for (_i = (_r)->entries; \
+ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
+ _i = (void *) (_i) + (_r)->entry_size)
+
+/* iterate over superblock replicas - used by userspace tools: */
+
+#define replicas_entry_next(_i) \
+ ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i)))
+
+#define for_each_replicas_entry(_r, _i) \
+ for (_i = (_r)->entries; \
+ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
+ (_i) = replicas_entry_next(_i))
+
+#define for_each_replicas_entry_v0(_r, _i) \
+ for (_i = (_r)->entries; \
+ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
+ (_i) = replicas_entry_next(_i))
+
+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas;
+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0;
+
+void bch2_fs_replicas_exit(struct bch_fs *);
+int bch2_fs_replicas_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_REPLICAS_H */
diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h
new file mode 100644
index 000000000000..5cfff489bbc3
--- /dev/null
+++ b/fs/bcachefs/replicas_types.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REPLICAS_TYPES_H
+#define _BCACHEFS_REPLICAS_TYPES_H
+
+struct bch_replicas_cpu {
+ unsigned nr;
+ unsigned entry_size;
+ struct bch_replicas_entry *entries;
+};
+
+struct replicas_delta {
+ s64 delta;
+ struct bch_replicas_entry r;
+} __packed;
+
+struct replicas_delta_list {
+ unsigned size;
+ unsigned used;
+
+ struct {} memset_start;
+ u64 nr_inodes;
+ u64 persistent_reserved[BCH_REPLICAS_MAX];
+ struct {} memset_end;
+ struct replicas_delta d[0];
+};
+
+#endif /* _BCACHEFS_REPLICAS_TYPES_H */
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
new file mode 100644
index 000000000000..c76ad8ea5e4a
--- /dev/null
+++ b/fs/bcachefs/sb-clean.c
@@ -0,0 +1,396 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "error.h"
+#include "journal_io.h"
+#include "replicas.h"
+#include "sb-clean.h"
+#include "super-io.h"
+
+/*
+ * BCH_SB_FIELD_clean:
+ *
+ * Btree roots, and a few other things, are recovered from the journal after an
+ * unclean shutdown - but after a clean shutdown, to avoid having to read the
+ * journal, we can store them in the superblock.
+ *
+ * bch_sb_field_clean simply contains a list of journal entries, stored exactly
+ * as they would be in the journal:
+ */
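+/*
+ * In practice the clean section written by bch2_fs_mark_clean() holds the
+ * btree roots (via bch2_btree_roots_to_journal_entries()) plus the usage
+ * entries emitted by bch2_journal_super_entries_add_common().
+ */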
+
+int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean,
+ int write)
+{
+ struct jset_entry *entry;
+ int ret;
+
+ for (entry = clean->start;
+ entry < (struct jset_entry *) vstruct_end(&clean->field);
+ entry = vstruct_next(entry)) {
+ ret = bch2_journal_entry_validate(c, NULL, entry,
+ le16_to_cpu(c->disk_sb.sb->version),
+ BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
+ write);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static struct bkey_i *btree_root_find(struct bch_fs *c,
+ struct bch_sb_field_clean *clean,
+ struct jset *j,
+ enum btree_id id, unsigned *level)
+{
+ struct bkey_i *k;
+ struct jset_entry *entry, *start, *end;
+
+ if (clean) {
+ start = clean->start;
+ end = vstruct_end(&clean->field);
+ } else {
+ start = j->start;
+ end = vstruct_last(j);
+ }
+
+ for (entry = start; entry < end; entry = vstruct_next(entry))
+ if (entry->type == BCH_JSET_ENTRY_btree_root &&
+ entry->btree_id == id)
+ goto found;
+
+ return NULL;
+found:
+ if (!entry->u64s)
+ return ERR_PTR(-EINVAL);
+
+ k = entry->start;
+ *level = entry->level;
+ return k;
+}
+
+int bch2_verify_superblock_clean(struct bch_fs *c,
+ struct bch_sb_field_clean **cleanp,
+ struct jset *j)
+{
+ unsigned i;
+ struct bch_sb_field_clean *clean = *cleanp;
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+ int ret = 0;
+
+ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
+ sb_clean_journal_seq_mismatch,
+ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
+ le64_to_cpu(clean->journal_seq),
+ le64_to_cpu(j->seq))) {
+ kfree(clean);
+ *cleanp = NULL;
+ return 0;
+ }
+
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ struct bkey_i *k1, *k2;
+ unsigned l1 = 0, l2 = 0;
+
+ k1 = btree_root_find(c, clean, NULL, i, &l1);
+ k2 = btree_root_find(c, NULL, j, i, &l2);
+
+ if (!k1 && !k2)
+ continue;
+
+ printbuf_reset(&buf1);
+ printbuf_reset(&buf2);
+
+ if (k1)
+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1));
+ else
+ prt_printf(&buf1, "(none)");
+
+ if (k2)
+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2));
+ else
+ prt_printf(&buf2, "(none)");
+
+ mustfix_fsck_err_on(!k1 || !k2 ||
+ IS_ERR(k1) ||
+ IS_ERR(k2) ||
+ k1->k.u64s != k2->k.u64s ||
+ memcmp(k1, k2, bkey_bytes(&k1->k)) ||
+ l1 != l2, c,
+ sb_clean_btree_root_mismatch,
+ "superblock btree root %u doesn't match journal after clean shutdown\n"
+ "sb: l=%u %s\n"
+ "journal: l=%u %s\n", i,
+ l1, buf1.buf,
+ l2, buf2.buf);
+ }
+fsck_err:
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ return ret;
+}
+
+struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
+{
+ struct bch_sb_field_clean *clean, *sb_clean;
+ int ret;
+
+ mutex_lock(&c->sb_lock);
+ sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean);
+
+ if (fsck_err_on(!sb_clean, c,
+ sb_clean_missing,
+ "superblock marked clean but clean section not present")) {
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+ c->sb.clean = false;
+ mutex_unlock(&c->sb_lock);
+ return NULL;
+ }
+
+ clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
+ GFP_KERNEL);
+ if (!clean) {
+ mutex_unlock(&c->sb_lock);
+ return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean);
+ }
+
+ ret = bch2_sb_clean_validate_late(c, clean, READ);
+ if (ret) {
+ mutex_unlock(&c->sb_lock);
+ return ERR_PTR(ret);
+ }
+
+ mutex_unlock(&c->sb_lock);
+
+ return clean;
+fsck_err:
+ mutex_unlock(&c->sb_lock);
+ return ERR_PTR(ret);
+}
+
+static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
+{
+ struct jset_entry *entry = *end;
+ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
+
+ memset(entry, 0, u64s * sizeof(u64));
+ /*
+ * The u64s field counts only the entry's data, not the shared jset_entry
+ * header fields - hence the "u64s - 1" below.
+ */
+ entry->u64s = cpu_to_le16(u64s - 1);
+
+ *end = vstruct_next(*end);
+ return entry;
+}
+
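+/*
+ * Emit the accounting entries that get stored with journal entries and in the
+ * superblock clean section: filesystem usage (nr_inodes, key_version,
+ * persistent_reserved), per-replicas-entry data usage, per-device usage, and
+ * the read/write IO clocks:
+ */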
+void bch2_journal_super_entries_add_common(struct bch_fs *c,
+ struct jset_entry **end,
+ u64 journal_seq)
+{
+ struct bch_dev *ca;
+ unsigned i, dev;
+
+ percpu_down_read(&c->mark_lock);
+
+ if (!journal_seq) {
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ bch2_fs_usage_acc_to_base(c, i);
+ } else {
+ bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
+ }
+
+ {
+ struct jset_entry_usage *u =
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
+
+ u->entry.type = BCH_JSET_ENTRY_usage;
+ u->entry.btree_id = BCH_FS_USAGE_inodes;
+ u->v = cpu_to_le64(c->usage_base->nr_inodes);
+ }
+
+ {
+ struct jset_entry_usage *u =
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
+
+ u->entry.type = BCH_JSET_ENTRY_usage;
+ u->entry.btree_id = BCH_FS_USAGE_key_version;
+ u->v = cpu_to_le64(atomic64_read(&c->key_version));
+ }
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+ struct jset_entry_usage *u =
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
+
+ u->entry.type = BCH_JSET_ENTRY_usage;
+ u->entry.btree_id = BCH_FS_USAGE_reserved;
+ u->entry.level = i;
+ u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
+ }
+
+ for (i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry *e =
+ cpu_replicas_entry(&c->replicas, i);
+ struct jset_entry_data_usage *u =
+ container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
+ struct jset_entry_data_usage, entry);
+
+ u->entry.type = BCH_JSET_ENTRY_data_usage;
+ u->v = cpu_to_le64(c->usage_base->replicas[i]);
+ unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
+ "embedded variable length struct");
+ }
+
+ for_each_member_device(ca, c, dev) {
+ unsigned b = sizeof(struct jset_entry_dev_usage) +
+ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
+ struct jset_entry_dev_usage *u =
+ container_of(jset_entry_init(end, b),
+ struct jset_entry_dev_usage, entry);
+
+ u->entry.type = BCH_JSET_ENTRY_dev_usage;
+ u->dev = cpu_to_le32(dev);
+ u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec);
+
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
+ u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
+ u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
+ }
+ }
+
+ percpu_up_read(&c->mark_lock);
+
+ for (i = 0; i < 2; i++) {
+ struct jset_entry_clock *clock =
+ container_of(jset_entry_init(end, sizeof(*clock)),
+ struct jset_entry_clock, entry);
+
+ clock->entry.type = BCH_JSET_ENTRY_clock;
+ clock->rw = i;
+ clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
+ }
+}
+
+static int bch2_sb_clean_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_clean *clean = field_to_type(f, clean);
+
+ if (vstruct_bytes(&clean->field) < sizeof(*clean)) {
+ prt_printf(err, "wrong size (got %zu should be %zu)",
+ vstruct_bytes(&clean->field), sizeof(*clean));
+ return -BCH_ERR_invalid_sb_clean;
+ }
+
+ return 0;
+}
+
+static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_clean *clean = field_to_type(f, clean);
+ struct jset_entry *entry;
+
+ prt_printf(out, "flags: %x", le32_to_cpu(clean->flags));
+ prt_newline(out);
+ prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq));
+ prt_newline(out);
+
+ for (entry = clean->start;
+ entry != vstruct_end(&clean->field);
+ entry = vstruct_next(entry)) {
+ if (entry->type == BCH_JSET_ENTRY_btree_keys &&
+ !entry->u64s)
+ continue;
+
+ bch2_journal_entry_to_text(out, NULL, entry);
+ prt_newline(out);
+ }
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_clean = {
+ .validate = bch2_sb_clean_validate,
+ .to_text = bch2_sb_clean_to_text,
+};
+
+int bch2_fs_mark_dirty(struct bch_fs *c)
+{
+ int ret;
+
+ /*
+ * Unconditionally write superblock, to verify it hasn't changed before
+ * we go rw:
+ */
+
+ mutex_lock(&c->sb_lock);
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
+
+ ret = bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return ret;
+}
+
+void bch2_fs_mark_clean(struct bch_fs *c)
+{
+ struct bch_sb_field_clean *sb_clean;
+ struct jset_entry *entry;
+ unsigned u64s;
+ int ret;
+
+ mutex_lock(&c->sb_lock);
+ if (BCH_SB_CLEAN(c->disk_sb.sb))
+ goto out;
+
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
+
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata);
+ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates));
+ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled));
+
+ u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
+
+ sb_clean = bch2_sb_field_resize(&c->disk_sb, clean, u64s);
+ if (!sb_clean) {
+ bch_err(c, "error resizing superblock while setting filesystem clean");
+ goto out;
+ }
+
+ sb_clean->flags = 0;
+ sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq));
+
+ /* Trying to catch outstanding bug: */
+ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
+
+ entry = sb_clean->start;
+ bch2_journal_super_entries_add_common(c, &entry, 0);
+ entry = bch2_btree_roots_to_journal_entries(c, entry, 0);
+ BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
+
+ memset(entry, 0,
+ vstruct_end(&sb_clean->field) - (void *) entry);
+
+ /*
+ * this should be in the write path, and we should be validating every
+ * superblock section:
+ */
+ ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE);
+ if (ret) {
+ bch_err(c, "error writing marking filesystem clean: validate error");
+ goto out;
+ }
+
+ bch2_write_super(c);
+out:
+ mutex_unlock(&c->sb_lock);
+}
diff --git a/fs/bcachefs/sb-clean.h b/fs/bcachefs/sb-clean.h
new file mode 100644
index 000000000000..71caef281239
--- /dev/null
+++ b/fs/bcachefs/sb-clean.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_CLEAN_H
+#define _BCACHEFS_SB_CLEAN_H
+
+int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);
+int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **,
+ struct jset *);
+struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *);
+void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_clean;
+
+int bch2_fs_mark_dirty(struct bch_fs *);
+void bch2_fs_mark_clean(struct bch_fs *);
+
+#endif /* _BCACHEFS_SB_CLEAN_H */
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
new file mode 100644
index 000000000000..4919237bbe73
--- /dev/null
+++ b/fs/bcachefs/sb-downgrade.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Superblock section that contains a list of recovery passes to run when
+ * downgrading past a given version
+ */
+
+#include "bcachefs.h"
+#include "darray.h"
+#include "recovery.h"
+#include "sb-downgrade.h"
+#include "sb-errors.h"
+#include "super-io.h"
+
+/*
+ * Downgrade table:
+ * When downgrading past certain versions, we need to run certain recovery passes
+ * and fix certain errors:
+ *
+ * x(version, recovery_passes, errors...)
+ */
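+/*
+ * The table below is currently empty; a hypothetical entry (names here are
+ * placeholders, not real versions, passes or errors) would look like:
+ *
+ *	x(some_version, BIT_ULL(SOME_RECOVERY_PASS), BCH_FSCK_ERR_some_error)
+ */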
+
+#define DOWNGRADE_TABLE()
+
+struct downgrade_entry {
+ u64 recovery_passes;
+ u16 version;
+ u16 nr_errors;
+ const u16 *errors;
+};
+
+#define x(ver, passes, ...) static const u16 ver##_errors[] = { __VA_ARGS__ };
+DOWNGRADE_TABLE()
+#undef x
+
+static const struct downgrade_entry downgrade_table[] = {
+#define x(ver, passes, ...) { \
+ .recovery_passes = passes, \
+ .version = bcachefs_metadata_version_##ver,\
+ .nr_errors = ARRAY_SIZE(ver##_errors), \
+ .errors = ver##_errors, \
+},
+DOWNGRADE_TABLE()
+#undef x
+};
+
+static inline const struct bch_sb_field_downgrade_entry *
+downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e)
+{
+ return (void *) &e->errors[le16_to_cpu(e->nr_errors)];
+}
+
+#define for_each_downgrade_entry(_d, _i) \
+ for (const struct bch_sb_field_downgrade_entry *_i = (_d)->entries; \
+ (void *) _i < vstruct_end(&(_d)->field) && \
+ (void *) &_i->errors[0] < vstruct_end(&(_d)->field); \
+ _i = downgrade_entry_next_c(_i))
+
+static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);
+
+ for_each_downgrade_entry(e, i) {
+ if (BCH_VERSION_MAJOR(le16_to_cpu(i->version)) !=
+ BCH_VERSION_MAJOR(le16_to_cpu(sb->version))) {
+ prt_printf(err, "downgrade entry with mismatched major version (%u != %u)",
+ BCH_VERSION_MAJOR(le16_to_cpu(i->version)),
+ BCH_VERSION_MAJOR(le16_to_cpu(sb->version)));
+ return -BCH_ERR_invalid_sb_downgrade;
+ }
+ }
+
+ return 0;
+}
+
+static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);
+
+ if (out->nr_tabstops <= 1)
+ printbuf_tabstop_push(out, 16);
+
+ for_each_downgrade_entry(e, i) {
+ prt_str(out, "version:");
+ prt_tab(out);
+ bch2_version_to_text(out, le16_to_cpu(i->version));
+ prt_newline(out);
+
+ prt_str(out, "recovery passes:");
+ prt_tab(out);
+ prt_bitflags(out, bch2_recovery_passes,
+ bch2_recovery_passes_from_stable(le64_to_cpu(i->recovery_passes[0])));
+ prt_newline(out);
+
+ prt_str(out, "errors:");
+ prt_tab(out);
+ bool first = true;
+ for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
+ if (!first)
+ prt_char(out, ',');
+ first = false;
+ unsigned e = le16_to_cpu(i->errors[j]);
+ prt_str(out, e < BCH_SB_ERR_MAX ? bch2_sb_error_strs[e] : "(unknown)");
+ }
+ prt_newline(out);
+ }
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_downgrade = {
+ .validate = bch2_sb_downgrade_validate,
+ .to_text = bch2_sb_downgrade_to_text,
+};
+
+int bch2_sb_downgrade_update(struct bch_fs *c)
+{
+ darray_char table = {};
+ int ret = 0;
+
+ for (const struct downgrade_entry *src = downgrade_table;
+ src < downgrade_table + ARRAY_SIZE(downgrade_table);
+ src++) {
+ if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
+ continue;
+
+ struct bch_sb_field_downgrade_entry *dst;
+ unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors;
+
+ ret = darray_make_room(&table, bytes);
+ if (ret)
+ goto out;
+
+ dst = (void *) &darray_top(table);
+ dst->version = cpu_to_le16(src->version);
+ dst->recovery_passes[0] = cpu_to_le64(src->recovery_passes);
+ dst->recovery_passes[1] = 0;
+ dst->nr_errors = cpu_to_le16(src->nr_errors);
+ for (unsigned i = 0; i < src->nr_errors; i++)
+ dst->errors[i] = cpu_to_le16(src->errors[i]);
+
+ table.nr += bytes;
+ }
+
+ struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);
+
+ unsigned sb_u64s = DIV_ROUND_UP(sizeof(*d) + table.nr, sizeof(u64));
+
+ if (d && le32_to_cpu(d->field.u64s) > sb_u64s)
+ goto out;
+
+ d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s);
+ if (!d) {
+ ret = -BCH_ERR_ENOSPC_sb_downgrade;
+ goto out;
+ }
+
+ memcpy(d->entries, table.data, table.nr);
+ memset_u64s_tail(d->entries, 0, table.nr);
+out:
+ darray_exit(&table);
+ return ret;
+}
+
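+/*
+ * Downgrading from old_minor to new_minor: for every downgrade table entry in
+ * that version range, require its recovery passes to be run and silence its
+ * listed errors:
+ */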
+void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_minor)
+{
+ struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);
+ if (!d)
+ return;
+
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ for_each_downgrade_entry(d, i) {
+ unsigned minor = BCH_VERSION_MINOR(le16_to_cpu(i->version));
+ if (new_minor < minor && minor <= old_minor) {
+ ext->recovery_passes_required[0] |= i->recovery_passes[0];
+ ext->recovery_passes_required[1] |= i->recovery_passes[1];
+
+ for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
+ unsigned e = le16_to_cpu(i->errors[j]);
+ if (e < BCH_SB_ERR_MAX)
+ __set_bit(e, c->sb.errors_silent);
+ if (e < sizeof(ext->errors_silent) * 8)
+ ext->errors_silent[e / 64] |= cpu_to_le64(BIT_ULL(e % 64));
+ }
+ }
+ }
+}
diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h
new file mode 100644
index 000000000000..bc48fd2ca70e
--- /dev/null
+++ b/fs/bcachefs/sb-downgrade.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_DOWNGRADE_H
+#define _BCACHEFS_SB_DOWNGRADE_H
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade;
+
+int bch2_sb_downgrade_update(struct bch_fs *);
+void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned);
+
+#endif /* _BCACHEFS_SB_DOWNGRADE_H */
diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
new file mode 100644
index 000000000000..5f5bcae391fb
--- /dev/null
+++ b/fs/bcachefs/sb-errors.c
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "sb-errors.h"
+#include "super-io.h"
+
+const char * const bch2_sb_error_strs[] = {
+#define x(t, n, ...) [n] = #t,
+ BCH_SB_ERRS()
+ NULL
+};
+
+static void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id)
+{
+ if (id < BCH_SB_ERR_MAX)
+ prt_str(out, bch2_sb_error_strs[id]);
+ else
+ prt_printf(out, "(unknown error %u)", id);
+}
+
+static inline unsigned bch2_sb_field_errors_nr_entries(struct bch_sb_field_errors *e)
+{
+ return bch2_sb_field_nr_entries(e);
+}
+
+static inline unsigned bch2_sb_field_errors_u64s(unsigned nr)
+{
+ return (sizeof(struct bch_sb_field_errors) +
+ sizeof(struct bch_sb_field_error_entry) * nr) / sizeof(u64);
+}
+
+static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_errors *e = field_to_type(f, errors);
+ unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
+
+ for (i = 0; i < nr; i++) {
+ if (!BCH_SB_ERROR_ENTRY_NR(&e->entries[i])) {
+ prt_printf(err, "entry with count 0 (id ");
+ bch2_sb_error_id_to_text(err, BCH_SB_ERROR_ENTRY_ID(&e->entries[i]));
+ prt_printf(err, ")");
+ return -BCH_ERR_invalid_sb_errors;
+ }
+
+ if (i + 1 < nr &&
+ BCH_SB_ERROR_ENTRY_ID(&e->entries[i]) >=
+ BCH_SB_ERROR_ENTRY_ID(&e->entries[i + 1])) {
+ prt_printf(err, "entries out of order");
+ return -BCH_ERR_invalid_sb_errors;
+ }
+ }
+
+ return 0;
+}
+
+static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_errors *e = field_to_type(f, errors);
+ unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
+
+ if (out->nr_tabstops <= 1)
+ printbuf_tabstop_push(out, 16);
+
+ for (i = 0; i < nr; i++) {
+ bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(&e->entries[i]));
+ prt_tab(out);
+ prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i]));
+ prt_tab(out);
+ bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time));
+ prt_newline(out);
+ }
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_errors = {
+ .validate = bch2_sb_errors_validate,
+ .to_text = bch2_sb_errors_to_text,
+};
+
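+/*
+ * Count an occurrence of @err in memory; c->fsck_error_counts is kept sorted
+ * by error id, so new ids are inserted where the linear scan stops:
+ */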
+void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err)
+{
+ bch_sb_errors_cpu *e = &c->fsck_error_counts;
+ struct bch_sb_error_entry_cpu n = {
+ .id = err,
+ .nr = 1,
+ .last_error_time = ktime_get_real_seconds()
+ };
+ unsigned i;
+
+ mutex_lock(&c->fsck_error_counts_lock);
+ for (i = 0; i < e->nr; i++) {
+ if (err == e->data[i].id) {
+ e->data[i].nr++;
+ e->data[i].last_error_time = n.last_error_time;
+ goto out;
+ }
+ if (err < e->data[i].id)
+ break;
+ }
+
+ if (darray_make_room(e, 1))
+ goto out;
+
+ darray_insert_item(e, i, n);
+out:
+ mutex_unlock(&c->fsck_error_counts_lock);
+}
+
+void bch2_sb_errors_from_cpu(struct bch_fs *c)
+{
+ bch_sb_errors_cpu *src = &c->fsck_error_counts;
+ struct bch_sb_field_errors *dst =
+ bch2_sb_field_resize(&c->disk_sb, errors,
+ bch2_sb_field_errors_u64s(src->nr));
+ unsigned i;
+
+ if (!dst)
+ return;
+
+ for (i = 0; i < src->nr; i++) {
+ SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id);
+ SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr);
+ dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time);
+ }
+}
+
+static int bch2_sb_errors_to_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_errors *src = bch2_sb_field_get(c->disk_sb.sb, errors);
+ bch_sb_errors_cpu *dst = &c->fsck_error_counts;
+ unsigned i, nr = bch2_sb_field_errors_nr_entries(src);
+ int ret;
+
+ if (!nr)
+ return 0;
+
+ mutex_lock(&c->fsck_error_counts_lock);
+ ret = darray_make_room(dst, nr);
+ if (ret)
+ goto err;
+
+ dst->nr = nr;
+
+ for (i = 0; i < nr; i++) {
+ dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]);
+ dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]);
+ dst->data[i].last_error_time = le64_to_cpu(src->entries[i].last_error_time);
+ }
+err:
+ mutex_unlock(&c->fsck_error_counts_lock);
+
+ return ret;
+}
+
+void bch2_fs_sb_errors_exit(struct bch_fs *c)
+{
+ darray_exit(&c->fsck_error_counts);
+}
+
+void bch2_fs_sb_errors_init_early(struct bch_fs *c)
+{
+ mutex_init(&c->fsck_error_counts_lock);
+ darray_init(&c->fsck_error_counts);
+}
+
+int bch2_fs_sb_errors_init(struct bch_fs *c)
+{
+ return bch2_sb_errors_to_cpu(c);
+}
diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h
new file mode 100644
index 000000000000..8889001e7db4
--- /dev/null
+++ b/fs/bcachefs/sb-errors.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_ERRORS_H
+#define _BCACHEFS_SB_ERRORS_H
+
+#include "sb-errors_types.h"
+
+extern const char * const bch2_sb_error_strs[];
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
+
+void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id);
+
+void bch2_sb_errors_from_cpu(struct bch_fs *);
+
+void bch2_fs_sb_errors_exit(struct bch_fs *);
+void bch2_fs_sb_errors_init_early(struct bch_fs *);
+int bch2_fs_sb_errors_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_SB_ERRORS_H */
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
new file mode 100644
index 000000000000..3504c2d09c29
--- /dev/null
+++ b/fs/bcachefs/sb-errors_types.h
@@ -0,0 +1,269 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_ERRORS_TYPES_H
+#define _BCACHEFS_SB_ERRORS_TYPES_H
+
+#include "darray.h"
+
+#define BCH_SB_ERRS() \
+ x(clean_but_journal_not_empty, 0) \
+ x(dirty_but_no_journal_entries, 1) \
+ x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \
+ x(sb_clean_journal_seq_mismatch, 3) \
+ x(sb_clean_btree_root_mismatch, 4) \
+ x(sb_clean_missing, 5) \
+ x(jset_unsupported_version, 6) \
+ x(jset_unknown_csum, 7) \
+ x(jset_last_seq_newer_than_seq, 8) \
+ x(jset_past_bucket_end, 9) \
+ x(jset_seq_blacklisted, 10) \
+ x(journal_entries_missing, 11) \
+ x(journal_entry_replicas_not_marked, 12) \
+ x(journal_entry_past_jset_end, 13) \
+ x(journal_entry_replicas_data_mismatch, 14) \
+ x(journal_entry_bkey_u64s_0, 15) \
+ x(journal_entry_bkey_past_end, 16) \
+ x(journal_entry_bkey_bad_format, 17) \
+ x(journal_entry_bkey_invalid, 18) \
+ x(journal_entry_btree_root_bad_size, 19) \
+ x(journal_entry_blacklist_bad_size, 20) \
+ x(journal_entry_blacklist_v2_bad_size, 21) \
+ x(journal_entry_blacklist_v2_start_past_end, 22) \
+ x(journal_entry_usage_bad_size, 23) \
+ x(journal_entry_data_usage_bad_size, 24) \
+ x(journal_entry_clock_bad_size, 25) \
+ x(journal_entry_clock_bad_rw, 26) \
+ x(journal_entry_dev_usage_bad_size, 27) \
+ x(journal_entry_dev_usage_bad_dev, 28) \
+ x(journal_entry_dev_usage_bad_pad, 29) \
+ x(btree_node_unreadable, 30) \
+ x(btree_node_fault_injected, 31) \
+ x(btree_node_bad_magic, 32) \
+ x(btree_node_bad_seq, 33) \
+ x(btree_node_unsupported_version, 34) \
+ x(btree_node_bset_older_than_sb_min, 35) \
+ x(btree_node_bset_newer_than_sb, 36) \
+ x(btree_node_data_missing, 37) \
+ x(btree_node_bset_after_end, 38) \
+ x(btree_node_replicas_sectors_written_mismatch, 39) \
+ x(btree_node_replicas_data_mismatch, 40) \
+ x(bset_unknown_csum, 41) \
+ x(bset_bad_csum, 42) \
+ x(bset_past_end_of_btree_node, 43) \
+ x(bset_wrong_sector_offset, 44) \
+ x(bset_empty, 45) \
+ x(bset_bad_seq, 46) \
+ x(bset_blacklisted_journal_seq, 47) \
+ x(first_bset_blacklisted_journal_seq, 48) \
+ x(btree_node_bad_btree, 49) \
+ x(btree_node_bad_level, 50) \
+ x(btree_node_bad_min_key, 51) \
+ x(btree_node_bad_max_key, 52) \
+ x(btree_node_bad_format, 53) \
+ x(btree_node_bkey_past_bset_end, 54) \
+ x(btree_node_bkey_bad_format, 55) \
+ x(btree_node_bad_bkey, 56) \
+ x(btree_node_bkey_out_of_order, 57) \
+ x(btree_root_bkey_invalid, 58) \
+ x(btree_root_read_error, 59) \
+ x(btree_root_bad_min_key, 60) \
+ x(btree_root_bad_max_key, 61) \
+ x(btree_node_read_error, 62) \
+ x(btree_node_topology_bad_min_key, 63) \
+ x(btree_node_topology_bad_max_key, 64) \
+ x(btree_node_topology_overwritten_by_prev_node, 65) \
+ x(btree_node_topology_overwritten_by_next_node, 66) \
+ x(btree_node_topology_interior_node_empty, 67) \
+ x(fs_usage_hidden_wrong, 68) \
+ x(fs_usage_btree_wrong, 69) \
+ x(fs_usage_data_wrong, 70) \
+ x(fs_usage_cached_wrong, 71) \
+ x(fs_usage_reserved_wrong, 72) \
+ x(fs_usage_persistent_reserved_wrong, 73) \
+ x(fs_usage_nr_inodes_wrong, 74) \
+ x(fs_usage_replicas_wrong, 75) \
+ x(dev_usage_buckets_wrong, 76) \
+ x(dev_usage_sectors_wrong, 77) \
+ x(dev_usage_fragmented_wrong, 78) \
+ x(dev_usage_buckets_ec_wrong, 79) \
+ x(bkey_version_in_future, 80) \
+ x(bkey_u64s_too_small, 81) \
+ x(bkey_invalid_type_for_btree, 82) \
+ x(bkey_extent_size_zero, 83) \
+ x(bkey_extent_size_greater_than_offset, 84) \
+ x(bkey_size_nonzero, 85) \
+ x(bkey_snapshot_nonzero, 86) \
+ x(bkey_snapshot_zero, 87) \
+ x(bkey_at_pos_max, 88) \
+ x(bkey_before_start_of_btree_node, 89) \
+ x(bkey_after_end_of_btree_node, 90) \
+ x(bkey_val_size_nonzero, 91) \
+ x(bkey_val_size_too_small, 92) \
+ x(alloc_v1_val_size_bad, 93) \
+ x(alloc_v2_unpack_error, 94) \
+ x(alloc_v3_unpack_error, 95) \
+ x(alloc_v4_val_size_bad, 96) \
+ x(alloc_v4_backpointers_start_bad, 97) \
+ x(alloc_key_data_type_bad, 98) \
+ x(alloc_key_empty_but_have_data, 99) \
+ x(alloc_key_dirty_sectors_0, 100) \
+ x(alloc_key_data_type_inconsistency, 101) \
+ x(alloc_key_to_missing_dev_bucket, 102) \
+ x(alloc_key_cached_inconsistency, 103) \
+ x(alloc_key_cached_but_read_time_zero, 104) \
+ x(alloc_key_to_missing_lru_entry, 105) \
+ x(alloc_key_data_type_wrong, 106) \
+ x(alloc_key_gen_wrong, 107) \
+ x(alloc_key_dirty_sectors_wrong, 108) \
+ x(alloc_key_cached_sectors_wrong, 109) \
+ x(alloc_key_stripe_wrong, 110) \
+ x(alloc_key_stripe_redundancy_wrong, 111) \
+ x(bucket_sector_count_overflow, 112) \
+ x(bucket_metadata_type_mismatch, 113) \
+ x(need_discard_key_wrong, 114) \
+ x(freespace_key_wrong, 115) \
+ x(freespace_hole_missing, 116) \
+ x(bucket_gens_val_size_bad, 117) \
+ x(bucket_gens_key_wrong, 118) \
+ x(bucket_gens_hole_wrong, 119) \
+ x(bucket_gens_to_invalid_dev, 120) \
+ x(bucket_gens_to_invalid_buckets, 121) \
+ x(bucket_gens_nonzero_for_invalid_buckets, 122) \
+ x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \
+ x(need_discard_freespace_key_bad, 124) \
+ x(backpointer_pos_wrong, 125) \
+ x(backpointer_to_missing_device, 126) \
+ x(backpointer_to_missing_alloc, 127) \
+ x(backpointer_to_missing_ptr, 128) \
+ x(lru_entry_at_time_0, 129) \
+ x(lru_entry_to_invalid_bucket, 130) \
+ x(lru_entry_bad, 131) \
+ x(btree_ptr_val_too_big, 132) \
+ x(btree_ptr_v2_val_too_big, 133) \
+ x(btree_ptr_has_non_ptr, 134) \
+ x(extent_ptrs_invalid_entry, 135) \
+ x(extent_ptrs_no_ptrs, 136) \
+ x(extent_ptrs_too_many_ptrs, 137) \
+ x(extent_ptrs_redundant_crc, 138) \
+ x(extent_ptrs_redundant_stripe, 139) \
+ x(extent_ptrs_unwritten, 140) \
+ x(extent_ptrs_written_and_unwritten, 141) \
+ x(ptr_to_invalid_device, 142) \
+ x(ptr_to_duplicate_device, 143) \
+ x(ptr_after_last_bucket, 144) \
+ x(ptr_before_first_bucket, 145) \
+ x(ptr_spans_multiple_buckets, 146) \
+ x(ptr_to_missing_backpointer, 147) \
+ x(ptr_to_missing_alloc_key, 148) \
+ x(ptr_to_missing_replicas_entry, 149) \
+ x(ptr_to_missing_stripe, 150) \
+ x(ptr_to_incorrect_stripe, 151) \
+ x(ptr_gen_newer_than_bucket_gen, 152) \
+ x(ptr_too_stale, 153) \
+ x(stale_dirty_ptr, 154) \
+ x(ptr_bucket_data_type_mismatch, 155) \
+ x(ptr_cached_and_erasure_coded, 156) \
+ x(ptr_crc_uncompressed_size_too_small, 157) \
+ x(ptr_crc_csum_type_unknown, 158) \
+ x(ptr_crc_compression_type_unknown, 159) \
+ x(ptr_crc_redundant, 160) \
+ x(ptr_crc_uncompressed_size_too_big, 161) \
+ x(ptr_crc_nonce_mismatch, 162) \
+ x(ptr_stripe_redundant, 163) \
+ x(reservation_key_nr_replicas_invalid, 164) \
+ x(reflink_v_refcount_wrong, 165) \
+ x(reflink_p_to_missing_reflink_v, 166) \
+ x(stripe_pos_bad, 167) \
+ x(stripe_val_size_bad, 168) \
+ x(stripe_sector_count_wrong, 169) \
+ x(snapshot_tree_pos_bad, 170) \
+ x(snapshot_tree_to_missing_snapshot, 171) \
+ x(snapshot_tree_to_missing_subvol, 172) \
+ x(snapshot_tree_to_wrong_subvol, 173) \
+ x(snapshot_tree_to_snapshot_subvol, 174) \
+ x(snapshot_pos_bad, 175) \
+ x(snapshot_parent_bad, 176) \
+ x(snapshot_children_not_normalized, 177) \
+ x(snapshot_child_duplicate, 178) \
+ x(snapshot_child_bad, 179) \
+ x(snapshot_skiplist_not_normalized, 180) \
+ x(snapshot_skiplist_bad, 181) \
+ x(snapshot_should_not_have_subvol, 182) \
+ x(snapshot_to_bad_snapshot_tree, 183) \
+ x(snapshot_bad_depth, 184) \
+ x(snapshot_bad_skiplist, 185) \
+ x(subvol_pos_bad, 186) \
+ x(subvol_not_master_and_not_snapshot, 187) \
+ x(subvol_to_missing_root, 188) \
+ x(subvol_root_wrong_bi_subvol, 189) \
+ x(bkey_in_missing_snapshot, 190) \
+ x(inode_pos_inode_nonzero, 191) \
+ x(inode_pos_blockdev_range, 192) \
+ x(inode_unpack_error, 193) \
+ x(inode_str_hash_invalid, 194) \
+ x(inode_v3_fields_start_bad, 195) \
+ x(inode_snapshot_mismatch, 196) \
+ x(inode_unlinked_but_clean, 197) \
+ x(inode_unlinked_but_nlink_nonzero, 198) \
+ x(inode_checksum_type_invalid, 199) \
+ x(inode_compression_type_invalid, 200) \
+ x(inode_subvol_root_but_not_dir, 201) \
+ x(inode_i_size_dirty_but_clean, 202) \
+ x(inode_i_sectors_dirty_but_clean, 203) \
+ x(inode_i_sectors_wrong, 204) \
+ x(inode_dir_wrong_nlink, 205) \
+ x(inode_dir_multiple_links, 206) \
+ x(inode_multiple_links_but_nlink_0, 207) \
+ x(inode_wrong_backpointer, 208) \
+ x(inode_wrong_nlink, 209) \
+ x(inode_unreachable, 210) \
+ x(deleted_inode_but_clean, 211) \
+ x(deleted_inode_missing, 212) \
+ x(deleted_inode_is_dir, 213) \
+ x(deleted_inode_not_unlinked, 214) \
+ x(extent_overlapping, 215) \
+ x(extent_in_missing_inode, 216) \
+ x(extent_in_non_reg_inode, 217) \
+ x(extent_past_end_of_inode, 218) \
+ x(dirent_empty_name, 219) \
+ x(dirent_val_too_big, 220) \
+ x(dirent_name_too_long, 221) \
+ x(dirent_name_embedded_nul, 222) \
+ x(dirent_name_dot_or_dotdot, 223) \
+ x(dirent_name_has_slash, 224) \
+ x(dirent_d_type_wrong, 225) \
+ x(dirent_d_parent_subvol_wrong, 226) \
+ x(dirent_in_missing_dir_inode, 227) \
+ x(dirent_in_non_dir_inode, 228) \
+ x(dirent_to_missing_inode, 229) \
+ x(dirent_to_missing_subvol, 230) \
+ x(dirent_to_itself, 231) \
+ x(quota_type_invalid, 232) \
+ x(xattr_val_size_too_small, 233) \
+ x(xattr_val_size_too_big, 234) \
+ x(xattr_invalid_type, 235) \
+ x(xattr_name_invalid_chars, 236) \
+ x(xattr_in_missing_inode, 237) \
+ x(root_subvol_missing, 238) \
+ x(root_dir_missing, 239) \
+ x(root_inode_not_dir, 240) \
+ x(dir_loop, 241) \
+ x(hash_table_key_duplicate, 242) \
+ x(hash_table_key_wrong_offset, 243)
+
+enum bch_sb_error_id {
+#define x(t, n) BCH_FSCK_ERR_##t = n,
+ BCH_SB_ERRS()
+#undef x
+ BCH_SB_ERR_MAX
+};
+
+struct bch_sb_error_entry_cpu {
+ u64 id:16,
+ nr:48;
+ u64 last_error_time;
+};
+
+typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu;
+
+#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */
+
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
new file mode 100644
index 000000000000..bed0f857fe5b
--- /dev/null
+++ b/fs/bcachefs/sb-members.c
@@ -0,0 +1,420 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "disk_groups.h"
+#include "opts.h"
+#include "replicas.h"
+#include "sb-members.h"
+#include "super-io.h"
+
+#define x(t, n, ...) [n] = #t,
+static const char * const bch2_iops_measurements[] = {
+ BCH_IOPS_MEASUREMENTS()
+ NULL
+};
+
+char * const bch2_member_error_strs[] = {
+ BCH_MEMBER_ERROR_TYPES()
+ NULL
+};
+#undef x
+
+/* Code for bch_sb_field_members_v1: */
+
+struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i)
+{
+ return __bch2_members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i);
+}
+
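+/*
+ * member_bytes may be smaller than the current struct bch_member if the
+ * superblock was written by an older version; copy what's present and leave
+ * the rest zeroed:
+ */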
+static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i)
+{
+ struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i);
+ memset(&ret, 0, sizeof(ret));
+ memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret)));
+ return ret;
+}
+
+static struct bch_member *members_v1_get_mut(struct bch_sb_field_members_v1 *mi, int i)
+{
+ return (void *) mi->_members + (i * BCH_MEMBER_V1_BYTES);
+}
+
+static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int i)
+{
+ struct bch_member ret, *p = members_v1_get_mut(mi, i);
+ memset(&ret, 0, sizeof(ret));
+ memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret)));
+ return ret;
+}
+
+struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i)
+{
+ struct bch_sb_field_members_v2 *mi2 = bch2_sb_field_get(sb, members_v2);
+ if (mi2)
+ return members_v2_get(mi2, i);
+ struct bch_sb_field_members_v1 *mi1 = bch2_sb_field_get(sb, members_v1);
+ return members_v1_get(mi1, i);
+}
+
+static int sb_members_v2_resize_entries(struct bch_fs *c)
+{
+ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+
+ if (le16_to_cpu(mi->member_bytes) < sizeof(struct bch_member)) {
+ unsigned u64s = DIV_ROUND_UP((sizeof(*mi) + sizeof(mi->_members[0]) *
+ c->disk_sb.sb->nr_devices), 8);
+
+ mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
+ if (!mi)
+ return -BCH_ERR_ENOSPC_sb_members_v2;
+
+ for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) {
+ void *dst = (void *) mi->_members + (i * sizeof(struct bch_member));
+ memmove(dst, __bch2_members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes));
+ memset(dst + le16_to_cpu(mi->member_bytes),
+ 0, (sizeof(struct bch_member) - le16_to_cpu(mi->member_bytes)));
+ }
+ mi->member_bytes = cpu_to_le16(sizeof(struct bch_member));
+ }
+ return 0;
+}
+
+int bch2_sb_members_v2_init(struct bch_fs *c)
+{
+ struct bch_sb_field_members_v1 *mi1;
+ struct bch_sb_field_members_v2 *mi2;
+
+ if (!bch2_sb_field_get(c->disk_sb.sb, members_v2)) {
+ mi2 = bch2_sb_field_resize(&c->disk_sb, members_v2,
+ DIV_ROUND_UP(sizeof(*mi2) +
+ sizeof(struct bch_member) * c->sb.nr_devices,
+ sizeof(u64)));
+ mi1 = bch2_sb_field_get(c->disk_sb.sb, members_v1);
+ memcpy(&mi2->_members[0], &mi1->_members[0],
+ BCH_MEMBER_V1_BYTES * c->sb.nr_devices);
+ memset(&mi2->pad[0], 0, sizeof(mi2->pad));
+ mi2->member_bytes = cpu_to_le16(BCH_MEMBER_V1_BYTES);
+ }
+
+ return sb_members_v2_resize_entries(c);
+}
+
+int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb)
+{
+ struct bch_sb_field_members_v1 *mi1;
+ struct bch_sb_field_members_v2 *mi2;
+
+ mi1 = bch2_sb_field_resize(disk_sb, members_v1,
+ DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES *
+ disk_sb->sb->nr_devices, sizeof(u64)));
+ if (!mi1)
+ return -BCH_ERR_ENOSPC_sb_members;
+
+ mi2 = bch2_sb_field_get(disk_sb->sb, members_v2);
+
+ for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++)
+ memcpy(members_v1_get_mut(mi1, i), __bch2_members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES);
+
+ return 0;
+}
+
+static int validate_member(struct printbuf *err,
+ struct bch_member m,
+ struct bch_sb *sb,
+ int i)
+{
+ if (le64_to_cpu(m.nbuckets) > LONG_MAX) {
+ prt_printf(err, "device %u: too many buckets (got %llu, max %lu)",
+ i, le64_to_cpu(m.nbuckets), LONG_MAX);
+ return -BCH_ERR_invalid_sb_members;
+ }
+
+ if (le64_to_cpu(m.nbuckets) -
+ le16_to_cpu(m.first_bucket) < BCH_MIN_NR_NBUCKETS) {
+ prt_printf(err, "device %u: not enough buckets (got %llu, max %u)",
+ i, le64_to_cpu(m.nbuckets), BCH_MIN_NR_NBUCKETS);
+ return -BCH_ERR_invalid_sb_members;
+ }
+
+ if (le16_to_cpu(m.bucket_size) <
+ le16_to_cpu(sb->block_size)) {
+ prt_printf(err, "device %u: bucket size %u smaller than block size %u",
+ i, le16_to_cpu(m.bucket_size), le16_to_cpu(sb->block_size));
+ return -BCH_ERR_invalid_sb_members;
+ }
+
+ if (le16_to_cpu(m.bucket_size) <
+ BCH_SB_BTREE_NODE_SIZE(sb)) {
+ prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu",
+ i, le16_to_cpu(m.bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
+ return -BCH_ERR_invalid_sb_members;
+ }
+
+ return 0;
+}
+
+static void member_to_text(struct printbuf *out,
+ struct bch_member m,
+ struct bch_sb_field_disk_groups *gi,
+ struct bch_sb *sb,
+ int i)
+{
+ unsigned data_have = bch2_sb_dev_has_data(sb, i);
+ u64 bucket_size = le16_to_cpu(m.bucket_size);
+ u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size;
+
+ if (!bch2_member_exists(&m))
+ return;
+
+ prt_printf(out, "Device:");
+ prt_tab(out);
+ prt_printf(out, "%u", i);
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "Label:");
+ prt_tab(out);
+ if (BCH_MEMBER_GROUP(&m)) {
+ unsigned idx = BCH_MEMBER_GROUP(&m) - 1;
+
+ if (idx < disk_groups_nr(gi))
+ prt_printf(out, "%s (%u)",
+ gi->entries[idx].label, idx);
+ else
+ prt_printf(out, "(bad disk labels section)");
+ } else {
+ prt_printf(out, "(none)");
+ }
+ prt_newline(out);
+
+ prt_printf(out, "UUID:");
+ prt_tab(out);
+ pr_uuid(out, m.uuid.b);
+ prt_newline(out);
+
+ prt_printf(out, "Size:");
+ prt_tab(out);
+ prt_units_u64(out, device_size << 9);
+ prt_newline(out);
+
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
+ prt_printf(out, "%s errors:", bch2_member_error_strs[i]);
+ prt_tab(out);
+ prt_u64(out, le64_to_cpu(m.errors[i]));
+ prt_newline(out);
+ }
+
+ for (unsigned i = 0; i < BCH_IOPS_NR; i++) {
+ prt_printf(out, "%s iops:", bch2_iops_measurements[i]);
+ prt_tab(out);
+ prt_printf(out, "%u", le32_to_cpu(m.iops[i]));
+ prt_newline(out);
+ }
+
+ prt_printf(out, "Bucket size:");
+ prt_tab(out);
+ prt_units_u64(out, bucket_size << 9);
+ prt_newline(out);
+
+ prt_printf(out, "First bucket:");
+ prt_tab(out);
+ prt_printf(out, "%u", le16_to_cpu(m.first_bucket));
+ prt_newline(out);
+
+ prt_printf(out, "Buckets:");
+ prt_tab(out);
+ prt_printf(out, "%llu", le64_to_cpu(m.nbuckets));
+ prt_newline(out);
+
+ prt_printf(out, "Last mount:");
+ prt_tab(out);
+ if (m.last_mount)
+ bch2_prt_datetime(out, le64_to_cpu(m.last_mount));
+ else
+ prt_printf(out, "(never)");
+ prt_newline(out);
+
+ prt_printf(out, "State:");
+ prt_tab(out);
+ prt_printf(out, "%s",
+ BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR
+ ? bch2_member_states[BCH_MEMBER_STATE(&m)]
+ : "unknown");
+ prt_newline(out);
+
+ prt_printf(out, "Data allowed:");
+ prt_tab(out);
+ if (BCH_MEMBER_DATA_ALLOWED(&m))
+ prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
+ else
+ prt_printf(out, "(none)");
+ prt_newline(out);
+
+ prt_printf(out, "Has data:");
+ prt_tab(out);
+ if (data_have)
+ prt_bitflags(out, bch2_data_types, data_have);
+ else
+ prt_printf(out, "(none)");
+ prt_newline(out);
+
+ prt_printf(out, "Discard:");
+ prt_tab(out);
+ prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m));
+ prt_newline(out);
+
+ prt_printf(out, "Freespace initialized:");
+ prt_tab(out);
+ prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(&m));
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+}
+
+static int bch2_sb_members_v1_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
+ unsigned i;
+
+ if ((void *) members_v1_get_mut(mi, sb->nr_devices) > vstruct_end(&mi->field)) {
+ prt_printf(err, "too many devices for section size");
+ return -BCH_ERR_invalid_sb_members;
+ }
+
+ for (i = 0; i < sb->nr_devices; i++) {
+ struct bch_member m = members_v1_get(mi, i);
+
+ int ret = validate_member(err, m, sb, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
+ struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
+ unsigned i;
+
+ for (i = 0; i < sb->nr_devices; i++)
+ member_to_text(out, members_v1_get(mi, i), gi, sb, i);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = {
+ .validate = bch2_sb_members_v1_validate,
+ .to_text = bch2_sb_members_v1_to_text,
+};
+
+static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
+ struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
+ unsigned i;
+
+ for (i = 0; i < sb->nr_devices; i++)
+ member_to_text(out, members_v2_get(mi, i), gi, sb, i);
+}
+
+static int bch2_sb_members_v2_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
+ size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) -
+ (void *) mi;
+
+ if (mi_bytes > vstruct_bytes(&mi->field)) {
+ prt_printf(err, "section too small (%zu > %zu)",
+ mi_bytes, vstruct_bytes(&mi->field));
+ return -BCH_ERR_invalid_sb_members;
+ }
+
+ for (unsigned i = 0; i < sb->nr_devices; i++) {
+ int ret = validate_member(err, members_v2_get(mi, i), sb, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = {
+ .validate = bch2_sb_members_v2_validate,
+ .to_text = bch2_sb_members_v2_to_text,
+};
+
+void bch2_sb_members_from_cpu(struct bch_fs *c)
+{
+ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ struct bch_dev *ca;
+ unsigned i, e;
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL) {
+ struct bch_member *m = __bch2_members_v2_get_mut(mi, i);
+
+ for (e = 0; e < BCH_MEMBER_ERROR_NR; e++)
+ m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e]));
+ }
+ rcu_read_unlock();
+}
+
+void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+ struct bch_member m;
+
+ mutex_lock(&ca->fs->sb_lock);
+ m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);
+ mutex_unlock(&ca->fs->sb_lock);
+
+ printbuf_tabstop_push(out, 12);
+
+ prt_str(out, "IO errors since filesystem creation");
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
+ prt_printf(out, "%s:", bch2_member_error_strs[i]);
+ prt_tab(out);
+ prt_u64(out, atomic64_read(&ca->errors[i]));
+ prt_newline(out);
+ }
+ printbuf_indent_sub(out, 2);
+
+ prt_str(out, "IO errors since ");
+ bch2_pr_time_units(out, (ktime_get_real_seconds() - le64_to_cpu(m.errors_reset_time)) * NSEC_PER_SEC);
+ prt_str(out, " ago");
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+ for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
+ prt_printf(out, "%s:", bch2_member_error_strs[i]);
+ prt_tab(out);
+ prt_u64(out, atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i]));
+ prt_newline(out);
+ }
+ printbuf_indent_sub(out, 2);
+}
+
+void bch2_dev_errors_reset(struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+ struct bch_member *m;
+
+ mutex_lock(&c->sb_lock);
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++)
+ m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i]));
+ m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds());
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+}
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
new file mode 100644
index 000000000000..03613e3eb8e3
--- /dev/null
+++ b/fs/bcachefs/sb-members.h
@@ -0,0 +1,227 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_MEMBERS_H
+#define _BCACHEFS_SB_MEMBERS_H
+
+extern char * const bch2_member_error_strs[];
+
+static inline struct bch_member *
+__bch2_members_v2_get_mut(struct bch_sb_field_members_v2 *mi, unsigned i)
+{
+ return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes));
+}
+
+int bch2_sb_members_v2_init(struct bch_fs *c);
+int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb);
+struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i);
+struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i);
+
+static inline bool bch2_dev_is_online(struct bch_dev *ca)
+{
+ return !percpu_ref_is_zero(&ca->io_ref);
+}
+
+static inline bool bch2_dev_is_readable(struct bch_dev *ca)
+{
+ return bch2_dev_is_online(ca) &&
+ ca->mi.state != BCH_MEMBER_STATE_failed;
+}
+
+static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
+{
+ if (!percpu_ref_tryget(&ca->io_ref))
+ return false;
+
+ if (ca->mi.state == BCH_MEMBER_STATE_rw ||
+ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
+ return true;
+
+ percpu_ref_put(&ca->io_ref);
+ return false;
+}
+
+static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
+{
+ return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
+}
+
+static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
+ unsigned dev)
+{
+ unsigned i;
+
+ for (i = 0; i < devs.nr; i++)
+ if (devs.devs[i] == dev)
+ return true;
+
+ return false;
+}
+
+static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
+ unsigned dev)
+{
+ unsigned i;
+
+ for (i = 0; i < devs->nr; i++)
+ if (devs->devs[i] == dev) {
+ array_remove_item(devs->devs, devs->nr, i);
+ return;
+ }
+}
+
+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
+ unsigned dev)
+{
+ if (!bch2_dev_list_has_dev(*devs, dev)) {
+ BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs));
+ devs->devs[devs->nr++] = dev;
+ }
+}
+
+static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
+{
+ return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
+}
+
+static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
+ const struct bch_devs_mask *mask)
+{
+ struct bch_dev *ca = NULL;
+
+ while ((*iter = mask
+ ? find_next_bit(mask->d, c->sb.nr_devices, *iter)
+ : *iter) < c->sb.nr_devices &&
+ !(ca = rcu_dereference_check(c->devs[*iter],
+ lockdep_is_held(&c->state_lock))))
+ (*iter)++;
+
+ return ca;
+}
+
+#define for_each_member_device_rcu(ca, c, iter, mask) \
+ for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++)
+
+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter)
+{
+ struct bch_dev *ca;
+
+ rcu_read_lock();
+ if ((ca = __bch2_next_dev(c, iter, NULL)))
+ percpu_ref_get(&ca->ref);
+ rcu_read_unlock();
+
+ return ca;
+}
+
+/*
+ * If you break early, you must drop your ref on the current device
+ */
+#define for_each_member_device(ca, c, iter) \
+ for ((iter) = 0; \
+ (ca = bch2_get_next_dev(c, &(iter))); \
+ percpu_ref_put(&ca->ref), (iter)++)
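+
+/*
+ * Illustrative sketch of the rule above (found() is a hypothetical predicate,
+ * not part of this API): when breaking out of the loop early, the iterator
+ * cannot drop the ref for us, so drop it explicitly:
+ *
+ * for_each_member_device(ca, c, i) {
+ *         if (found(ca)) {
+ *                 percpu_ref_put(&ca->ref);
+ *                 break;
+ *         }
+ * }
+ */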
+
+static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
+ unsigned *iter,
+ int state_mask)
+{
+ struct bch_dev *ca;
+
+ rcu_read_lock();
+ while ((ca = __bch2_next_dev(c, iter, NULL)) &&
+ (!((1 << ca->mi.state) & state_mask) ||
+ !percpu_ref_tryget(&ca->io_ref)))
+ (*iter)++;
+ rcu_read_unlock();
+
+ return ca;
+}
+
+#define __for_each_online_member(ca, c, iter, state_mask) \
+ for ((iter) = 0; \
+ (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \
+ percpu_ref_put(&ca->io_ref), (iter)++)
+
+#define for_each_online_member(ca, c, iter) \
+ __for_each_online_member(ca, c, iter, ~0)
+
+#define for_each_rw_member(ca, c, iter) \
+ __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw)
+
+#define for_each_readable_member(ca, c, iter) \
+ __for_each_online_member(ca, c, iter, \
+ (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro))
+
+/*
+ * If a key exists that references a device, the device won't be going away and
+ * we can omit rcu_read_lock():
+ */
+static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
+{
+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+ return rcu_dereference_check(c->devs[idx], 1);
+}
+
+static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
+{
+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+ return rcu_dereference_protected(c->devs[idx],
+ lockdep_is_held(&c->sb_lock) ||
+ lockdep_is_held(&c->state_lock));
+}
+
+/* XXX kill, move to struct bch_fs */
+static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
+{
+ struct bch_devs_mask devs;
+ struct bch_dev *ca;
+ unsigned i;
+
+ memset(&devs, 0, sizeof(devs));
+ for_each_online_member(ca, c, i)
+ __set_bit(ca->dev_idx, devs.d);
+ return devs;
+}
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1;
+extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2;
+
+static inline bool bch2_member_exists(struct bch_member *m)
+{
+ return !bch2_is_zero(&m->uuid, sizeof(m->uuid));
+}
+
+static inline bool bch2_dev_exists(struct bch_sb *sb, unsigned dev)
+{
+ if (dev < sb->nr_devices) {
+ struct bch_member m = bch2_sb_member_get(sb, dev);
+ return bch2_member_exists(&m);
+ }
+ return false;
+}
+
+static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
+{
+ return (struct bch_member_cpu) {
+ .nbuckets = le64_to_cpu(mi->nbuckets),
+ .first_bucket = le16_to_cpu(mi->first_bucket),
+ .bucket_size = le16_to_cpu(mi->bucket_size),
+ .group = BCH_MEMBER_GROUP(mi),
+ .state = BCH_MEMBER_STATE(mi),
+ .discard = BCH_MEMBER_DISCARD(mi),
+ .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi),
+ .durability = BCH_MEMBER_DURABILITY(mi)
+ ? BCH_MEMBER_DURABILITY(mi) - 1
+ : 1,
+ .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
+ .valid = bch2_member_exists(mi),
+ };
+}
+
+void bch2_sb_members_from_cpu(struct bch_fs *);
+
+void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *);
+void bch2_dev_errors_reset(struct bch_dev *);
+
+#endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h
new file mode 100644
index 000000000000..c1860d8163fb
--- /dev/null
+++ b/fs/bcachefs/seqmutex.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SEQMUTEX_H
+#define _BCACHEFS_SEQMUTEX_H
+
+#include <linux/mutex.h>
+
+struct seqmutex {
+ struct mutex lock;
+ u32 seq;
+};
+
+#define seqmutex_init(_lock) mutex_init(&(_lock)->lock)
+
+static inline bool seqmutex_trylock(struct seqmutex *lock)
+{
+ return mutex_trylock(&lock->lock);
+}
+
+static inline void seqmutex_lock(struct seqmutex *lock)
+{
+ mutex_lock(&lock->lock);
+}
+
+static inline void seqmutex_unlock(struct seqmutex *lock)
+{
+ lock->seq++;
+ mutex_unlock(&lock->lock);
+}
+
+static inline u32 seqmutex_seq(struct seqmutex *lock)
+{
+ return lock->seq;
+}
+
+static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq)
+{
+ if (lock->seq != seq || !mutex_trylock(&lock->lock))
+ return false;
+
+ if (lock->seq != seq) {
+ mutex_unlock(&lock->lock);
+ return false;
+ }
+
+ return true;
+}
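+
+/*
+ * Usage sketch (hypothetical; 'c' and the retry label are illustrative, not
+ * part of this header). Note that seqmutex_unlock() bumps the sequence, so
+ * the value passed to seqmutex_relock() must be read after unlocking:
+ *
+ * seqmutex_lock(&c->lock);
+ * ...walk protected state...
+ * seqmutex_unlock(&c->lock);
+ * u32 seq = seqmutex_seq(&c->lock);
+ *
+ * ...block, allocate, etc...
+ *
+ * if (!seqmutex_relock(&c->lock, seq))
+ *         goto retry;     /* mutex was taken by someone else; restart */
+ */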
+
+#endif /* _BCACHEFS_SEQMUTEX_H */
diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c
new file mode 100644
index 000000000000..dc1a27cc31cd
--- /dev/null
+++ b/fs/bcachefs/siphash.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */
+
+/*-
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d
+ * are the number of compression rounds and the number of finalization rounds.
+ * A compression round is identical to a finalization round and this round
+ * function is called SipRound. Given a 128-bit key k and a (possibly empty)
+ * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m).
+ *
+ * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18,
+ * by Jean-Philippe Aumasson and Daniel J. Bernstein,
+ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa
+ * https://131002.net/siphash/siphash.pdf
+ * https://131002.net/siphash/
+ */
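+
+/*
+ * Minimal usage sketch (the key bytes and buffer below are illustrative, not
+ * part of this file); SipHash24() from siphash.h wraps SipHash() with the
+ * standard 2 compression / 4 finalization rounds:
+ *
+ * SIPHASH_KEY key = {
+ *         .k0 = cpu_to_le64(0x0706050403020100ULL),
+ *         .k1 = cpu_to_le64(0x0f0e0d0c0b0a0908ULL),
+ * };
+ * u64 digest = SipHash24(&key, buf, len);
+ */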
+
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+#include <linux/bitops.h>
+#include <linux/string.h>
+
+#include "siphash.h"
+
+static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
+{
+ while (rounds--) {
+ ctx->v[0] += ctx->v[1];
+ ctx->v[2] += ctx->v[3];
+ ctx->v[1] = rol64(ctx->v[1], 13);
+ ctx->v[3] = rol64(ctx->v[3], 16);
+
+ ctx->v[1] ^= ctx->v[0];
+ ctx->v[3] ^= ctx->v[2];
+ ctx->v[0] = rol64(ctx->v[0], 32);
+
+ ctx->v[2] += ctx->v[1];
+ ctx->v[0] += ctx->v[3];
+ ctx->v[1] = rol64(ctx->v[1], 17);
+ ctx->v[3] = rol64(ctx->v[3], 21);
+
+ ctx->v[1] ^= ctx->v[2];
+ ctx->v[3] ^= ctx->v[0];
+ ctx->v[2] = rol64(ctx->v[2], 32);
+ }
+}
+
+static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds)
+{
+ u64 m = get_unaligned_le64(ptr);
+
+ ctx->v[3] ^= m;
+ SipHash_Rounds(ctx, rounds);
+ ctx->v[0] ^= m;
+}
+
+void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
+{
+ u64 k0, k1;
+
+ k0 = le64_to_cpu(key->k0);
+ k1 = le64_to_cpu(key->k1);
+
+ ctx->v[0] = 0x736f6d6570736575ULL ^ k0;
+ ctx->v[1] = 0x646f72616e646f6dULL ^ k1;
+ ctx->v[2] = 0x6c7967656e657261ULL ^ k0;
+ ctx->v[3] = 0x7465646279746573ULL ^ k1;
+
+ memset(ctx->buf, 0, sizeof(ctx->buf));
+ ctx->bytes = 0;
+}
+
+void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf,
+ const void *src, size_t len)
+{
+ const u8 *ptr = src;
+ size_t left, used;
+
+ if (len == 0)
+ return;
+
+ used = ctx->bytes % sizeof(ctx->buf);
+ ctx->bytes += len;
+
+ if (used > 0) {
+ left = sizeof(ctx->buf) - used;
+
+ if (len >= left) {
+ memcpy(&ctx->buf[used], ptr, left);
+ SipHash_CRounds(ctx, ctx->buf, rc);
+ len -= left;
+ ptr += left;
+ used = 0; /* buffer drained: any trailing partial bytes start at offset 0 */
+ } else {
+ memcpy(&ctx->buf[used], ptr, len);
+ return;
+ }
+ }
+
+ while (len >= sizeof(ctx->buf)) {
+ SipHash_CRounds(ctx, ptr, rc);
+ len -= sizeof(ctx->buf);
+ ptr += sizeof(ctx->buf);
+ }
+
+ if (len > 0)
+ memcpy(&ctx->buf[used], ptr, len);
+}
+
+void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
+{
+ u64 r;
+
+ r = SipHash_End(ctx, rc, rf);
+
+ *((__le64 *) dst) = cpu_to_le64(r);
+}
+
+u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
+{
+ u64 r;
+ size_t left, used;
+
+ used = ctx->bytes % sizeof(ctx->buf);
+ left = sizeof(ctx->buf) - used;
+ memset(&ctx->buf[used], 0, left - 1);
+ ctx->buf[7] = ctx->bytes;
+
+ SipHash_CRounds(ctx, ctx->buf, rc);
+ ctx->v[2] ^= 0xff;
+ SipHash_Rounds(ctx, rf);
+
+ r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
+ memset(ctx, 0, sizeof(*ctx));
+ return r;
+}
+
+u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
+{
+ SIPHASH_CTX ctx;
+
+ SipHash_Init(&ctx, key);
+ SipHash_Update(&ctx, rc, rf, src, len);
+ return SipHash_End(&ctx, rc, rf);
+}
diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h
new file mode 100644
index 000000000000..3dfaf34a43b2
--- /dev/null
+++ b/fs/bcachefs/siphash.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */
+/*-
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions)
+ * optimized for speed on short messages, returning a 64-bit hash/digest value.
+ *
+ * The number of rounds is defined during the initialization:
+ * SipHash24_Init() for the fast and reasonably strong version
+ * SipHash48_Init() for the strong version (half as fast)
+ *
+ * struct SIPHASH_CTX ctx;
+ * SipHash24_Init(&ctx, &key);
+ * SipHash24_Update(&ctx, pointer_to_string, length_of_string);
+ * SipHash24_Final(output, &ctx);
+ */
+
+#ifndef _SIPHASH_H_
+#define _SIPHASH_H_
+
+#include <linux/types.h>
+
+#define SIPHASH_BLOCK_LENGTH 8
+#define SIPHASH_KEY_LENGTH 16
+#define SIPHASH_DIGEST_LENGTH 8
+
+typedef struct _SIPHASH_CTX {
+ u64 v[4];
+ u8 buf[SIPHASH_BLOCK_LENGTH];
+ u32 bytes;
+} SIPHASH_CTX;
+
+typedef struct {
+ __le64 k0;
+ __le64 k1;
+} SIPHASH_KEY;
+
+void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *);
+void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t);
+u64 SipHash_End(SIPHASH_CTX *, int, int);
+void SipHash_Final(void *, SIPHASH_CTX *, int, int);
+u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t);
+
+#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k))
+#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l))
+#define SipHash24_End(_d) SipHash_End((_d), 2, 4)
+#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4)
+#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l))
+
+#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k))
+#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l))
+#define SipHash48_End(_d) SipHash_End((_d), 4, 8)
+#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8)
+#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l))
+
+#endif /* _SIPHASH_H_ */
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
new file mode 100644
index 000000000000..97790445e67a
--- /dev/null
+++ b/fs/bcachefs/six.c
@@ -0,0 +1,920 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/export.h>
+#include <linux/log2.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+
+#include <trace/events/lock.h>
+
+#include "six.h"
+
+#ifdef DEBUG
+#define EBUG_ON(cond) BUG_ON(cond)
+#else
+#define EBUG_ON(cond) do {} while (0)
+#endif
+
+#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip)
+#define six_release(l, ip) lock_release(l, ip)
+
+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
+
+#define SIX_LOCK_HELD_read_OFFSET 0
+#define SIX_LOCK_HELD_read ~(~0U << 26)
+#define SIX_LOCK_HELD_intent (1U << 26)
+#define SIX_LOCK_HELD_write (1U << 27)
+#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read))
+#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write))
+#define SIX_LOCK_NOSPIN (1U << 31)
+
+struct six_lock_vals {
+ /* Value we add to the lock in order to take the lock: */
+ u32 lock_val;
+
+ /* If the lock has this value (used as a mask), taking the lock fails: */
+ u32 lock_fail;
+
+ /* Mask that indicates lock is held for this type: */
+ u32 held_mask;
+
+ /* Waitlist we wakeup when releasing the lock: */
+ enum six_lock_type unlock_wakeup;
+};
+
+static const struct six_lock_vals l[] = {
+ [SIX_LOCK_read] = {
+ .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET,
+ .lock_fail = SIX_LOCK_HELD_write,
+ .held_mask = SIX_LOCK_HELD_read,
+ .unlock_wakeup = SIX_LOCK_write,
+ },
+ [SIX_LOCK_intent] = {
+ .lock_val = SIX_LOCK_HELD_intent,
+ .lock_fail = SIX_LOCK_HELD_intent,
+ .held_mask = SIX_LOCK_HELD_intent,
+ .unlock_wakeup = SIX_LOCK_intent,
+ },
+ [SIX_LOCK_write] = {
+ .lock_val = SIX_LOCK_HELD_write,
+ .lock_fail = SIX_LOCK_HELD_read,
+ .held_mask = SIX_LOCK_HELD_write,
+ .unlock_wakeup = SIX_LOCK_read,
+ },
+};
+
+static inline void six_set_bitmask(struct six_lock *lock, u32 mask)
+{
+ if ((atomic_read(&lock->state) & mask) != mask)
+ atomic_or(mask, &lock->state);
+}
+
+static inline void six_clear_bitmask(struct six_lock *lock, u32 mask)
+{
+ if (atomic_read(&lock->state) & mask)
+ atomic_and(~mask, &lock->state);
+}
+
+static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
+ u32 old, struct task_struct *owner)
+{
+ if (type != SIX_LOCK_intent)
+ return;
+
+ if (!(old & SIX_LOCK_HELD_intent)) {
+ EBUG_ON(lock->owner);
+ lock->owner = owner;
+ } else {
+ EBUG_ON(lock->owner != current);
+ }
+}
+
+static inline unsigned pcpu_read_count(struct six_lock *lock)
+{
+ unsigned read_count = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ read_count += *per_cpu_ptr(lock->readers, cpu);
+ return read_count;
+}
+
+/*
+ * __do_six_trylock() - main trylock routine
+ *
+ * Returns 1 on success, 0 on failure
+ *
+ * In percpu reader mode, a failed trylock may cause a spurious trylock failure
+ * for another thread taking the competing lock type, and we may have to do a
+ * wakeup: when a wakeup is required, we return -1 - wakeup_type.
+ */
+static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
+ struct task_struct *task, bool try)
+{
+ int ret;
+ u32 old;
+
+ EBUG_ON(type == SIX_LOCK_write && lock->owner != task);
+ EBUG_ON(type == SIX_LOCK_write &&
+ (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write)));
+
+ /*
+ * Percpu reader mode:
+ *
+ * The basic idea behind this algorithm is that you can implement a lock
+ * between two threads without any atomics, just memory barriers:
+ *
+ * For two threads you'll need two variables, one variable for "thread a
+ * has the lock" and another for "thread b has the lock".
+ *
+ * To take the lock, a thread sets its variable indicating that it holds
+ * the lock, then issues a full memory barrier, then reads from the
+ * other thread's variable to check if the other thread thinks it has
+ * the lock. If we raced, we backoff and retry/sleep.
+ *
+ * Failure to take the lock may cause a spurious trylock failure in
+ * another thread, because we temporarily set the lock to indicate that
+ * we held it. This would be a problem for a thread in six_lock(), when
+ * it calls trylock after adding itself to the waitlist and
+ * prior to sleeping.
+ *
+ * Therefore, if we fail to get the lock, and there were waiters of the
+ * type we conflict with, we will have to issue a wakeup.
+ *
+ * Since we may be called under wait_lock (and by the wakeup code
+ * itself), we return that the wakeup has to be done instead of doing it
+ * here.
+ */
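+ /*
+ * A compressed sketch of the protocol described above (pseudo-code, not
+ * part of this function):
+ *
+ * thread A: a_held = true; smp_mb(); if (b_held) { a_held = false; retry; }
+ * thread B: b_held = true; smp_mb(); if (a_held) { b_held = false; retry; }
+ */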
+ if (type == SIX_LOCK_read && lock->readers) {
+ preempt_disable();
+ this_cpu_inc(*lock->readers); /* signal that we own lock */
+
+ smp_mb();
+
+ old = atomic_read(&lock->state);
+ ret = !(old & l[type].lock_fail);
+
+ this_cpu_sub(*lock->readers, !ret);
+ preempt_enable();
+
+ if (!ret) {
+ smp_mb();
+ if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write)
+ ret = -1 - SIX_LOCK_write;
+ }
+ } else if (type == SIX_LOCK_write && lock->readers) {
+ if (try) {
+ atomic_add(SIX_LOCK_HELD_write, &lock->state);
+ smp_mb__after_atomic();
+ }
+
+ ret = !pcpu_read_count(lock);
+
+ if (try && !ret) {
+ old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state);
+ if (old & SIX_LOCK_WAITING_read)
+ ret = -1 - SIX_LOCK_read;
+ }
+ } else {
+ old = atomic_read(&lock->state);
+ do {
+ ret = !(old & l[type].lock_fail);
+ if (!ret || (type == SIX_LOCK_write && !try)) {
+ smp_mb();
+ break;
+ }
+ } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val));
+
+ EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask));
+ }
+
+ if (ret > 0)
+ six_set_owner(lock, type, old, task);
+
+ EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 &&
+ (atomic_read(&lock->state) & SIX_LOCK_HELD_write));
+
+ return ret;
+}
+
+static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type)
+{
+ struct six_lock_waiter *w, *next;
+ struct task_struct *task;
+ bool saw_one;
+ int ret;
+again:
+ ret = 0;
+ saw_one = false;
+ raw_spin_lock(&lock->wait_lock);
+
+ list_for_each_entry_safe(w, next, &lock->wait_list, list) {
+ if (w->lock_want != lock_type)
+ continue;
+
+ if (saw_one && lock_type != SIX_LOCK_read)
+ goto unlock;
+ saw_one = true;
+
+ ret = __do_six_trylock(lock, lock_type, w->task, false);
+ if (ret <= 0)
+ goto unlock;
+
+ /*
+ * Similar to percpu_rwsem_wake_function(), we need to guard
+ * against the wakee noticing w->lock_acquired, returning, and
+ * then exiting before we do the wakeup:
+ */
+ task = get_task_struct(w->task);
+ __list_del(w->list.prev, w->list.next);
+ /*
+ * The release barrier here ensures the ordering of the
+ * __list_del before setting w->lock_acquired; @w is on the
+ * stack of the thread doing the waiting and will be reused
+ * after it sees w->lock_acquired with no other locking:
+ * pairs with smp_load_acquire() in six_lock_slowpath()
+ */
+ smp_store_release(&w->lock_acquired, true);
+ wake_up_process(task);
+ put_task_struct(task);
+ }
+
+ six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type);
+unlock:
+ raw_spin_unlock(&lock->wait_lock);
+
+ if (ret < 0) {
+ lock_type = -ret - 1;
+ goto again;
+ }
+}
+
+__always_inline
+static void six_lock_wakeup(struct six_lock *lock, u32 state,
+ enum six_lock_type lock_type)
+{
+ if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read))
+ return;
+
+ if (!(state & (SIX_LOCK_WAITING_read << lock_type)))
+ return;
+
+ __six_lock_wakeup(lock, lock_type);
+}
+
+__always_inline
+static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try)
+{
+ int ret;
+
+ ret = __do_six_trylock(lock, type, current, try);
+ if (ret < 0)
+ __six_lock_wakeup(lock, -ret - 1);
+
+ return ret > 0;
+}
+
+/**
+ * six_trylock_ip - attempt to take a six lock without blocking
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: true on success, false on failure.
+ */
+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
+{
+ if (!do_six_trylock(lock, type, true))
+ return false;
+
+ if (type != SIX_LOCK_write)
+ six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
+ return true;
+}
+EXPORT_SYMBOL_GPL(six_trylock_ip);
+
+/**
+ * six_relock_ip - attempt to re-take a lock that was held previously
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @seq: lock sequence number obtained from six_lock_seq() while lock was
+ * held previously
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: true on success, false on failure.
+ */
+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
+ unsigned seq, unsigned long ip)
+{
+ if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip))
+ return false;
+
+ if (six_lock_seq(lock) != seq) {
+ six_unlock_ip(lock, type, ip);
+ return false;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(six_relock_ip);
+
+#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
+
+static inline bool six_can_spin_on_owner(struct six_lock *lock)
+{
+ struct task_struct *owner;
+ bool ret;
+
+ if (need_resched())
+ return false;
+
+ rcu_read_lock();
+ owner = READ_ONCE(lock->owner);
+ ret = !owner || owner_on_cpu(owner);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline bool six_spin_on_owner(struct six_lock *lock,
+ struct task_struct *owner,
+ u64 end_time)
+{
+ bool ret = true;
+ unsigned loop = 0;
+
+ rcu_read_lock();
+ while (lock->owner == owner) {
+ /*
+ * Ensure we emit the owner->on_cpu dereference _after_
+ * checking lock->owner still matches owner. If that fails,
+ * owner might point to freed memory. If it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+
+ if (!owner_on_cpu(owner) || need_resched()) {
+ ret = false;
+ break;
+ }
+
+ if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
+ six_set_bitmask(lock, SIX_LOCK_NOSPIN);
+ ret = false;
+ break;
+ }
+
+ cpu_relax();
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+{
+ struct task_struct *task = current;
+ u64 end_time;
+
+ if (type == SIX_LOCK_write)
+ return false;
+
+ preempt_disable();
+ if (!six_can_spin_on_owner(lock))
+ goto fail;
+
+ if (!osq_lock(&lock->osq))
+ goto fail;
+
+ end_time = sched_clock() + 10 * NSEC_PER_USEC;
+
+ while (1) {
+ struct task_struct *owner;
+
+ /*
+ * If there's an owner, wait for it to either
+ * release the lock or go to sleep.
+ */
+ owner = READ_ONCE(lock->owner);
+ if (owner && !six_spin_on_owner(lock, owner, end_time))
+ break;
+
+ if (do_six_trylock(lock, type, false)) {
+ osq_unlock(&lock->osq);
+ preempt_enable();
+ return true;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+ * owner acquiring the lock and setting the owner field. If
+ * we're an RT task, that will live-lock because we won't let
+ * the owner complete.
+ */
+ if (!owner && (need_resched() || rt_task(task)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ cpu_relax();
+ }
+
+ osq_unlock(&lock->osq);
+fail:
+ preempt_enable();
+
+ /*
+ * If we fell out of the spin path because of need_resched(),
+ * reschedule now, before we try-lock again. This avoids getting
+ * scheduled out right after we obtained the lock.
+ */
+ if (need_resched())
+ schedule();
+
+ return false;
+}
+
+#else /* CONFIG_SIX_LOCK_SPIN_ON_OWNER */
+
+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+{
+ return false;
+}
+
+#endif
+
+noinline
+static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
+{
+ int ret = 0;
+
+ if (type == SIX_LOCK_write) {
+ EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
+ atomic_add(SIX_LOCK_HELD_write, &lock->state);
+ smp_mb__after_atomic();
+ }
+
+ trace_contention_begin(lock, 0);
+ lock_contended(&lock->dep_map, ip);
+
+ if (six_optimistic_spin(lock, type))
+ goto out;
+
+ wait->task = current;
+ wait->lock_want = type;
+ wait->lock_acquired = false;
+
+ raw_spin_lock(&lock->wait_lock);
+ six_set_bitmask(lock, SIX_LOCK_WAITING_read << type);
+ /*
+ * Retry taking the lock after taking waitlist lock, in case we raced
+ * with an unlock:
+ */
+ ret = __do_six_trylock(lock, type, current, false);
+ if (ret <= 0) {
+ wait->start_time = local_clock();
+
+ if (!list_empty(&lock->wait_list)) {
+ struct six_lock_waiter *last =
+ list_last_entry(&lock->wait_list,
+ struct six_lock_waiter, list);
+
+ if (time_before_eq64(wait->start_time, last->start_time))
+ wait->start_time = last->start_time + 1;
+ }
+
+ list_add_tail(&wait->list, &lock->wait_list);
+ }
+ raw_spin_unlock(&lock->wait_lock);
+
+ if (unlikely(ret > 0)) {
+ ret = 0;
+ goto out;
+ }
+
+ if (unlikely(ret < 0)) {
+ __six_lock_wakeup(lock, -ret - 1);
+ ret = 0;
+ }
+
+ while (1) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+
+ /*
+ * Ensures that writes to the waitlist entry happen after we see
+ * wait->lock_acquired: pairs with the smp_store_release in
+ * __six_lock_wakeup
+ */
+ if (smp_load_acquire(&wait->lock_acquired))
+ break;
+
+ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
+ if (unlikely(ret)) {
+ bool acquired;
+
+ /*
+ * If should_sleep_fn() returns an error, we are
+ * required to return that error even if we already
+ * acquired the lock - should_sleep_fn() might have
+ * modified external state (e.g. when the deadlock cycle
+ * detector in bcachefs issued a transaction restart)
+ */
+ raw_spin_lock(&lock->wait_lock);
+ acquired = wait->lock_acquired;
+ if (!acquired)
+ list_del(&wait->list);
+ raw_spin_unlock(&lock->wait_lock);
+
+ if (unlikely(acquired))
+ do_six_unlock_type(lock, type);
+ break;
+ }
+
+ schedule();
+ }
+
+ __set_current_state(TASK_RUNNING);
+out:
+ if (ret && type == SIX_LOCK_write) {
+ six_clear_bitmask(lock, SIX_LOCK_HELD_write);
+ six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read);
+ }
+ trace_contention_end(lock, 0);
+
+ return ret;
+}
+
+/**
+ * six_lock_ip_waiter - take a lock, with full waitlist interface
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @wait: pointer to wait object, which will be added to lock's waitlist
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ * to scheduling
+ * @p: passed through to @should_sleep_fn
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * This is the most general six_lock() variant, with parameters to support full
+ * cycle detection for deadlock avoidance.
+ *
+ * The code calling this function must implement tracking of held locks, and the
+ * @wait object should be embedded into the struct that tracks held locks -
+ * which must also be accessible in a thread-safe way.
+ *
+ * @should_sleep_fn should invoke the cycle detector; it should walk each
+ * lock's waiters, and for each waiter recursively walk their held locks.
+ *
+ * When this function must block, @wait will be added to @lock's waitlist before
+ * calling trylock, and before calling @should_sleep_fn, and @wait will not be
+ * removed from the lock waitlist until the lock has been successfully acquired,
+ * or we abort.
+ *
+ * @wait.start_time will be monotonically increasing for any given waitlist, and
+ * thus may be used as a loop cursor.
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
+{
+ int ret;
+
+ wait->start_time = 0;
+
+ if (type != SIX_LOCK_write)
+ six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip);
+
+ ret = do_six_trylock(lock, type, true) ? 0
+ : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip);
+
+ if (ret && type != SIX_LOCK_write)
+ six_release(&lock->dep_map, ip);
+ if (!ret)
+ lock_acquired(&lock->dep_map, ip);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(six_lock_ip_waiter);
+
+__always_inline
+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ u32 state;
+
+ if (type == SIX_LOCK_intent)
+ lock->owner = NULL;
+
+ if (type == SIX_LOCK_read &&
+ lock->readers) {
+ smp_mb(); /* unlock barrier */
+ this_cpu_dec(*lock->readers);
+ smp_mb(); /* between unlocking and checking for waiters */
+ state = atomic_read(&lock->state);
+ } else {
+ u32 v = l[type].lock_val;
+
+ if (type != SIX_LOCK_read)
+ v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN;
+
+ EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask));
+ state = atomic_sub_return_release(v, &lock->state);
+ }
+
+ six_lock_wakeup(lock, state, l[type].unlock_wakeup);
+}
+
+/**
+ * six_unlock_ip - drop a six lock
+ * @lock: lock to unlock
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * When a lock is held multiple times (because six_lock_increment() was used),
+ * this decrements the 'lock held' counter by one.
+ *
+ * For example:
+ * six_lock_read(&foo->lock); read count 1
+ * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2
+ * six_unlock_type(&foo->lock, SIX_LOCK_read); read count 1
+ * six_unlock_type(&foo->lock, SIX_LOCK_read); read count 0
+ */
+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
+{
+ EBUG_ON(type == SIX_LOCK_write &&
+ !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
+ EBUG_ON((type == SIX_LOCK_write ||
+ type == SIX_LOCK_intent) &&
+ lock->owner != current);
+
+ if (type != SIX_LOCK_write)
+ six_release(&lock->dep_map, ip);
+ else
+ lock->seq++;
+
+ if (type == SIX_LOCK_intent &&
+ lock->intent_lock_recurse) {
+ --lock->intent_lock_recurse;
+ return;
+ }
+
+ do_six_unlock_type(lock, type);
+}
+EXPORT_SYMBOL_GPL(six_unlock_ip);
+
+/**
+ * six_lock_downgrade - convert an intent lock to a read lock
+ * @lock: lock to downgrade
+ *
+ * @lock will have read count incremented and intent count decremented
+ */
+void six_lock_downgrade(struct six_lock *lock)
+{
+ six_lock_increment(lock, SIX_LOCK_read);
+ six_unlock_intent(lock);
+}
+EXPORT_SYMBOL_GPL(six_lock_downgrade);
+
+/**
+ * six_lock_tryupgrade - attempt to convert read lock to an intent lock
+ * @lock: lock to upgrade
+ *
+ * On success, @lock will have intent count incremented and read count
+ * decremented
+ *
+ * Return: true on success, false on failure
+ */
+bool six_lock_tryupgrade(struct six_lock *lock)
+{
+ u32 old = atomic_read(&lock->state), new;
+
+ do {
+ new = old;
+
+ if (new & SIX_LOCK_HELD_intent)
+ return false;
+
+ if (!lock->readers) {
+ EBUG_ON(!(new & SIX_LOCK_HELD_read));
+ new -= l[SIX_LOCK_read].lock_val;
+ }
+
+ new |= SIX_LOCK_HELD_intent;
+ } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new));
+
+ if (lock->readers)
+ this_cpu_dec(*lock->readers);
+
+ six_set_owner(lock, SIX_LOCK_intent, old, current);
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(six_lock_tryupgrade);
+
+/**
+ * six_trylock_convert - attempt to convert a held lock from one type to another
+ * @lock: lock to upgrade
+ * @from: SIX_LOCK_read or SIX_LOCK_intent
+ * @to: SIX_LOCK_read or SIX_LOCK_intent
+ *
+ * On success, @lock will have intent count incremented and read count
+ * decremented
+ *
+ * Return: true on success, false on failure
+ */
+bool six_trylock_convert(struct six_lock *lock,
+ enum six_lock_type from,
+ enum six_lock_type to)
+{
+ EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write);
+
+ if (to == from)
+ return true;
+
+ if (to == SIX_LOCK_read) {
+ six_lock_downgrade(lock);
+ return true;
+ } else {
+ return six_lock_tryupgrade(lock);
+ }
+}
+EXPORT_SYMBOL_GPL(six_trylock_convert);
+
+/**
+ * six_lock_increment - increase held lock count on a lock that is already held
+ * @lock: lock to increment
+ * @type: SIX_LOCK_read or SIX_LOCK_intent
+ *
+ * @lock must already be held, with a lock type that is greater than or equal to
+ * @type
+ *
+ * A corresponding six_unlock_type() call will be required for @lock to be fully
+ * unlocked.
+ */
+void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
+{
+ six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_);
+
+ /* XXX: assert already locked, and that we don't overflow: */
+
+ switch (type) {
+ case SIX_LOCK_read:
+ if (lock->readers) {
+ this_cpu_inc(*lock->readers);
+ } else {
+ EBUG_ON(!(atomic_read(&lock->state) &
+ (SIX_LOCK_HELD_read|
+ SIX_LOCK_HELD_intent)));
+ atomic_add(l[type].lock_val, &lock->state);
+ }
+ break;
+ case SIX_LOCK_intent:
+ EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
+ lock->intent_lock_recurse++;
+ break;
+ case SIX_LOCK_write:
+ BUG();
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(six_lock_increment);
+
+/**
+ * six_lock_wakeup_all - wake up all waiters on @lock
+ * @lock: lock to wake up waiters for
+ *
+ * Waking up waiters will cause them to re-run should_sleep_fn, which may then
+ * abort the lock operation.
+ *
+ * This function is never needed in a bug-free program; it's only useful in
+ * debug code, e.g. to determine if a cycle detector is at fault.
+ */
+void six_lock_wakeup_all(struct six_lock *lock)
+{
+ u32 state = atomic_read(&lock->state);
+ struct six_lock_waiter *w;
+
+ six_lock_wakeup(lock, state, SIX_LOCK_read);
+ six_lock_wakeup(lock, state, SIX_LOCK_intent);
+ six_lock_wakeup(lock, state, SIX_LOCK_write);
+
+ raw_spin_lock(&lock->wait_lock);
+ list_for_each_entry(w, &lock->wait_list, list)
+ wake_up_process(w->task);
+ raw_spin_unlock(&lock->wait_lock);
+}
+EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
+
+/**
+ * six_lock_counts - return held lock counts, for each lock type
+ * @lock: lock to return counters for
+ *
+ * Return: the number of times a lock is held for read, intent and write.
+ */
+struct six_lock_count six_lock_counts(struct six_lock *lock)
+{
+ struct six_lock_count ret;
+
+ ret.n[SIX_LOCK_read] = !lock->readers
+ ? atomic_read(&lock->state) & SIX_LOCK_HELD_read
+ : pcpu_read_count(lock);
+ ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) +
+ lock->intent_lock_recurse;
+ ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(six_lock_counts);
+
+/**
+ * six_lock_readers_add - directly manipulate reader count of a lock
+ * @lock: lock to add/subtract readers for
+ * @nr: reader count to add/subtract
+ *
+ * When an upper layer is implementing lock reentrancy, we may have both read
+ * and intent locks on the same lock.
+ *
+ * When we need to take a write lock, the read locks will cause self-deadlock,
+ * because six locks themselves do not track which read locks are held by the
+ * current thread and which are held by a different thread - they do no
+ * per-thread tracking of held locks.
+ *
+ * The upper layer that is tracking held locks may, however, if trylock() has
+ * failed, count up its own read locks, subtract them, take the write lock, and
+ * then re-add them.
+ *
+ * As in any other situation when taking a write lock, @lock must be held for
+ * intent one (or more) times, so @lock will never be left unlocked.
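+ *
+ * Hypothetical sketch of that pattern (count_my_read_locks() and obj are
+ * illustrative, not part of this API):
+ *
+ * unsigned nr = count_my_read_locks(obj);
+ * six_lock_readers_add(&obj->lock, -nr);
+ * six_lock_write(&obj->lock, NULL, NULL);
+ * ...update...
+ * six_unlock_write(&obj->lock);
+ * six_lock_readers_add(&obj->lock, nr);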
+ */
+void six_lock_readers_add(struct six_lock *lock, int nr)
+{
+ if (lock->readers) {
+ this_cpu_add(*lock->readers, nr);
+ } else {
+ EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0);
+ /* reader count starts at bit 0 */
+ atomic_add(nr, &lock->state);
+ }
+}
+EXPORT_SYMBOL_GPL(six_lock_readers_add);
+
+/**
+ * six_lock_exit - release resources held by a lock prior to freeing
+ * @lock: lock to exit
+ *
+ * When a lock was initialized in percpu mode (SIX_LOCK_INIT_PCPU), this is
+ * required to free the percpu read counts.
+ */
+void six_lock_exit(struct six_lock *lock)
+{
+ WARN_ON(lock->readers && pcpu_read_count(lock));
+ WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read);
+
+ free_percpu(lock->readers);
+ lock->readers = NULL;
+}
+EXPORT_SYMBOL_GPL(six_lock_exit);
+
+void __six_lock_init(struct six_lock *lock, const char *name,
+ struct lock_class_key *key, enum six_lock_init_flags flags)
+{
+ atomic_set(&lock->state, 0);
+ raw_spin_lock_init(&lock->wait_lock);
+ INIT_LIST_HEAD(&lock->wait_list);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *) lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+
+ /*
+ * Don't assume that we have real percpu variables available in
+ * userspace:
+ */
+#ifdef __KERNEL__
+ if (flags & SIX_LOCK_INIT_PCPU) {
+ /*
+ * We don't return an error here on memory allocation failure
+ * since percpu is an optimization, and locks will work with the
+ * same semantics in non-percpu mode: callers can check for
+ * failure if they wish by checking lock->readers, but generally
+ * will not want to treat it as an error.
+ */
+ lock->readers = alloc_percpu(unsigned);
+ }
+#endif
+}
+EXPORT_SYMBOL_GPL(__six_lock_init);
diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h
new file mode 100644
index 000000000000..4c268b0b8316
--- /dev/null
+++ b/fs/bcachefs/six.h
@@ -0,0 +1,393 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_SIX_H
+#define _LINUX_SIX_H
+
+/**
+ * DOC: SIX locks overview
+ *
+ * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores
+ * but with an additional state: read/shared, intent, exclusive/write
+ *
+ * The purpose of the intent state is to allow for greater concurrency on tree
+ * structures without deadlocking. In general, a read can't be upgraded to a
+ * write lock without deadlocking, so an operation that updates multiple nodes
+ * will have to take write locks for the full duration of the operation.
+ *
+ * But by adding an intent state, which is exclusive with other intent locks but
+ * not with readers, we can take intent locks at the start of the operation,
+ * and then take write locks only for the actual update to each individual
+ * node, without deadlocking.
+ *
+ * Example usage:
+ * six_lock_read(&foo->lock);
+ * six_unlock_read(&foo->lock);
+ *
+ * An intent lock must be held before taking a write lock:
+ * six_lock_intent(&foo->lock);
+ * six_lock_write(&foo->lock);
+ * six_unlock_write(&foo->lock);
+ * six_unlock_intent(&foo->lock);
+ *
+ * Other operations:
+ * six_trylock_read()
+ * six_trylock_intent()
+ * six_trylock_write()
+ *
+ * six_lock_downgrade() convert from intent to read
+ * six_lock_tryupgrade() attempt to convert from read to intent, may fail
+ *
+ * There are also interfaces that take the lock type as an enum:
+ *
+ * six_lock_type(&foo->lock, SIX_LOCK_read);
+ * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent)
+ * six_lock_type(&foo->lock, SIX_LOCK_write);
+ * six_unlock_type(&foo->lock, SIX_LOCK_write);
+ * six_unlock_type(&foo->lock, SIX_LOCK_intent);
+ *
+ * Lock sequence numbers - unlock(), relock():
+ *
+ * Locks embed sequence numbers, which are incremented on write lock/unlock.
+ * This allows locks to be dropped and then retaken iff the state they protect
+ * hasn't changed; this makes it much easier to avoid holding locks while e.g.
+ * doing IO or allocating memory.
+ *
+ * Example usage:
+ * six_lock_read(&foo->lock);
+ * u32 seq = six_lock_seq(&foo->lock);
+ * six_unlock_read(&foo->lock);
+ *
+ * some_operation_that_may_block();
+ *
+ * if (six_relock_read(&foo->lock, seq)) { ... }
+ *
+ * If the relock operation succeeds, it is as if the lock was never unlocked.
+ *
+ * Reentrancy:
+ *
+ * Six locks are not by themselves reentrant, but have counters for both the
+ * read and intent states that can be used to provide reentrancy by an upper
+ * layer that tracks held locks. If a lock is known to already be held in the
+ * read or intent state, six_lock_increment() can be used to bump the "lock
+ * held in this state" counter, increasing the number of unlock calls that
+ * will be required to fully unlock it.
+ *
+ * Example usage:
+ * six_lock_read(&foo->lock);
+ * six_lock_increment(&foo->lock, SIX_LOCK_read);
+ * six_unlock_read(&foo->lock);
+ * six_unlock_read(&foo->lock);
+ * foo->lock is now fully unlocked.
+ *
+ * Since the intent state supersedes read, it's legal to increment the read
+ * counter when holding an intent lock, but not the reverse.
+ *
+ * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write)
+ * is not legal.
+ *
+ * should_sleep_fn:
+ *
+ * There is a six_lock() variant that takes a function pointer that is called
+ * immediately prior to schedule() when blocking, and may return an error to
+ * abort.
+ *
+ * One possible use for this feature is when objects being locked are part of
+ * a cache and may be reused, and lock ordering is based on a property of the
+ * object that will change when the object is reused - i.e. logical key order.
+ *
+ * If looking up an object in the cache may race with object reuse, and lock
+ * ordering is required to prevent deadlock, object reuse may change the
+ * correct lock order for that object and cause a deadlock. should_sleep_fn
+ * can be used to check if the object is still the object we want and avoid
+ * this deadlock.
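+ *
+ * A hypothetical should_sleep_fn for that case (obj_still_wanted() and
+ * struct my_obj are illustrative, not part of this API):
+ *
+ * static int check_obj(struct six_lock *lock, void *p)
+ * {
+ *         struct my_obj *obj = p;
+ *
+ *         return obj_still_wanted(obj) ? 0 : -EAGAIN;
+ * }
+ *
+ * ret = six_lock_read(&obj->lock, check_obj, obj);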
+ *
+ * Wait list entry interface:
+ *
+ * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a
+ * wait list entry. By embedding six_lock_waiter into another object, and by
+ * traversing lock waitlists, it is then possible for an upper layer to
+ * implement full cycle detection for deadlock avoidance.
+ *
+ * should_sleep_fn should be used for invoking the cycle detector, walking the
+ * graph of held locks to check for a deadlock. The upper layer must track
+ * held locks for each thread, and each thread's held locks must be reachable
+ * from its six_lock_waiter object.
+ *
+ * six_lock_waiter() will add the wait object to the waitlist before retrying to take
+ * the lock, and before calling should_sleep_fn, and the wait object will not
+ * be removed from the waitlist until either the lock has been successfully
+ * acquired, or we aborted because should_sleep_fn returned an error.
+ *
+ * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will
+ * have timestamps in strictly ascending order - this is so the timestamp can
+ * be used as a cursor for lock graph traversal.
+ */
+
+#include <linux/lockdep.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
+#include <linux/osq_lock.h>
+#endif
+
+enum six_lock_type {
+ SIX_LOCK_read,
+ SIX_LOCK_intent,
+ SIX_LOCK_write,
+};
+
+struct six_lock {
+ atomic_t state;
+ u32 seq;
+ unsigned intent_lock_recurse;
+ struct task_struct *owner;
+ unsigned __percpu *readers;
+#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
+ struct optimistic_spin_queue osq;
+#endif
+ raw_spinlock_t wait_lock;
+ struct list_head wait_list;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+};
+
+struct six_lock_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ enum six_lock_type lock_want;
+ bool lock_acquired;
+ u64 start_time;
+};
+
+typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
+
+void six_lock_exit(struct six_lock *lock);
+
+enum six_lock_init_flags {
+ SIX_LOCK_INIT_PCPU = 1U << 0,
+};
+
+void __six_lock_init(struct six_lock *lock, const char *name,
+ struct lock_class_key *key, enum six_lock_init_flags flags);
+
+/**
+ * six_lock_init - initialize a six lock
+ * @lock: lock to initialize
+ * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU
+ */
+#define six_lock_init(lock, flags) \
+do { \
+ static struct lock_class_key __key; \
+ \
+ __six_lock_init((lock), #lock, &__key, flags); \
+} while (0)
+
+/**
+ * six_lock_seq - obtain current lock sequence number
+ * @lock: six_lock to obtain sequence number for
+ *
+ * @lock should be held for read or intent, and not write
+ *
+ * By saving the lock sequence number, we can unlock @lock and then (typically
+ * after some blocking operation) attempt to relock it: the relock will succeed
+ * if the sequence number hasn't changed, meaning no write locks have been taken
+ * and state corresponding to what @lock protects is still valid.
+ */
+static inline u32 six_lock_seq(const struct six_lock *lock)
+{
+ return lock->seq;
+}
+
+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
+
+/**
+ * six_trylock_type - attempt to take a six lock without blocking
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ *
+ * Return: true on success, false on failure.
+ */
+static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ return six_trylock_ip(lock, type, _THIS_IP_);
+}
+
+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip);
+
+/**
+ * six_lock_waiter - take a lock, with full waitlist interface
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @wait: pointer to wait object, which will be added to lock's waitlist
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ * to scheduling
+ * @p: passed through to @should_sleep_fn
+ *
+ * This is a convenience wrapper around six_lock_ip_waiter(), see that function
+ * for full documentation.
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+ return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_);
+}
+
+/**
+ * six_lock_ip - take a six lock
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ * to scheduling
+ * @p: passed through to @should_sleep_fn
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
+{
+ struct six_lock_waiter wait;
+
+ return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip);
+}
+
+/**
+ * six_lock_type - take a six lock
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ * to scheduling
+ * @p: passed through to @should_sleep_fn
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+ struct six_lock_waiter wait;
+
+ return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_);
+}
+
+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
+ unsigned seq, unsigned long ip);
+
+/**
+ * six_relock_type - attempt to re-take a lock that was held previously
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @seq: lock sequence number obtained from six_lock_seq() while lock was
+ * held previously
+ *
+ * Return: true on success, false on failure.
+ */
+static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
+ unsigned seq)
+{
+ return six_relock_ip(lock, type, seq, _THIS_IP_);
+}
+
+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
+
+/**
+ * six_unlock_type - drop a six lock
+ * @lock: lock to unlock
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ *
+ * When a lock is held multiple times (because six_lock_increment() was used),
+ * this decrements the 'lock held' counter by one.
+ *
+ * For example:
+ * six_lock_read(&foo->lock, NULL, NULL);          read count 1
+ * six_lock_increment(&foo->lock, SIX_LOCK_read);  read count 2
+ * six_unlock_type(&foo->lock, SIX_LOCK_read);     read count 1
+ * six_unlock_type(&foo->lock, SIX_LOCK_read);     read count 0
+ */
+static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ six_unlock_ip(lock, type, _THIS_IP_);
+}
+
+#define __SIX_LOCK(type) \
+static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\
+{ \
+ return six_trylock_ip(lock, SIX_LOCK_##type, ip); \
+} \
+ \
+static inline bool six_trylock_##type(struct six_lock *lock) \
+{ \
+ return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \
+} \
+ \
+static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \
+ struct six_lock_waiter *wait, \
+ six_lock_should_sleep_fn should_sleep_fn, void *p,\
+ unsigned long ip) \
+{ \
+ return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\
+} \
+ \
+static inline int six_lock_ip_##type(struct six_lock *lock, \
+ six_lock_should_sleep_fn should_sleep_fn, void *p, \
+ unsigned long ip) \
+{ \
+ return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\
+} \
+ \
+static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\
+{ \
+ return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \
+} \
+ \
+static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \
+{ \
+ return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \
+} \
+ \
+static inline int six_lock_##type(struct six_lock *lock, \
+ six_lock_should_sleep_fn fn, void *p)\
+{ \
+ return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \
+} \
+ \
+static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \
+{ \
+ six_unlock_ip(lock, SIX_LOCK_##type, ip); \
+} \
+ \
+static inline void six_unlock_##type(struct six_lock *lock) \
+{ \
+ six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \
+}
+
+__SIX_LOCK(read)
+__SIX_LOCK(intent)
+__SIX_LOCK(write)
+#undef __SIX_LOCK
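+
+/*
+ * Example (illustrative sketch; 'foo' is a hypothetical structure embedding a
+ * struct six_lock): the per-type wrappers generated above are the usual way
+ * to take and drop locks when no should_sleep_fn callback is needed:
+ *
+ *   if (six_trylock_read(&foo->lock)) {
+ *           ... read-only access to foo ...
+ *           six_unlock_read(&foo->lock);
+ *   }
+ *
+ *   six_lock_intent(&foo->lock, NULL, NULL);
+ *   ... foo may be read here, and the lock later taken for write ...
+ *   six_unlock_intent(&foo->lock);
+ */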
+
+void six_lock_downgrade(struct six_lock *);
+bool six_lock_tryupgrade(struct six_lock *);
+bool six_trylock_convert(struct six_lock *, enum six_lock_type,
+ enum six_lock_type);
+
+void six_lock_increment(struct six_lock *, enum six_lock_type);
+
+void six_lock_wakeup_all(struct six_lock *);
+
+struct six_lock_count {
+ unsigned n[3];
+};
+
+struct six_lock_count six_lock_counts(struct six_lock *);
+void six_lock_readers_add(struct six_lock *, int);
+
+#endif /* _LINUX_SIX_H */
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
new file mode 100644
index 000000000000..5dac038f0851
--- /dev/null
+++ b/fs/bcachefs/snapshot.c
@@ -0,0 +1,1713 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "errcode.h"
+#include "error.h"
+#include "fs.h"
+#include "snapshot.h"
+
+#include <linux/random.h>
+
+/*
+ * Snapshot trees:
+ *
+ * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
+ * exist to provide a stable identifier for the whole lifetime of a snapshot
+ * tree.
+ */
+
+void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
+
+ prt_printf(out, "subvol %u root snapshot %u",
+ le32_to_cpu(t.v->master_subvol),
+ le32_to_cpu(t.v->root_snapshot));
+}
+
+int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
+ bkey_lt(k.k->p, POS(0, 1)), c, err,
+ snapshot_tree_pos_bad,
+ "bad pos");
+fsck_err:
+ return ret;
+}
+
+int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
+ struct bch_snapshot_tree *s)
+{
+ int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
+ BTREE_ITER_WITH_UPDATES, snapshot_tree, s);
+
+ if (bch2_err_matches(ret, ENOENT))
+ ret = -BCH_ERR_ENOENT_snapshot_tree;
+ return ret;
+}
+
+struct bkey_i_snapshot_tree *
+__bch2_snapshot_tree_create(struct btree_trans *trans)
+{
+ struct btree_iter iter;
+ int ret = bch2_bkey_get_empty_slot(trans, &iter,
+ BTREE_ID_snapshot_trees, POS(0, U32_MAX));
+ struct bkey_i_snapshot_tree *s_t;
+
+ if (ret == -BCH_ERR_ENOSPC_btree_slot)
+ ret = -BCH_ERR_ENOSPC_snapshot_tree;
+ if (ret)
+ return ERR_PTR(ret);
+
+ s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
+ ret = PTR_ERR_OR_ZERO(s_t);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret ? ERR_PTR(ret) : s_t;
+}
+
+static int bch2_snapshot_tree_create(struct btree_trans *trans,
+ u32 root_id, u32 subvol_id, u32 *tree_id)
+{
+ struct bkey_i_snapshot_tree *n_tree =
+ __bch2_snapshot_tree_create(trans);
+
+ if (IS_ERR(n_tree))
+ return PTR_ERR(n_tree);
+
+ n_tree->v.master_subvol = cpu_to_le32(subvol_id);
+ n_tree->v.root_snapshot = cpu_to_le32(root_id);
+ *tree_id = n_tree->k.p.offset;
+ return 0;
+}
+
+/* Snapshot nodes: */
+
+static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
+{
+ struct snapshot_table *t;
+
+ rcu_read_lock();
+ t = rcu_dereference(c->snapshots);
+
+ while (id && id < ancestor)
+ id = __snapshot_t(t, id)->parent;
+ rcu_read_unlock();
+
+ return id == ancestor;
+}
+
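+/*
+ * A snapshot node's skip[] field holds three randomly chosen ancestor IDs
+ * (see bch2_snapshot_skiplist_get()), kept sorted so that
+ * skip[0] <= skip[1] <= skip[2].  Ancestors always have larger IDs than their
+ * descendants, so the ancestor walk below can jump to the largest skip entry
+ * that doesn't overshoot @ancestor instead of walking one parent at a time.
+ */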
+static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
+{
+ const struct snapshot_t *s = __snapshot_t(t, id);
+
+ if (s->skip[2] <= ancestor)
+ return s->skip[2];
+ if (s->skip[1] <= ancestor)
+ return s->skip[1];
+ if (s->skip[0] <= ancestor)
+ return s->skip[0];
+ return s->parent;
+}
+
+bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
+{
+ struct snapshot_table *t;
+ bool ret;
+
+ EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots);
+
+ rcu_read_lock();
+ t = rcu_dereference(c->snapshots);
+
+ while (id && id < ancestor - IS_ANCESTOR_BITMAP)
+ id = get_ancestor_below(t, id, ancestor);
+
+ if (id && id < ancestor) {
+ ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor);
+
+ EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor));
+ } else {
+ ret = id == ancestor;
+ }
+
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
+{
+ size_t idx = U32_MAX - id;
+ size_t new_size;
+ struct snapshot_table *new, *old;
+
+ new_size = max(16UL, roundup_pow_of_two(idx + 1));
+
+ new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ old = rcu_dereference_protected(c->snapshots, true);
+ if (old)
+ memcpy(new->s,
+ rcu_dereference_protected(c->snapshots, true)->s,
+ sizeof(new->s[0]) * c->snapshot_table_size);
+
+ rcu_assign_pointer(c->snapshots, new);
+ c->snapshot_table_size = new_size;
+ kvfree_rcu_mightsleep(old);
+
+ return &rcu_dereference_protected(c->snapshots, true)->s[idx];
+}
+
+static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
+{
+ size_t idx = U32_MAX - id;
+
+ lockdep_assert_held(&c->snapshot_table_lock);
+
+ if (likely(idx < c->snapshot_table_size))
+ return &rcu_dereference_protected(c->snapshots, true)->s[idx];
+
+ return __snapshot_t_mut(c, id);
+}
+
+void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
+
+ prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
+ BCH_SNAPSHOT_SUBVOL(s.v),
+ BCH_SNAPSHOT_DELETED(s.v),
+ le32_to_cpu(s.v->parent),
+ le32_to_cpu(s.v->children[0]),
+ le32_to_cpu(s.v->children[1]),
+ le32_to_cpu(s.v->subvol),
+ le32_to_cpu(s.v->tree));
+
+ if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
+ prt_printf(out, " depth %u skiplist %u %u %u",
+ le32_to_cpu(s.v->depth),
+ le32_to_cpu(s.v->skip[0]),
+ le32_to_cpu(s.v->skip[1]),
+ le32_to_cpu(s.v->skip[2]));
+}
+
+int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_s_c_snapshot s;
+ u32 i, id;
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
+ bkey_lt(k.k->p, POS(0, 1)), c, err,
+ snapshot_pos_bad,
+ "bad pos");
+
+ s = bkey_s_c_to_snapshot(k);
+
+ id = le32_to_cpu(s.v->parent);
+ bkey_fsck_err_on(id && id <= k.k->p.offset, c, err,
+ snapshot_parent_bad,
+ "bad parent node (%u <= %llu)",
+ id, k.k->p.offset);
+
+ bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), c, err,
+ snapshot_children_not_normalized,
+ "children not normalized");
+
+ bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], c, err,
+ snapshot_child_duplicate,
+ "duplicate child nodes");
+
+ for (i = 0; i < 2; i++) {
+ id = le32_to_cpu(s.v->children[i]);
+
+ bkey_fsck_err_on(id >= k.k->p.offset, c, err,
+ snapshot_child_bad,
+ "bad child node (%u >= %llu)",
+ id, k.k->p.offset);
+ }
+
+ if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
+ bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
+ le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), c, err,
+ snapshot_skiplist_not_normalized,
+ "skiplist not normalized");
+
+ for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
+ id = le32_to_cpu(s.v->skip[i]);
+
+ bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), c, err,
+ snapshot_skiplist_bad,
+ "bad skiplist node %u", id);
+ }
+ }
+fsck_err:
+ return ret;
+}
+
+static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
+{
+ struct snapshot_t *t = snapshot_t_mut(c, id);
+ u32 parent = id;
+
+ while ((parent = bch2_snapshot_parent_early(c, parent)) &&
+ parent - id - 1 < IS_ANCESTOR_BITMAP)
+ __set_bit(parent - id - 1, t->is_ancestor);
+}
+
+static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
+{
+ mutex_lock(&c->snapshot_table_lock);
+ __set_is_ancestor_bitmap(c, id);
+ mutex_unlock(&c->snapshot_table_lock);
+}
+
+int bch2_mark_snapshot(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct snapshot_t *t;
+ u32 id = new.k->p.offset;
+ int ret = 0;
+
+ mutex_lock(&c->snapshot_table_lock);
+
+ t = snapshot_t_mut(c, id);
+ if (!t) {
+ ret = -BCH_ERR_ENOMEM_mark_snapshot;
+ goto err;
+ }
+
+ if (new.k->type == KEY_TYPE_snapshot) {
+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
+
+ t->parent = le32_to_cpu(s.v->parent);
+ t->children[0] = le32_to_cpu(s.v->children[0]);
+ t->children[1] = le32_to_cpu(s.v->children[1]);
+ t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
+ t->tree = le32_to_cpu(s.v->tree);
+
+ if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
+ t->depth = le32_to_cpu(s.v->depth);
+ t->skip[0] = le32_to_cpu(s.v->skip[0]);
+ t->skip[1] = le32_to_cpu(s.v->skip[1]);
+ t->skip[2] = le32_to_cpu(s.v->skip[2]);
+ } else {
+ t->depth = 0;
+ t->skip[0] = 0;
+ t->skip[1] = 0;
+ t->skip[2] = 0;
+ }
+
+ __set_is_ancestor_bitmap(c, id);
+
+ if (BCH_SNAPSHOT_DELETED(s.v)) {
+ set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
+ bch2_delete_dead_snapshots_async(c);
+ }
+ } else {
+ memset(t, 0, sizeof(*t));
+ }
+err:
+ mutex_unlock(&c->snapshot_table_lock);
+ return ret;
+}
+
+int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
+ struct bch_snapshot *s)
+{
+ return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
+ BTREE_ITER_WITH_UPDATES, snapshot, s);
+}
+
+static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
+{
+ struct bch_snapshot v;
+ int ret;
+
+ if (!id)
+ return 0;
+
+ ret = bch2_snapshot_lookup(trans, id, &v);
+ if (bch2_err_matches(ret, ENOENT))
+ bch_err(trans->c, "snapshot node %u not found", id);
+ if (ret)
+ return ret;
+
+ return !BCH_SNAPSHOT_DELETED(&v);
+}
+
+/*
+ * If @k is a snapshot with just one live child, it's part of a linear chain,
+ * which we consider to be an equivalence class: after snapshot deletion
+ * cleanup, there should be only a single key at a given position in
+ * this equivalence class.
+ *
+ * This sets the equivalence class of @k to be the child's equivalence class, if
+ * it's part of such a linear chain: this correctly sets equivalence classes on
+ * startup if we run leaf to root (i.e. in natural key order).
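+ *
+ * For example, take a linear chain where 7 is the root, 5 its only child and
+ * 3 the leaf (children always have smaller IDs than their parents).
+ * Processing in key order - 3, then 5, then 7:
+ *
+ *   3: no live children        -> equiv(3) = 3
+ *   5: one live child (3)      -> equiv(5) = equiv(3) = 3
+ *   7: one live child (5)      -> equiv(7) = equiv(5) = 3
+ *
+ * i.e. every node in the chain ends up in the equivalence class of the
+ * leaf-most node.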
+ */
+static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ unsigned i, nr_live = 0, live_idx = 0;
+ struct bkey_s_c_snapshot snap;
+ u32 id = k.k->p.offset, child[2];
+
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
+
+ snap = bkey_s_c_to_snapshot(k);
+
+ child[0] = le32_to_cpu(snap.v->children[0]);
+ child[1] = le32_to_cpu(snap.v->children[1]);
+
+ for (i = 0; i < 2; i++) {
+ int ret = bch2_snapshot_live(trans, child[i]);
+
+ if (ret < 0)
+ return ret;
+
+ if (ret)
+ live_idx = i;
+ nr_live += ret;
+ }
+
+ mutex_lock(&c->snapshot_table_lock);
+
+ snapshot_t_mut(c, id)->equiv = nr_live == 1
+ ? snapshot_t_mut(c, child[live_idx])->equiv
+ : id;
+
+ mutex_unlock(&c->snapshot_table_lock);
+
+ return 0;
+}
+
+/* fsck: */
+
+static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
+{
+ return snapshot_t(c, id)->children[child];
+}
+
+static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
+{
+ return bch2_snapshot_child(c, id, 0);
+}
+
+static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
+{
+ return bch2_snapshot_child(c, id, 1);
+}
+
+static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
+{
+ u32 n, parent;
+
+ n = bch2_snapshot_left_child(c, id);
+ if (n)
+ return n;
+
+ while ((parent = bch2_snapshot_parent(c, id))) {
+ n = bch2_snapshot_right_child(c, parent);
+ if (n && n != id)
+ return n;
+ id = parent;
+ }
+
+ return 0;
+}
+
+static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
+{
+ u32 id = snapshot_root;
+ u32 subvol = 0, s;
+
+ while (id) {
+ s = snapshot_t(c, id)->subvol;
+
+ if (s && (!subvol || s < subvol))
+ subvol = s;
+
+ id = bch2_snapshot_tree_next(c, id);
+ }
+
+ return subvol;
+}
+
+static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
+ u32 snapshot_root, u32 *subvol_id)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_subvolume s;
+ bool found = false;
+ int ret;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
+ 0, k, ret) {
+ if (k.k->type != KEY_TYPE_subvolume)
+ continue;
+
+ s = bkey_s_c_to_subvolume(k);
+ if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
+ continue;
+ if (!BCH_SUBVOLUME_SNAP(s.v)) {
+ *subvol_id = s.k->p.offset;
+ found = true;
+ break;
+ }
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (!ret && !found) {
+ struct bkey_i_subvolume *u;
+
+ *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
+
+ u = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, *subvol_id),
+ 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
+
+ SET_BCH_SUBVOLUME_SNAP(&u->v, false);
+ }
+
+ return ret;
+}
+
+static int check_snapshot_tree(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_snapshot_tree st;
+ struct bch_snapshot s;
+ struct bch_subvolume subvol;
+ struct printbuf buf = PRINTBUF;
+ u32 root_id;
+ int ret;
+
+ if (k.k->type != KEY_TYPE_snapshot_tree)
+ return 0;
+
+ st = bkey_s_c_to_snapshot_tree(k);
+ root_id = le32_to_cpu(st.v->root_snapshot);
+
+ ret = bch2_snapshot_lookup(trans, root_id, &s);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ if (fsck_err_on(ret ||
+ root_id != bch2_snapshot_root(c, root_id) ||
+ st.k->p.offset != le32_to_cpu(s.tree),
+ c, snapshot_tree_to_missing_snapshot,
+ "snapshot tree points to missing/incorrect snapshot:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter, 0);
+ goto err;
+ }
+
+ ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol),
+ false, 0, &subvol);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ if (fsck_err_on(ret,
+ c, snapshot_tree_to_missing_subvol,
+ "snapshot tree points to missing subvolume:\n %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
+ fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
+ le32_to_cpu(subvol.snapshot),
+ root_id),
+ c, snapshot_tree_to_wrong_subvol,
+ "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
+ fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
+ c, snapshot_tree_to_snapshot_subvol,
+ "snapshot tree points to snapshot subvolume:\n %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
+ struct bkey_i_snapshot_tree *u;
+ u32 subvol_id;
+
+ ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
+ if (ret)
+ goto err;
+
+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto err;
+
+ u->v.master_subvol = cpu_to_le32(subvol_id);
+ st = snapshot_tree_i_to_s_c(u);
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/*
+ * For each snapshot_tree, make sure it points to the root of a snapshot tree
+ * and that the root snapshot points back to it; otherwise delete it.
+ *
+ * And, make sure it points to a subvolume within that snapshot tree, or correct
+ * it to point to the oldest subvolume within that snapshot tree.
+ */
+int bch2_check_snapshot_trees(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_snapshot_trees, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_snapshot_tree(trans, &iter, k)));
+
+ if (ret)
+ bch_err(c, "error %i checking snapshot trees", ret);
+ return ret;
+}
+
+/*
+ * Look up the snapshot tree for @tree_id, find its root,
+ * and make sure @snap_id is a descendant:
+ */
+static int snapshot_tree_ptr_good(struct btree_trans *trans,
+ u32 snap_id, u32 tree_id)
+{
+ struct bch_snapshot_tree s_t;
+ int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
+
+ if (bch2_err_matches(ret, ENOENT))
+ return 0;
+ if (ret)
+ return ret;
+
+ return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
+}
+
+u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
+{
+ const struct snapshot_t *s;
+
+ if (!id)
+ return 0;
+
+ rcu_read_lock();
+ s = snapshot_t(c, id);
+ if (s->parent)
+ id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
+ rcu_read_unlock();
+
+ return id;
+}
+
+static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
+{
+ unsigned i;
+
+ for (i = 0; i < 3; i++)
+ if (!s.parent) {
+ if (s.skip[i])
+ return false;
+ } else {
+ if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * The snapshot_tree pointer was incorrect: look up the root snapshot node,
+ * make sure its snapshot_tree pointer is correct (allocating a new one if
+ * necessary), then update this node's pointer to match the root node's:
+ */
+static int snapshot_tree_ptr_repair(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bch_snapshot *s)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter root_iter;
+ struct bch_snapshot_tree s_t;
+ struct bkey_s_c_snapshot root;
+ struct bkey_i_snapshot *u;
+ u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
+ int ret;
+
+ root = bch2_bkey_get_iter_typed(trans, &root_iter,
+ BTREE_ID_snapshots, POS(0, root_id),
+ BTREE_ITER_WITH_UPDATES, snapshot);
+ ret = bkey_err(root);
+ if (ret)
+ goto err;
+
+ tree_id = le32_to_cpu(root.v->tree);
+
+ ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+
+ if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
+ u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(u) ?:
+ bch2_snapshot_tree_create(trans, root_id,
+ bch2_snapshot_tree_oldest_subvol(c, root_id),
+ &tree_id);
+ if (ret)
+ goto err;
+
+ u->v.tree = cpu_to_le32(tree_id);
+ if (k.k->p.offset == root_id)
+ *s = u->v;
+ }
+
+ if (k.k->p.offset != root_id) {
+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto err;
+
+ u->v.tree = cpu_to_le32(tree_id);
+ *s = u->v;
+ }
+err:
+ bch2_trans_iter_exit(trans, &root_iter);
+ return ret;
+}
+
+static int check_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_snapshot s;
+ struct bch_subvolume subvol;
+ struct bch_snapshot v;
+ struct bkey_i_snapshot *u;
+ u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
+ u32 real_depth;
+ struct printbuf buf = PRINTBUF;
+ bool should_have_subvol;
+ u32 i, id;
+ int ret = 0;
+
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
+
+ memset(&s, 0, sizeof(s));
+ memcpy(&s, k.v, bkey_val_bytes(k.k));
+
+ id = le32_to_cpu(s.parent);
+ if (id) {
+ ret = bch2_snapshot_lookup(trans, id, &v);
+ if (bch2_err_matches(ret, ENOENT))
+ bch_err(c, "snapshot with nonexistent parent:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ if (ret)
+ goto err;
+
+ if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
+ le32_to_cpu(v.children[1]) != k.k->p.offset) {
+ bch_err(c, "snapshot parent %u missing pointer to child %llu",
+ id, k.k->p.offset);
+ ret = -EINVAL;
+ goto err;
+ }
+ }
+
+ for (i = 0; i < 2 && s.children[i]; i++) {
+ id = le32_to_cpu(s.children[i]);
+
+ ret = bch2_snapshot_lookup(trans, id, &v);
+ if (bch2_err_matches(ret, ENOENT))
+ bch_err(c, "snapshot node %llu has nonexistent child %u",
+ k.k->p.offset, id);
+ if (ret)
+ goto err;
+
+ if (le32_to_cpu(v.parent) != k.k->p.offset) {
+ bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
+ id, le32_to_cpu(v.parent), k.k->p.offset);
+ ret = -EINVAL;
+ goto err;
+ }
+ }
+
+ should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
+ !BCH_SNAPSHOT_DELETED(&s);
+
+ if (should_have_subvol) {
+ id = le32_to_cpu(s.subvol);
+ ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
+ if (bch2_err_matches(ret, ENOENT))
+ bch_err(c, "snapshot points to nonexistent subvolume:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ if (ret)
+ goto err;
+
+ if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
+ bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
+ k.k->p.offset);
+ ret = -EINVAL;
+ goto err;
+ }
+ } else {
+ if (fsck_err_on(s.subvol,
+ c, snapshot_should_not_have_subvol,
+ "snapshot should not point to subvol:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto err;
+
+ u->v.subvol = 0;
+ s = u->v;
+ }
+ }
+
+ ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
+ if (ret < 0)
+ goto err;
+
+ if (fsck_err_on(!ret, c, snapshot_to_bad_snapshot_tree,
+ "snapshot points to missing/incorrect tree:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
+ if (ret)
+ goto err;
+ }
+ ret = 0;
+
+ real_depth = bch2_snapshot_depth(c, parent_id);
+
+ if (le32_to_cpu(s.depth) != real_depth &&
+ (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
+ fsck_err(c, snapshot_bad_depth,
+ "snapshot with incorrect depth field, should be %u:\n %s",
+ real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto err;
+
+ u->v.depth = cpu_to_le32(real_depth);
+ s = u->v;
+ }
+
+ ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
+ if (ret < 0)
+ goto err;
+
+ if (!ret &&
+ (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
+ fsck_err(c, snapshot_bad_skiplist,
+ "snapshot with bad skiplist field:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
+ u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));
+
+ bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
+ s = u->v;
+ }
+ ret = 0;
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_check_snapshots(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ /*
+ * We iterate backwards as checking/fixing the depth field requires that
+ * the parent's depth already be correct:
+ */
+ ret = bch2_trans_run(c,
+ for_each_btree_key_reverse_commit(trans, iter,
+ BTREE_ID_snapshots, POS_MAX,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_snapshot(trans, &iter, k)));
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+/*
+ * Mark a snapshot as deleted, for future cleanup:
+ */
+int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
+{
+ struct btree_iter iter;
+ struct bkey_i_snapshot *s;
+ int ret = 0;
+
+ s = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_snapshots, POS(0, id),
+ 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(s);
+ if (unlikely(ret)) {
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
+ trans->c, "missing snapshot %u", id);
+ return ret;
+ }
+
+ /* already deleted? */
+ if (BCH_SNAPSHOT_DELETED(&s->v))
+ goto err;
+
+ SET_BCH_SNAPSHOT_DELETED(&s->v, true);
+ SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
+ s->v.subvol = 0;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
+{
+ if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
+ swap(s->children[0], s->children[1]);
+}
+
+static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
+ struct btree_iter c_iter = (struct btree_iter) { NULL };
+ struct btree_iter tree_iter = (struct btree_iter) { NULL };
+ struct bkey_s_c_snapshot s;
+ u32 parent_id, child_id;
+ unsigned i;
+ int ret = 0;
+
+ s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+ BTREE_ITER_INTENT, snapshot);
+ ret = bkey_err(s);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+ "missing snapshot %u", id);
+
+ if (ret)
+ goto err;
+
+ BUG_ON(s.v->children[1]);
+
+ parent_id = le32_to_cpu(s.v->parent);
+ child_id = le32_to_cpu(s.v->children[0]);
+
+ if (parent_id) {
+ struct bkey_i_snapshot *parent;
+
+ parent = bch2_bkey_get_mut_typed(trans, &p_iter,
+ BTREE_ID_snapshots, POS(0, parent_id),
+ 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(parent);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+ "missing snapshot %u", parent_id);
+ if (unlikely(ret))
+ goto err;
+
+ /* find entry in parent->children for node being deleted */
+ for (i = 0; i < 2; i++)
+ if (le32_to_cpu(parent->v.children[i]) == id)
+ break;
+
+ if (bch2_fs_inconsistent_on(i == 2, c,
+ "snapshot %u missing child pointer to %u",
+ parent_id, id))
+ goto err;
+
+ parent->v.children[i] = cpu_to_le32(child_id);
+
+ normalize_snapshot_child_pointers(&parent->v);
+ }
+
+ if (child_id) {
+ struct bkey_i_snapshot *child;
+
+ child = bch2_bkey_get_mut_typed(trans, &c_iter,
+ BTREE_ID_snapshots, POS(0, child_id),
+ 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(child);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+ "missing snapshot %u", child_id);
+ if (unlikely(ret))
+ goto err;
+
+ child->v.parent = cpu_to_le32(parent_id);
+
+ if (!child->v.parent) {
+ child->v.skip[0] = 0;
+ child->v.skip[1] = 0;
+ child->v.skip[2] = 0;
+ }
+ }
+
+ if (!parent_id) {
+ /*
+ * We're deleting the root of a snapshot tree: update the
+ * snapshot_tree entry to point to the new root, or delete it if
+ * this is the last snapshot ID in this tree:
+ */
+ struct bkey_i_snapshot_tree *s_t;
+
+ BUG_ON(s.v->children[1]);
+
+ s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
+ BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
+ 0, snapshot_tree);
+ ret = PTR_ERR_OR_ZERO(s_t);
+ if (ret)
+ goto err;
+
+ if (s.v->children[0]) {
+ s_t->v.root_snapshot = s.v->children[0];
+ } else {
+ s_t->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&s_t->k, 0);
+ }
+ }
+
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &tree_iter);
+ bch2_trans_iter_exit(trans, &p_iter);
+ bch2_trans_iter_exit(trans, &c_iter);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
+ u32 *new_snapids,
+ u32 *snapshot_subvols,
+ unsigned nr_snapids)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_i_snapshot *n;
+ struct bkey_s_c k;
+ unsigned i, j;
+ u32 depth = bch2_snapshot_depth(c, parent);
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
+ POS_MIN, BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < nr_snapids; i++) {
+ k = bch2_btree_iter_prev_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k || !k.k->p.offset) {
+ ret = -BCH_ERR_ENOSPC_snapshot_create;
+ goto err;
+ }
+
+ n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ n->v.flags = 0;
+ n->v.parent = cpu_to_le32(parent);
+ n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
+ n->v.tree = cpu_to_le32(tree);
+ n->v.depth = cpu_to_le32(depth);
+
+ for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
+ n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
+
+ bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
+ SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
+
+ ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
+ bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
+ if (ret)
+ goto err;
+
+ new_snapids[i] = iter.pos.offset;
+
+ mutex_lock(&c->snapshot_table_lock);
+ snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i];
+ mutex_unlock(&c->snapshot_table_lock);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/*
+ * Create new snapshot IDs as children of an existing snapshot ID:
+ */
+static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
+ u32 *new_snapids,
+ u32 *snapshot_subvols,
+ unsigned nr_snapids)
+{
+ struct btree_iter iter;
+ struct bkey_i_snapshot *n_parent;
+ int ret = 0;
+
+ n_parent = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_snapshots, POS(0, parent),
+ 0, snapshot);
+ ret = PTR_ERR_OR_ZERO(n_parent);
+ if (unlikely(ret)) {
+ if (bch2_err_matches(ret, ENOENT))
+ bch_err(trans->c, "snapshot %u not found", parent);
+ return ret;
+ }
+
+ if (n_parent->v.children[0] || n_parent->v.children[1]) {
+ bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
+ new_snapids, snapshot_subvols, nr_snapids);
+ if (ret)
+ goto err;
+
+ n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
+ n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
+ n_parent->v.subvol = 0;
+ SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+/*
+ * Create a snapshot node that is the root of a new tree:
+ */
+static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
+ u32 *new_snapids,
+ u32 *snapshot_subvols,
+ unsigned nr_snapids)
+{
+ struct bkey_i_snapshot_tree *n_tree;
+ int ret;
+
+ n_tree = __bch2_snapshot_tree_create(trans);
+ ret = PTR_ERR_OR_ZERO(n_tree) ?:
+ create_snapids(trans, 0, n_tree->k.p.offset,
+ new_snapids, snapshot_subvols, nr_snapids);
+ if (ret)
+ return ret;
+
+ n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
+ n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
+ return 0;
+}
+
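+/*
+ * Creating the root of a new tree (parent == 0) allocates a single snapshot
+ * ID; creating children of an existing node allocates exactly two, since
+ * snapshotting a subvolume requires one ID for the new snapshot and one for
+ * the subvolume to continue writing under.
+ */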
+int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
+ u32 *new_snapids,
+ u32 *snapshot_subvols,
+ unsigned nr_snapids)
+{
+ BUG_ON((parent == 0) != (nr_snapids == 1));
+ BUG_ON((parent != 0) != (nr_snapids == 2));
+
+ return parent
+ ? bch2_snapshot_node_create_children(trans, parent,
+ new_snapids, snapshot_subvols, nr_snapids)
+ : bch2_snapshot_node_create_tree(trans,
+ new_snapids, snapshot_subvols, nr_snapids);
+
+}
+
+/*
+ * If we have an unlinked inode in an internal snapshot node, and the inode
+ * really has been deleted in all child snapshots, how does this get cleaned up?
+ *
+ * first there is the problem of how keys that have been overwritten in all
+ * child snapshots get deleted (unimplemented?), but inodes may perhaps be
+ * special?
+ *
+ * also: unlinked inode in internal snapshot appears to not be getting deleted
+ * correctly if inode doesn't exist in leaf snapshots
+ *
+ * solution:
+ *
+ * for a key in an interior snapshot node that needs work to be done that
+ * requires it to be mutated: iterate over all descendant leaf nodes and copy
+ * that key to snapshot leaf nodes, where we can mutate it
+ */
+
+static int snapshot_delete_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ snapshot_id_list *deleted,
+ snapshot_id_list *equiv_seen,
+ struct bpos *last_pos)
+{
+ struct bch_fs *c = trans->c;
+ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+ if (!bkey_eq(k.k->p, *last_pos))
+ equiv_seen->nr = 0;
+ *last_pos = k.k->p;
+
+ if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
+ snapshot_list_has_id(equiv_seen, equiv)) {
+ return bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ } else {
+ return snapshot_list_add(c, equiv_seen, equiv);
+ }
+}
+
+static int move_key_to_correct_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+ /*
+ * When we have a linear chain of snapshot nodes, we consider
+ * those to form an equivalence class: we're going to collapse
+ * them all down to a single node, and keep the leaf-most node -
+ * which has the same id as the equivalence class id.
+ *
+ * If there are multiple keys in different snapshots at the same
+ * position, we're only going to keep the one in the newest
+ * snapshot - the rest have been overwritten and are redundant,
+ * and for the key we're going to keep we need to move it to the
+ * equivalence class ID if it's not there already.
+ */
+ if (equiv != k.k->p.snapshot) {
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ struct btree_iter new_iter;
+ int ret;
+
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ new->k.p.snapshot = equiv;
+
+ bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+
+ ret = bch2_btree_iter_traverse(&new_iter) ?:
+ bch2_trans_update(trans, &new_iter, new,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_trans_iter_exit(trans, &new_iter);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bkey_s_c_snapshot snap;
+ u32 children[2];
+ int ret;
+
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
+
+ snap = bkey_s_c_to_snapshot(k);
+ if (BCH_SNAPSHOT_DELETED(snap.v) ||
+ BCH_SNAPSHOT_SUBVOL(snap.v))
+ return 0;
+
+ children[0] = le32_to_cpu(snap.v->children[0]);
+ children[1] = le32_to_cpu(snap.v->children[1]);
+
+ ret = bch2_snapshot_live(trans, children[0]) ?:
+ bch2_snapshot_live(trans, children[1]);
+ if (ret < 0)
+ return ret;
+ return !ret;
+}
+
+/*
+ * For a given snapshot, if no subvolume points to it and it has no child
+ * snapshot nodes, it's redundant and we can mark it as deleted.
+ */
+static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k)
+{
+ int ret = bch2_snapshot_needs_delete(trans, k);
+
+ return ret <= 0
+ ? ret
+ : bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
+}
+
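+/*
+ * Like bch2_snapshot_nth_parent(), but ancestors in @skip are not counted
+ * (and are never returned): used below to repoint skiplist entries away from
+ * interior nodes that are about to be deleted.
+ */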
+static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
+ snapshot_id_list *skip)
+{
+ rcu_read_lock();
+ while (snapshot_list_has_id(skip, id))
+ id = __bch2_snapshot_parent(c, id);
+
+ while (n--) {
+ do {
+ id = __bch2_snapshot_parent(c, id);
+ } while (snapshot_list_has_id(skip, id));
+ }
+ rcu_read_unlock();
+
+ return id;
+}
+
+static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter, struct bkey_s_c k,
+ snapshot_id_list *deleted)
+{
+ struct bch_fs *c = trans->c;
+ u32 nr_deleted_ancestors = 0;
+ struct bkey_i_snapshot *s;
+ u32 *i;
+ int ret;
+
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
+
+ if (snapshot_list_has_id(deleted, k.k->p.offset))
+ return 0;
+
+ s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
+ ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ return ret;
+
+ darray_for_each(*deleted, i)
+ nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i);
+
+ if (!nr_deleted_ancestors)
+ return 0;
+
+ le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);
+
+ if (!s->v.depth) {
+ s->v.skip[0] = 0;
+ s->v.skip[1] = 0;
+ s->v.skip[2] = 0;
+ } else {
+ u32 depth = le32_to_cpu(s->v.depth);
+ u32 parent = bch2_snapshot_parent(c, s->k.p.offset);
+
+ for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
+ u32 id = le32_to_cpu(s->v.skip[j]);
+
+ if (snapshot_list_has_id(deleted, id)) {
+ id = bch2_snapshot_nth_parent_skip(c,
+ parent,
+ depth > 1
+ ? get_random_u32_below(depth - 1)
+ : 0,
+ deleted);
+ s->v.skip[j] = cpu_to_le32(id);
+ }
+ }
+
+ bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
+ }
+
+ return bch2_trans_update(trans, iter, &s->k_i, 0);
+}
+
+int bch2_delete_dead_snapshots(struct bch_fs *c)
+{
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_snapshot snap;
+ snapshot_id_list deleted = { 0 };
+ snapshot_id_list deleted_interior = { 0 };
+ u32 *i, id;
+ int ret = 0;
+
+ if (!test_and_clear_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags))
+ return 0;
+
+ if (!test_bit(BCH_FS_STARTED, &c->flags)) {
+ ret = bch2_fs_read_write_early(c);
+ if (ret) {
+ bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
+ return ret;
+ }
+ }
+
+ trans = bch2_trans_get(c);
+
+ /*
+ * For every snapshot node: if it has no live children and it's not
+ * pointed to by a subvolume, delete it:
+ */
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ NULL, NULL, 0,
+ bch2_delete_redundant_snapshot(trans, k));
+ if (ret) {
+ bch_err_msg(c, ret, "deleting redundant snapshots");
+ goto err;
+ }
+
+ ret = for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ bch2_snapshot_set_equiv(trans, k));
+ if (ret) {
+ bch_err_msg(c, ret, "in bch2_snapshot_set_equiv");
+ goto err;
+ }
+
+ for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_snapshot)
+ continue;
+
+ snap = bkey_s_c_to_snapshot(k);
+ if (BCH_SNAPSHOT_DELETED(snap.v)) {
+ ret = snapshot_list_add(c, &deleted, k.k->p.offset);
+ if (ret)
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret) {
+ bch_err_msg(c, ret, "walking snapshots");
+ goto err;
+ }
+
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ struct bpos last_pos = POS_MIN;
+ snapshot_id_list equiv_seen = { 0 };
+ struct disk_reservation res = { 0 };
+
+ if (!btree_type_has_snapshots(id))
+ continue;
+
+ /*
+ * The deleted inodes btree is maintained by a trigger on the inodes
+ * btree - no work for us to do here, and it's not safe to scan
+ * it because we'll see out of date keys due to the btree write
+ * buffer:
+ */
+ if (id == BTREE_ID_deleted_inodes)
+ continue;
+
+ ret = for_each_btree_key_commit(trans, iter,
+ id, POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ &res, NULL, BTREE_INSERT_NOFAIL,
+ snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
+ for_each_btree_key_commit(trans, iter,
+ id, POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ &res, NULL, BTREE_INSERT_NOFAIL,
+ move_key_to_correct_snapshot(trans, &iter, k));
+
+ bch2_disk_reservation_put(c, &res);
+ darray_exit(&equiv_seen);
+
+ if (ret) {
+ bch_err_msg(c, ret, "deleting keys from dying snapshots");
+ goto err;
+ }
+ }
+
+ bch2_trans_unlock(trans);
+ down_write(&c->snapshot_create_lock);
+
+ for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ret) {
+ u32 snapshot = k.k->p.offset;
+ u32 equiv = bch2_snapshot_equiv(c, snapshot);
+
+ if (equiv != snapshot)
+ snapshot_list_add(c, &deleted_interior, snapshot);
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ goto err_create_lock;
+
+ /*
+ * Fixing children of deleted snapshots can't be done completely
+ * atomically: if we crash between here and when we delete the interior
+ * nodes, some depth fields will be off:
+ */
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
+ BTREE_ITER_INTENT, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
+ if (ret)
+ goto err_create_lock;
+
+ darray_for_each(deleted, i) {
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_snapshot_node_delete(trans, *i));
+ if (ret) {
+ bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ goto err_create_lock;
+ }
+ }
+
+ darray_for_each(deleted_interior, i) {
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_snapshot_node_delete(trans, *i));
+ if (ret) {
+ bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ goto err_create_lock;
+ }
+ }
+err_create_lock:
+ up_write(&c->snapshot_create_lock);
+err:
+ darray_exit(&deleted_interior);
+ darray_exit(&deleted);
+ bch2_trans_put(trans);
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+void bch2_delete_dead_snapshots_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
+
+ bch2_delete_dead_snapshots(c);
+ bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
+}
+
+void bch2_delete_dead_snapshots_async(struct bch_fs *c)
+{
+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
+ !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
+}
+
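+/*
+ * Returns 1 if the key at @pos has been overwritten in a descendant snapshot,
+ * i.e. if another key exists at the same inode:offset whose snapshot ID is a
+ * descendant of @pos.snapshot; 0 if not, or a negative error code.
+ */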
+int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
+ enum btree_id id,
+ struct bpos pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, id, pos,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ while (1) {
+ k = bch2_btree_iter_prev(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ break;
+
+ if (!k.k)
+ break;
+
+ if (!bkey_eq(pos, k.k->p))
+ break;
+
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
+ ret = 1;
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id)
+{
+ const struct snapshot_t *s = snapshot_t(c, id);
+
+ return s->children[1] ?: s->children[0];
+}
+
+static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id)
+{
+ u32 child;
+
+ while ((child = bch2_snapshot_smallest_child(c, id)))
+ id = child;
+ return id;
+}
+
+static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bkey_s_c interior_k,
+ u32 leaf_id, struct bpos *new_min_pos)
+{
+ struct btree_iter iter;
+ struct bpos pos = interior_k.k->p;
+ struct bkey_s_c k;
+ struct bkey_i *new;
+ int ret;
+
+ pos.snapshot = leaf_id;
+
+ bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto out;
+
+ /* key already overwritten in this snapshot? */
+ if (k.k->p.snapshot != interior_k.k->p.snapshot)
+ goto out;
+
+ if (bpos_eq(*new_min_pos, POS_MIN)) {
+ *new_min_pos = k.k->p;
+ new_min_pos->snapshot = leaf_id;
+ }
+
+ new = bch2_bkey_make_mut_noupdate(trans, interior_k);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto out;
+
+ new->k.p.snapshot = leaf_id;
+ ret = bch2_trans_update(trans, &iter, new, 0);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bkey_s_c k,
+ struct bpos *new_min_pos)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_buf sk;
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
+
+ bch2_bkey_buf_init(&sk);
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
+
+ *new_min_pos = POS_MIN;
+
+ for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot);
+ id < k.k->p.snapshot;
+ id++) {
+ if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) ||
+ !bch2_snapshot_is_leaf(c, id))
+ continue;
+again:
+ ret = btree_trans_too_many_iters(trans) ?:
+ bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+ if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ bch2_trans_begin(trans);
+ goto again;
+ }
+
+ if (ret)
+ break;
+ }
+
+ bch2_bkey_buf_exit(&sk, c);
+
+ return ret ?: trans_was_restarted(trans, restart_count);
+}
+
+static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_snapshot snap;
+ int ret = 0;
+
+ if (k.k->type != KEY_TYPE_snapshot)
+ return 0;
+
+ snap = bkey_s_c_to_snapshot(k);
+ if (BCH_SNAPSHOT_DELETED(snap.v) ||
+ bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset ||
+ (ret = bch2_snapshot_needs_delete(trans, k)) > 0) {
+ set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
+ return 0;
+ }
+
+ return ret;
+}
+
+int bch2_snapshots_read(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
+ bch2_snapshot_set_equiv(trans, k) ?:
+ bch2_check_snapshot_needs_deletion(trans, k)) ?:
+ for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ (set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+void bch2_fs_snapshots_exit(struct bch_fs *c)
+{
+ kfree(rcu_dereference_protected(c->snapshots, true));
+}
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
new file mode 100644
index 000000000000..f09a22f44239
--- /dev/null
+++ b/fs/bcachefs/snapshot.h
@@ -0,0 +1,268 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SNAPSHOT_H
+#define _BCACHEFS_SNAPSHOT_H
+
+enum bkey_invalid_flags;
+
+void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+
+#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \
+ .key_invalid = bch2_snapshot_tree_invalid, \
+ .val_to_text = bch2_snapshot_tree_to_text, \
+ .min_val_size = 8, \
+})
+
+struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *);
+
+int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *);
+
+void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s_c, unsigned);
+
+#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \
+ .key_invalid = bch2_snapshot_invalid, \
+ .val_to_text = bch2_snapshot_to_text, \
+ .atomic_trigger = bch2_mark_snapshot, \
+ .min_val_size = 24, \
+})
+
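+/*
+ * The in-memory snapshot table is indexed by U32_MAX - id: snapshot IDs are
+ * allocated counting down from U32_MAX (children always have smaller IDs than
+ * their parents), so this keeps the table small and densely packed.
+ */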
+static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id)
+{
+ return &t->s[U32_MAX - id];
+}
+
+static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
+{
+ return __snapshot_t(rcu_dereference(c->snapshots), id);
+}
+
+static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
+{
+ rcu_read_lock();
+ id = snapshot_t(c, id)->tree;
+ rcu_read_unlock();
+
+ return id;
+}
+
+static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
+{
+ return snapshot_t(c, id)->parent;
+}
+
+static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
+{
+ rcu_read_lock();
+ id = __bch2_snapshot_parent_early(c, id);
+ rcu_read_unlock();
+
+ return id;
+}
+
+static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ u32 parent = snapshot_t(c, id)->parent;
+
+ if (parent &&
+ snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1)
+ panic("id %u depth=%u parent %u depth=%u\n",
+ id, snapshot_t(c, id)->depth,
+ parent, snapshot_t(c, parent)->depth);
+
+ return parent;
+#else
+ return snapshot_t(c, id)->parent;
+#endif
+}
+
+static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
+{
+ rcu_read_lock();
+ id = __bch2_snapshot_parent(c, id);
+ rcu_read_unlock();
+
+ return id;
+}
+
+static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
+{
+ rcu_read_lock();
+ while (n--)
+ id = __bch2_snapshot_parent(c, id);
+ rcu_read_unlock();
+
+ return id;
+}
+
+u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32);
+
+static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
+{
+ u32 parent;
+
+ rcu_read_lock();
+ while ((parent = __bch2_snapshot_parent(c, id)))
+ id = parent;
+ rcu_read_unlock();
+
+ return id;
+}
+
+static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id)
+{
+ return snapshot_t(c, id)->equiv;
+}
+
+static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
+{
+ rcu_read_lock();
+ id = __bch2_snapshot_equiv(c, id);
+ rcu_read_unlock();
+
+ return id;
+}
+
+static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id)
+{
+ return id == bch2_snapshot_equiv(c, id);
+}
+
+static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
+{
+ const struct snapshot_t *s;
+ bool ret;
+
+ rcu_read_lock();
+ s = snapshot_t(c, id);
+ ret = s->children[0];
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
+{
+ return !bch2_snapshot_is_internal_node(c, id);
+}
+
+static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id)
+{
+ const struct snapshot_t *s;
+ u32 parent = __bch2_snapshot_parent(c, id);
+
+ if (!parent)
+ return 0;
+
+ s = snapshot_t(c, __bch2_snapshot_parent(c, id));
+ if (id == s->children[0])
+ return s->children[1];
+ if (id == s->children[1])
+ return s->children[0];
+ return 0;
+}
+
+static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
+{
+ u32 depth;
+
+ rcu_read_lock();
+ depth = parent ? snapshot_t(c, parent)->depth + 1 : 0;
+ rcu_read_unlock();
+
+ return depth;
+}
+
+bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32);
+
+static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
+{
+ return id == ancestor
+ ? true
+ : __bch2_snapshot_is_ancestor(c, id, ancestor);
+}
+
+static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
+{
+ const struct snapshot_t *t;
+ bool ret;
+
+ rcu_read_lock();
+ t = snapshot_t(c, id);
+ ret = (t->children[0]|t->children[1]) != 0;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
+{
+ u32 *i;
+
+ darray_for_each(*s, i)
+ if (*i == id)
+ return true;
+ return false;
+}
+
+static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)
+{
+ u32 *i;
+
+ darray_for_each(*s, i)
+ if (bch2_snapshot_is_ancestor(c, id, *i))
+ return true;
+ return false;
+}
+
+static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
+{
+ int ret;
+
+ BUG_ON(snapshot_list_has_id(s, id));
+ ret = darray_push(s, id);
+ if (ret)
+ bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
+ return ret;
+}
+
+int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
+ struct bch_snapshot *s);
+int bch2_snapshot_get_subvol(struct btree_trans *, u32,
+ struct bch_subvolume *);
+
+/* only exported for tests: */
+int bch2_snapshot_node_create(struct btree_trans *, u32,
+ u32 *, u32 *, unsigned);
+
+int bch2_check_snapshot_trees(struct bch_fs *);
+int bch2_check_snapshots(struct bch_fs *);
+
+int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
+void bch2_delete_dead_snapshots_work(struct work_struct *);
+
+int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos);
+
+static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
+ enum btree_id id,
+ struct bpos pos)
+{
+ if (!btree_type_has_snapshots(id) ||
+ bch2_snapshot_is_leaf(trans->c, pos.snapshot))
+ return 0;
+
+ return __bch2_key_has_snapshot_overwrites(trans, id, pos);
+}
+
+int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id,
+ struct bkey_s_c, struct bpos *);
+
+int bch2_snapshots_read(struct bch_fs *);
+void bch2_fs_snapshots_exit(struct bch_fs *);
+
+#endif /* _BCACHEFS_SNAPSHOT_H */
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
new file mode 100644
index 000000000000..ae21a8cca1b4
--- /dev/null
+++ b/fs/bcachefs/str_hash.h
@@ -0,0 +1,370 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_STR_HASH_H
+#define _BCACHEFS_STR_HASH_H
+
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "checksum.h"
+#include "error.h"
+#include "inode.h"
+#include "siphash.h"
+#include "subvolume.h"
+#include "super.h"
+
+#include <linux/crc32c.h>
+#include <crypto/hash.h>
+#include <crypto/sha2.h>
+
+static inline enum bch_str_hash_type
+bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
+{
+ switch (opt) {
+ case BCH_STR_HASH_OPT_crc32c:
+ return BCH_STR_HASH_crc32c;
+ case BCH_STR_HASH_OPT_crc64:
+ return BCH_STR_HASH_crc64;
+ case BCH_STR_HASH_OPT_siphash:
+ return c->sb.features & (1ULL << BCH_FEATURE_new_siphash)
+ ? BCH_STR_HASH_siphash
+ : BCH_STR_HASH_siphash_old;
+ default:
+ BUG();
+ }
+}
+
+struct bch_hash_info {
+ u8 type;
+ /*
+ * For crc32 or crc64 string hashes the first key value of
+ * the siphash_key (k0) is used as the key.
+ */
+ SIPHASH_KEY siphash_key;
+};
+
+static inline struct bch_hash_info
+bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
+{
+ /* XXX ick */
+ struct bch_hash_info info = {
+ .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) &
+ ~(~0U << INODE_STR_HASH_BITS),
+ .siphash_key = { .k0 = bi->bi_hash_seed }
+ };
+
+ if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
+ SHASH_DESC_ON_STACK(desc, c->sha256);
+ u8 digest[SHA256_DIGEST_SIZE];
+
+ desc->tfm = c->sha256;
+
+ crypto_shash_digest(desc, (void *) &bi->bi_hash_seed,
+ sizeof(bi->bi_hash_seed), digest);
+ memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
+ }
+
+ return info;
+}
+
+struct bch_str_hash_ctx {
+ union {
+ u32 crc32c;
+ u64 crc64;
+ SIPHASH_CTX siphash;
+ };
+};
+
+static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx,
+ const struct bch_hash_info *info)
+{
+ switch (info->type) {
+ case BCH_STR_HASH_crc32c:
+ ctx->crc32c = crc32c(~0, &info->siphash_key.k0,
+ sizeof(info->siphash_key.k0));
+ break;
+ case BCH_STR_HASH_crc64:
+ ctx->crc64 = crc64_be(~0, &info->siphash_key.k0,
+ sizeof(info->siphash_key.k0));
+ break;
+ case BCH_STR_HASH_siphash_old:
+ case BCH_STR_HASH_siphash:
+ SipHash24_Init(&ctx->siphash, &info->siphash_key);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx,
+ const struct bch_hash_info *info,
+ const void *data, size_t len)
+{
+ switch (info->type) {
+ case BCH_STR_HASH_crc32c:
+ ctx->crc32c = crc32c(ctx->crc32c, data, len);
+ break;
+ case BCH_STR_HASH_crc64:
+ ctx->crc64 = crc64_be(ctx->crc64, data, len);
+ break;
+ case BCH_STR_HASH_siphash_old:
+ case BCH_STR_HASH_siphash:
+ SipHash24_Update(&ctx->siphash, data, len);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
+ const struct bch_hash_info *info)
+{
+ switch (info->type) {
+ case BCH_STR_HASH_crc32c:
+ return ctx->crc32c;
+ case BCH_STR_HASH_crc64:
+ return ctx->crc64 >> 1;
+ case BCH_STR_HASH_siphash_old:
+ case BCH_STR_HASH_siphash:
+ return SipHash24_End(&ctx->siphash) >> 1;
+ default:
+ BUG();
+ }
+}
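+
+/*
+ * Example (illustrative only): hashing a name with whichever hash function
+ * the inode's bch_hash_info selects, where 'info' comes from
+ * bch2_hash_info_init() and 'name'/'len' are the string being hashed:
+ *
+ *   struct bch_str_hash_ctx ctx;
+ *
+ *   bch2_str_hash_init(&ctx, info);
+ *   bch2_str_hash_update(&ctx, info, name, len);
+ *   hash = bch2_str_hash_end(&ctx, info);
+ */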
+
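+/*
+ * A bch_hash_desc describes one btree-backed hash table: which btree it lives
+ * in, the key type of its entries, and how to hash and compare lookup keys
+ * and existing entries (dirents and xattrs each define one of these).
+ */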
+struct bch_hash_desc {
+ enum btree_id btree_id;
+ u8 key_type;
+
+ u64 (*hash_key)(const struct bch_hash_info *, const void *);
+ u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c);
+ bool (*cmp_key)(struct bkey_s_c, const void *);
+ bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
+ bool (*is_visible)(subvol_inum inum, struct bkey_s_c);
+};
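
As a hedged sketch of how a descriptor wires the streaming hash above to a
btree (the "example_*" names and the BTREE_ID_dirents/KEY_TYPE_dirent choice
are illustrative placeholders; the real dirent and xattr descriptors are
defined in their own files, not in this header):

	static u64 example_hash_key(const struct bch_hash_info *info, const void *key)
	{
		const struct qstr *name = key;	/* assumption: keys are filenames */
		struct bch_str_hash_ctx ctx;

		bch2_str_hash_init(&ctx, info);
		bch2_str_hash_update(&ctx, info, name->name, name->len);
		return bch2_str_hash_end(&ctx, info);
	}

	/* remaining callbacks assumed to be defined elsewhere: */
	u64  example_hash_bkey(const struct bch_hash_info *, struct bkey_s_c);
	bool example_cmp_key(struct bkey_s_c, const void *);
	bool example_cmp_bkey(struct bkey_s_c, struct bkey_s_c);

	static const struct bch_hash_desc example_hash_desc = {
		.btree_id	= BTREE_ID_dirents,
		.key_type	= KEY_TYPE_dirent,
		.hash_key	= example_hash_key,
		.hash_bkey	= example_hash_bkey,
		.cmp_key	= example_cmp_key,
		.cmp_bkey	= example_cmp_bkey,
	};
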
+
+static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k)
+{
+ return k.k->type == desc.key_type &&
+ (!desc.is_visible ||
+ !inum.inum ||
+ desc.is_visible(inum, k));
+}
+
+static __always_inline int
+bch2_hash_lookup(struct btree_trans *trans,
+ struct btree_iter *iter,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, const void *key,
+ unsigned flags)
+{
+ struct bkey_s_c k;
+ u32 snapshot;
+ int ret;
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
+ SPOS(inum.inum, desc.hash_key(info, key), snapshot),
+ POS(inum.inum, U64_MAX),
+ BTREE_ITER_SLOTS|flags, k, ret) {
+ if (is_visible_key(desc, inum, k)) {
+ if (!desc.cmp_key(k, key))
+ return 0;
+ } else if (k.k->type == KEY_TYPE_hash_whiteout) {
+ ;
+ } else {
+ /* hole, not found */
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, iter);
+
+ return ret ?: -BCH_ERR_ENOENT_str_hash_lookup;
+}
+
+static __always_inline int
+bch2_hash_hole(struct btree_trans *trans,
+ struct btree_iter *iter,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, const void *key)
+{
+ struct bkey_s_c k;
+ u32 snapshot;
+ int ret;
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
+ SPOS(inum.inum, desc.hash_key(info, key), snapshot),
+ POS(inum.inum, U64_MAX),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret)
+ if (!is_visible_key(desc, inum, k))
+ return 0;
+ bch2_trans_iter_exit(trans, iter);
+
+ return ret ?: -BCH_ERR_ENOSPC_str_hash_create;
+}
+
+static __always_inline
+int bch2_hash_needs_whiteout(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct btree_iter *start)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_copy_iter(&iter, start);
+
+ bch2_btree_iter_advance(&iter);
+
+ for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) {
+ if (k.k->type != desc.key_type &&
+ k.k->type != KEY_TYPE_hash_whiteout)
+ break;
+
+ if (k.k->type == desc.key_type &&
+ desc.hash_bkey(info, k) <= start->pos.offset) {
+ ret = 1;
+ break;
+ }
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static __always_inline
+int bch2_hash_set_snapshot(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, u32 snapshot,
+ struct bkey_i *insert,
+ int flags,
+ int update_flags)
+{
+ struct btree_iter iter, slot = { NULL };
+ struct bkey_s_c k;
+ bool found = false;
+ int ret;
+
+ for_each_btree_key_upto_norestart(trans, iter, desc.btree_id,
+ SPOS(insert->k.p.inode,
+ desc.hash_bkey(info, bkey_i_to_s_c(insert)),
+ snapshot),
+ POS(insert->k.p.inode, U64_MAX),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+ if (is_visible_key(desc, inum, k)) {
+ if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
+ goto found;
+
+ /* hash collision: */
+ continue;
+ }
+
+ if (!slot.path &&
+ !(flags & BCH_HASH_SET_MUST_REPLACE))
+ bch2_trans_copy_iter(&slot, &iter);
+
+ if (k.k->type != KEY_TYPE_hash_whiteout)
+ goto not_found;
+ }
+
+ if (!ret)
+ ret = -BCH_ERR_ENOSPC_str_hash_create;
+out:
+ bch2_trans_iter_exit(trans, &slot);
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+found:
+ found = true;
+not_found:
+
+ if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) {
+ ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
+ } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) {
+ ret = -EEXIST;
+ } else {
+ if (!found && slot.path)
+ swap(iter, slot);
+
+ insert->k.p = iter.pos;
+ ret = bch2_trans_update(trans, &iter, insert, 0);
+ }
+
+ goto out;
+}
+
+static __always_inline
+int bch2_hash_set(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum,
+ struct bkey_i *insert, int flags)
+{
+ u32 snapshot;
+ int ret;
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ insert->k.p.inode = inum.inum;
+
+ return bch2_hash_set_snapshot(trans, desc, info, inum,
+ snapshot, insert, flags, 0);
+}
+
+static __always_inline
+int bch2_hash_delete_at(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct btree_iter *iter,
+ unsigned update_flags)
+{
+ struct bkey_i *delete;
+ int ret;
+
+ delete = bch2_trans_kmalloc(trans, sizeof(*delete));
+ ret = PTR_ERR_OR_ZERO(delete);
+ if (ret)
+ return ret;
+
+ ret = bch2_hash_needs_whiteout(trans, desc, info, iter);
+ if (ret < 0)
+ return ret;
+
+ bkey_init(&delete->k);
+ delete->k.p = iter->pos;
+ delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
+
+ return bch2_trans_update(trans, iter, delete, update_flags);
+}
+
+static __always_inline
+int bch2_hash_delete(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, const void *key)
+{
+ struct btree_iter iter;
+ int ret;
+
+ ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
+ BTREE_ITER_INTENT);
+ if (ret)
+ return ret;
+
+ ret = bch2_hash_delete_at(trans, desc, info, &iter, 0);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+#endif /* _BCACHEFS_STR_HASH_H */
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
new file mode 100644
index 000000000000..22b34a8e4d6e
--- /dev/null
+++ b/fs/bcachefs/subvolume.c
@@ -0,0 +1,455 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "errcode.h"
+#include "error.h"
+#include "fs.h"
+#include "snapshot.h"
+#include "subvolume.h"
+
+#include <linux/random.h>
+
+static int bch2_subvolume_delete(struct btree_trans *, u32);
+
+static int check_subvol(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_subvolume subvol;
+ struct bch_snapshot snapshot;
+ unsigned snapid;
+ int ret = 0;
+
+ if (k.k->type != KEY_TYPE_subvolume)
+ return 0;
+
+ subvol = bkey_s_c_to_subvolume(k);
+ snapid = le32_to_cpu(subvol.v->snapshot);
+ ret = bch2_snapshot_lookup(trans, snapid, &snapshot);
+
+ if (bch2_err_matches(ret, ENOENT))
+ bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
+ k.k->p.offset, snapid);
+ if (ret)
+ return ret;
+
+ if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
+ bch2_fs_lazy_rw(c);
+
+ ret = bch2_subvolume_delete(trans, iter->pos.offset);
+ if (ret)
+ bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
+ return ret ?: -BCH_ERR_transaction_restart_nested;
+ }
+
+ if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
+ u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
+ u32 snapshot_tree;
+ struct bch_snapshot_tree st;
+
+ rcu_read_lock();
+ snapshot_tree = snapshot_t(c, snapshot_root)->tree;
+ rcu_read_unlock();
+
+ ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st);
+
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+ "%s: snapshot tree %u not found", __func__, snapshot_tree);
+
+ if (ret)
+ return ret;
+
+ if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset,
+ c, subvol_not_master_and_not_snapshot,
+ "subvolume %llu is not set as snapshot but is not master subvolume",
+ k.k->p.offset)) {
+ struct bkey_i_subvolume *s =
+ bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ return ret;
+
+ SET_BCH_SUBVOLUME_SNAP(&s->v, true);
+ }
+ }
+
+fsck_err:
+ return ret;
+}
+
+int bch2_check_subvols(struct bch_fs *c)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ check_subvol(trans, &iter, k)));
+ if (ret)
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+/* Subvolumes: */
+
+int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags, struct printbuf *err)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
+ bkey_gt(k.k->p, SUBVOL_POS_MAX), c, err,
+ subvol_pos_bad,
+ "invalid pos");
+fsck_err:
+ return ret;
+}
+
+void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+
+ prt_printf(out, "root %llu snapshot id %u",
+ le64_to_cpu(s.v->inode),
+ le32_to_cpu(s.v->snapshot));
+
+ if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent))
+ prt_printf(out, " parent %u", le32_to_cpu(s.v->parent));
+}
+
+static __always_inline int
+bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
+ bool inconsistent_if_not_found,
+ int iter_flags,
+ struct bch_subvolume *s)
+{
+ int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol),
+ iter_flags, subvolume, s);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) &&
+ inconsistent_if_not_found,
+ trans->c, "missing subvolume %u", subvol);
+ return ret;
+}
+
+int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
+ bool inconsistent_if_not_found,
+ int iter_flags,
+ struct bch_subvolume *s)
+{
+ return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s);
+}
+
+int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol)
+{
+ struct bch_subvolume s;
+ int ret = bch2_subvolume_get_inlined(trans, subvol, true, 0, &s);
+ if (ret)
+ return ret;
+
+ if (BCH_SUBVOLUME_RO(&s))
+ return -EROFS;
+ return 0;
+}
+
+int bch2_subvol_is_ro(struct bch_fs *c, u32 subvol)
+{
+ return bch2_trans_do(c, NULL, NULL, 0,
+ bch2_subvol_is_ro_trans(trans, subvol));
+}
+
+int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
+ struct bch_subvolume *subvol)
+{
+ struct bch_snapshot snap;
+
+ return bch2_snapshot_lookup(trans, snapshot, &snap) ?:
+ bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
+}
+
+int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
+ u32 *snapid)
+{
+ struct btree_iter iter;
+ struct bkey_s_c_subvolume subvol;
+ int ret;
+
+ subvol = bch2_bkey_get_iter_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, subvolid),
+ BTREE_ITER_CACHED|BTREE_ITER_WITH_UPDATES,
+ subvolume);
+ ret = bkey_err(subvol);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+ "missing subvolume %u", subvolid);
+
+ if (likely(!ret))
+ *snapid = le32_to_cpu(subvol.v->snapshot);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int bch2_subvolume_reparent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ u32 old_parent, u32 new_parent)
+{
+ struct bkey_i_subvolume *s;
+ int ret;
+
+ if (k.k->type != KEY_TYPE_subvolume)
+ return 0;
+
+ if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) &&
+ le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent)
+ return 0;
+
+ s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ return ret;
+
+ s->v.parent = cpu_to_le32(new_parent);
+ return 0;
+}
+
+/*
+ * Separate from the snapshot tree in the snapshots btree, we record the tree
+ * structure of how snapshot subvolumes were created - the parent subvolume of
+ * each snapshot subvolume.
+ *
+ * When a subvolume is deleted, we scan for child subvolumes and reparent them,
+ * to avoid dangling references:
+ */
+static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_subvolume s;
+
+ return lockrestart_do(trans,
+ bch2_subvolume_get(trans, subvolid_to_delete, true,
+ BTREE_ITER_CACHED, &s)) ?:
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_subvolume_reparent(trans, &iter, k,
+ subvolid_to_delete, le32_to_cpu(s.parent)));
+}
+
+/*
+ * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
+ * deletion/cleanup:
+ */
+static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
+{
+ struct btree_iter iter;
+ struct bkey_s_c_subvolume subvol;
+ u32 snapid;
+ int ret = 0;
+
+ subvol = bch2_bkey_get_iter_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, subvolid),
+ BTREE_ITER_CACHED|BTREE_ITER_INTENT,
+ subvolume);
+ ret = bkey_err(subvol);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+ "missing subvolume %u", subvolid);
+ if (ret)
+ return ret;
+
+ snapid = le32_to_cpu(subvol.v->snapshot);
+
+ ret = bch2_btree_delete_at(trans, &iter, 0) ?:
+ bch2_snapshot_node_set_deleted(trans, snapid);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
+{
+ return bch2_subvolumes_reparent(trans, subvolid) ?:
+ commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ __bch2_subvolume_delete(trans, subvolid));
+}
+
+static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs,
+ snapshot_wait_for_pagecache_and_delete_work);
+ snapshot_id_list s;
+ u32 *id;
+ int ret = 0;
+
+ while (!ret) {
+ mutex_lock(&c->snapshots_unlinked_lock);
+ s = c->snapshots_unlinked;
+ darray_init(&c->snapshots_unlinked);
+ mutex_unlock(&c->snapshots_unlinked_lock);
+
+ if (!s.nr)
+ break;
+
+ bch2_evict_subvolume_inodes(c, &s);
+
+ for (id = s.data; id < s.data + s.nr; id++) {
+ ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
+ if (ret) {
+ bch_err_msg(c, ret, "deleting subvolume %u", *id);
+ break;
+ }
+ }
+
+ darray_exit(&s);
+ }
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
+}
+
+struct subvolume_unlink_hook {
+ struct btree_trans_commit_hook h;
+ u32 subvol;
+};
+
+static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
+ struct btree_trans_commit_hook *_h)
+{
+ struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h);
+ struct bch_fs *c = trans->c;
+ int ret = 0;
+
+ mutex_lock(&c->snapshots_unlinked_lock);
+ if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
+ ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol);
+ mutex_unlock(&c->snapshots_unlinked_lock);
+
+ if (ret)
+ return ret;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache))
+ return -EROFS;
+
+ if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
+ return 0;
+}
+
+int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
+{
+ struct btree_iter iter;
+ struct bkey_i_subvolume *n;
+ struct subvolume_unlink_hook *h;
+ int ret = 0;
+
+ h = bch2_trans_kmalloc(trans, sizeof(*h));
+ ret = PTR_ERR_OR_ZERO(h);
+ if (ret)
+ return ret;
+
+ h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook;
+ h->subvol = subvolid;
+ bch2_trans_commit_hook(trans, &h->h);
+
+ n = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, subvolid),
+ BTREE_ITER_CACHED, subvolume);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (unlikely(ret)) {
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+ "missing subvolume %u", subvolid);
+ return ret;
+ }
+
+ SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
+ u32 src_subvolid,
+ u32 *new_subvolid,
+ u32 *new_snapshotid,
+ bool ro)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
+ struct bkey_i_subvolume *new_subvol = NULL;
+ struct bkey_i_subvolume *src_subvol = NULL;
+ u32 parent = 0, new_nodes[2], snapshot_subvols[2];
+ int ret = 0;
+
+ ret = bch2_bkey_get_empty_slot(trans, &dst_iter,
+ BTREE_ID_subvolumes, POS(0, U32_MAX));
+ if (ret == -BCH_ERR_ENOSPC_btree_slot)
+ ret = -BCH_ERR_ENOSPC_subvolume_create;
+ if (ret)
+ return ret;
+
+ snapshot_subvols[0] = dst_iter.pos.offset;
+ snapshot_subvols[1] = src_subvolid;
+
+ if (src_subvolid) {
+ /* Creating a snapshot: */
+
+ src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter,
+ BTREE_ID_subvolumes, POS(0, src_subvolid),
+ BTREE_ITER_CACHED, subvolume);
+ ret = PTR_ERR_OR_ZERO(src_subvol);
+ if (unlikely(ret)) {
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+ "subvolume %u not found", src_subvolid);
+ goto err;
+ }
+
+ parent = le32_to_cpu(src_subvol->v.snapshot);
+ }
+
+ ret = bch2_snapshot_node_create(trans, parent, new_nodes,
+ snapshot_subvols,
+ src_subvolid ? 2 : 1);
+ if (ret)
+ goto err;
+
+ if (src_subvolid) {
+ src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
+ ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
+ if (ret)
+ goto err;
+ }
+
+ new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(new_subvol);
+ if (ret)
+ goto err;
+
+ new_subvol->v.flags = 0;
+ new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
+ new_subvol->v.inode = cpu_to_le64(inode);
+ new_subvol->v.parent = cpu_to_le32(src_subvolid);
+ new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c));
+ new_subvol->v.otime.hi = 0;
+
+ SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
+ SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
+
+ *new_subvolid = new_subvol->k.p.offset;
+ *new_snapshotid = new_nodes[0];
+err:
+ bch2_trans_iter_exit(trans, &src_iter);
+ bch2_trans_iter_exit(trans, &dst_iter);
+ return ret;
+}
+
+int bch2_fs_subvolumes_init(struct bch_fs *c)
+{
+ INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
+ INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
+ bch2_subvolume_wait_for_pagecache_and_delete);
+ mutex_init(&c->snapshots_unlinked_lock);
+ return 0;
+}
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
new file mode 100644
index 000000000000..a6f56f66e27c
--- /dev/null
+++ b/fs/bcachefs/subvolume.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_H
+#define _BCACHEFS_SUBVOLUME_H
+
+#include "darray.h"
+#include "subvolume_types.h"
+
+enum bkey_invalid_flags;
+
+int bch2_check_subvols(struct bch_fs *);
+
+int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \
+ .key_invalid = bch2_subvolume_invalid, \
+ .val_to_text = bch2_subvolume_to_text, \
+ .min_val_size = 16, \
+})
+
+int bch2_subvolume_get(struct btree_trans *, unsigned,
+ bool, int, struct bch_subvolume *);
+int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
+
+int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
+int bch2_subvol_is_ro(struct bch_fs *, u32);
+
+int bch2_delete_dead_snapshots(struct bch_fs *);
+void bch2_delete_dead_snapshots_async(struct bch_fs *);
+
+int bch2_subvolume_unlink(struct btree_trans *, u32);
+int bch2_subvolume_create(struct btree_trans *, u64, u32,
+ u32 *, u32 *, bool);
+
+int bch2_fs_subvolumes_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_SUBVOLUME_H */
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
new file mode 100644
index 000000000000..2d2e66a4e468
--- /dev/null
+++ b/fs/bcachefs/subvolume_types.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
+#define _BCACHEFS_SUBVOLUME_TYPES_H
+
+#include "darray.h"
+
+typedef DARRAY(u32) snapshot_id_list;
+
+#define IS_ANCESTOR_BITMAP 128
+
+struct snapshot_t {
+ u32 parent;
+ u32 skip[3];
+ u32 depth;
+ u32 children[2];
+ u32 subvol; /* Nonzero only if a subvolume points to this node: */
+ u32 tree;
+ u32 equiv;
+ unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)];
+};
+
+struct snapshot_table {
+ DECLARE_FLEX_ARRAY(struct snapshot_t, s);
+};
+
+typedef struct {
+ u32 subvol;
+ u64 inum;
+} subvol_inum;
+
+#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */
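
A brief usage sketch for snapshot_id_list, mirroring the darray and
snapshot_list_* calls made from subvolume.c above (c and id are assumed to be
a filesystem pointer and a snapshot ID; snapshot_list_add()/has_id() are
declared in snapshot.h):

	snapshot_id_list s;
	int ret = 0;

	darray_init(&s);

	if (!snapshot_list_has_id(&s, id))
		ret = snapshot_list_add(c, &s, id);	/* allocation may fail */

	/* ... walk s.data[0..s.nr) ... */

	darray_exit(&s);
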
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
new file mode 100644
index 000000000000..4c98d8cc2a79
--- /dev/null
+++ b/fs/bcachefs/super-io.c
@@ -0,0 +1,1353 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "checksum.h"
+#include "counters.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_sb.h"
+#include "journal_seq_blacklist.h"
+#include "recovery.h"
+#include "replicas.h"
+#include "quota.h"
+#include "sb-clean.h"
+#include "sb-downgrade.h"
+#include "sb-errors.h"
+#include "sb-members.h"
+#include "super-io.h"
+#include "super.h"
+#include "trace.h"
+#include "vstructs.h"
+
+#include <linux/backing-dev.h>
+#include <linux/sort.h>
+
+static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
+};
+
+struct bch2_metadata_version {
+ u16 version;
+ const char *name;
+ u64 recovery_passes;
+};
+
+static const struct bch2_metadata_version bch2_metadata_versions[] = {
+#define x(n, v, _recovery_passes) { \
+ .version = v, \
+ .name = #n, \
+ .recovery_passes = _recovery_passes, \
+},
+ BCH_METADATA_VERSIONS()
+#undef x
+};
+
+void bch2_version_to_text(struct printbuf *out, unsigned v)
+{
+ const char *str = "(unknown version)";
+
+ for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++)
+ if (bch2_metadata_versions[i].version == v) {
+ str = bch2_metadata_versions[i].name;
+ break;
+ }
+
+ prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str);
+}
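
A caller-side usage sketch (assuming a struct bch_sb *sb in scope): the
version printer is driven through a printbuf, the same pattern this file uses
later for error reporting:

	struct printbuf buf = PRINTBUF;

	bch2_version_to_text(&buf, le16_to_cpu(sb->version));
	pr_info("bcachefs: superblock version %s\n", buf.buf);
	printbuf_exit(&buf);
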
+
+unsigned bch2_latest_compatible_version(unsigned v)
+{
+ if (!BCH_VERSION_MAJOR(v))
+ return v;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++)
+ if (bch2_metadata_versions[i].version > v &&
+ BCH_VERSION_MAJOR(bch2_metadata_versions[i].version) ==
+ BCH_VERSION_MAJOR(v))
+ v = bch2_metadata_versions[i].version;
+
+ return v;
+}
+
+u64 bch2_upgrade_recovery_passes(struct bch_fs *c,
+ unsigned old_version,
+ unsigned new_version)
+{
+ u64 ret = 0;
+
+ for (const struct bch2_metadata_version *i = bch2_metadata_versions;
+ i < bch2_metadata_versions + ARRAY_SIZE(bch2_metadata_versions);
+ i++)
+ if (i->version > old_version && i->version <= new_version) {
+ if (i->recovery_passes & RECOVERY_PASS_ALL_FSCK)
+ ret |= bch2_fsck_recovery_passes();
+ ret |= i->recovery_passes;
+ }
+
+	return ret & ~RECOVERY_PASS_ALL_FSCK;
+}
+
+const char * const bch2_sb_fields[] = {
+#define x(name, nr) #name,
+ BCH_SB_FIELDS()
+#undef x
+ NULL
+};
+
+static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
+ struct printbuf *);
+
+struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb,
+ enum bch_sb_field_type type)
+{
+ struct bch_sb_field *f;
+
+ /* XXX: need locking around superblock to access optional fields */
+
+ vstruct_for_each(sb, f)
+ if (le32_to_cpu(f->type) == type)
+ return f;
+ return NULL;
+}
+
+static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb,
+ struct bch_sb_field *f,
+ unsigned u64s)
+{
+ unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+ unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s;
+
+ BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size);
+
+ if (!f && !u64s) {
+ /* nothing to do: */
+ } else if (!f) {
+ f = vstruct_last(sb->sb);
+ memset(f, 0, sizeof(u64) * u64s);
+ f->u64s = cpu_to_le32(u64s);
+ f->type = 0;
+ } else {
+ void *src, *dst;
+
+ src = vstruct_end(f);
+
+ if (u64s) {
+ f->u64s = cpu_to_le32(u64s);
+ dst = vstruct_end(f);
+ } else {
+ dst = f;
+ }
+
+ memmove(dst, src, vstruct_end(sb->sb) - src);
+
+ if (dst > src)
+ memset(src, 0, dst - src);
+ }
+
+ sb->sb->u64s = cpu_to_le32(sb_u64s);
+
+ return u64s ? f : NULL;
+}
+
+void bch2_sb_field_delete(struct bch_sb_handle *sb,
+ enum bch_sb_field_type type)
+{
+ struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
+
+ if (f)
+ __bch2_sb_field_resize(sb, f, 0);
+}
+
+/* Superblock realloc/free: */
+
+void bch2_free_super(struct bch_sb_handle *sb)
+{
+ kfree(sb->bio);
+ if (!IS_ERR_OR_NULL(sb->bdev))
+ blkdev_put(sb->bdev, sb->holder);
+ kfree(sb->holder);
+ kfree(sb->sb_name);
+
+ kfree(sb->sb);
+ memset(sb, 0, sizeof(*sb));
+}
+
+int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
+{
+ size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s);
+ size_t new_buffer_size;
+ struct bch_sb *new_sb;
+ struct bio *bio;
+
+ if (sb->bdev)
+ new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev));
+
+ new_buffer_size = roundup_pow_of_two(new_bytes);
+
+ if (sb->sb && sb->buffer_size >= new_buffer_size)
+ return 0;
+
+ if (sb->sb && sb->have_layout) {
+ u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
+
+ if (new_bytes > max_bytes) {
+ pr_err("%pg: superblock too big: want %zu but have %llu",
+ sb->bdev, new_bytes, max_bytes);
+ return -BCH_ERR_ENOSPC_sb;
+ }
+ }
+
+ if (sb->buffer_size >= new_buffer_size && sb->sb)
+ return 0;
+
+ if (dynamic_fault("bcachefs:add:super_realloc"))
+ return -BCH_ERR_ENOMEM_sb_realloc_injected;
+
+ new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO);
+ if (!new_sb)
+ return -BCH_ERR_ENOMEM_sb_buf_realloc;
+
+ sb->sb = new_sb;
+
+ if (sb->have_bio) {
+ unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size);
+
+ bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
+ if (!bio)
+ return -BCH_ERR_ENOMEM_sb_bio_realloc;
+
+ bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0);
+
+ kfree(sb->bio);
+ sb->bio = bio;
+ }
+
+ sb->buffer_size = new_buffer_size;
+
+ return 0;
+}
+
+struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
+ enum bch_sb_field_type type,
+ unsigned u64s)
+{
+ struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
+ ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+ ssize_t d = -old_u64s + u64s;
+
+ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
+ return NULL;
+
+ if (sb->fs_sb) {
+ struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb);
+ struct bch_dev *ca;
+ unsigned i;
+
+ lockdep_assert_held(&c->sb_lock);
+
+		/* XXX: we're not checking that offline devices have enough space */
+
+ for_each_online_member(ca, c, i) {
+ struct bch_sb_handle *dev_sb = &ca->disk_sb;
+
+ if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
+ percpu_ref_put(&ca->ref);
+ return NULL;
+ }
+ }
+ }
+
+ f = bch2_sb_field_get_id(sb->sb, type);
+ f = __bch2_sb_field_resize(sb, f, u64s);
+ if (f)
+ f->type = cpu_to_le32(type);
+ return f;
+}
+
+struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *sb,
+ enum bch_sb_field_type type,
+ unsigned u64s)
+{
+ struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
+
+ if (!f || le32_to_cpu(f->u64s) < u64s)
+ f = bch2_sb_field_resize_id(sb, type, u64s);
+ return f;
+}
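
Callers normally go through the typed wrapper declared in super-io.h below; a
sketch, with the section name and size purely illustrative and c->sb_lock
assumed held:

	struct bch_sb_field_ext *ext =
		bch2_sb_field_get_minsize(&c->disk_sb, ext, 16);
	if (!ext)
		return -BCH_ERR_ENOSPC_sb;
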
+
+/* Superblock validate: */
+
+static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out)
+{
+ u64 offset, prev_offset, max_sectors;
+ unsigned i;
+
+ BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
+
+ if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) &&
+ !uuid_equal(&layout->magic, &BCHFS_MAGIC)) {
+ prt_printf(out, "Not a bcachefs superblock layout");
+ return -BCH_ERR_invalid_sb_layout;
+ }
+
+ if (layout->layout_type != 0) {
+ prt_printf(out, "Invalid superblock layout type %u",
+ layout->layout_type);
+ return -BCH_ERR_invalid_sb_layout_type;
+ }
+
+ if (!layout->nr_superblocks) {
+ prt_printf(out, "Invalid superblock layout: no superblocks");
+ return -BCH_ERR_invalid_sb_layout_nr_superblocks;
+ }
+
+ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) {
+ prt_printf(out, "Invalid superblock layout: too many superblocks");
+ return -BCH_ERR_invalid_sb_layout_nr_superblocks;
+ }
+
+ max_sectors = 1 << layout->sb_max_size_bits;
+
+ prev_offset = le64_to_cpu(layout->sb_offset[0]);
+
+ for (i = 1; i < layout->nr_superblocks; i++) {
+ offset = le64_to_cpu(layout->sb_offset[i]);
+
+ if (offset < prev_offset + max_sectors) {
+ prt_printf(out, "Invalid superblock layout: superblocks overlap\n"
+ " (sb %u ends at %llu next starts at %llu",
+				   "  (sb %u ends at %llu, next starts at %llu)",
+ return -BCH_ERR_invalid_sb_layout_superblocks_overlap;
+ }
+ prev_offset = offset;
+ }
+
+ return 0;
+}
+
+static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
+{
+ u16 version = le16_to_cpu(sb->version);
+ u16 version_min = le16_to_cpu(sb->version_min);
+
+ if (!bch2_version_compatible(version)) {
+ prt_str(out, "Unsupported superblock version ");
+ bch2_version_to_text(out, version);
+ prt_str(out, " (min ");
+ bch2_version_to_text(out, bcachefs_metadata_version_min);
+ prt_str(out, ", max ");
+ bch2_version_to_text(out, bcachefs_metadata_version_current);
+ prt_str(out, ")");
+ return -BCH_ERR_invalid_sb_version;
+ }
+
+ if (!bch2_version_compatible(version_min)) {
+ prt_str(out, "Unsupported superblock version_min ");
+ bch2_version_to_text(out, version_min);
+ prt_str(out, " (min ");
+ bch2_version_to_text(out, bcachefs_metadata_version_min);
+ prt_str(out, ", max ");
+ bch2_version_to_text(out, bcachefs_metadata_version_current);
+ prt_str(out, ")");
+ return -BCH_ERR_invalid_sb_version;
+ }
+
+ if (version_min > version) {
+ prt_str(out, "Bad minimum version ");
+ bch2_version_to_text(out, version_min);
+ prt_str(out, ", greater than version field ");
+ bch2_version_to_text(out, version);
+ return -BCH_ERR_invalid_sb_version;
+ }
+
+ return 0;
+}
+
+static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
+ int rw)
+{
+ struct bch_sb *sb = disk_sb->sb;
+ struct bch_sb_field *f;
+ struct bch_sb_field_members_v1 *mi;
+ enum bch_opt_id opt_id;
+ u16 block_size;
+ int ret;
+
+ ret = bch2_sb_compatible(sb, out);
+ if (ret)
+ return ret;
+
+ if (sb->features[1] ||
+ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) {
+ prt_printf(out, "Filesystem has incompatible features");
+ return -BCH_ERR_invalid_sb_features;
+ }
+
+ block_size = le16_to_cpu(sb->block_size);
+
+ if (block_size > PAGE_SECTORS) {
+ prt_printf(out, "Block size too big (got %u, max %u)",
+ block_size, PAGE_SECTORS);
+ return -BCH_ERR_invalid_sb_block_size;
+ }
+
+ if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
+ prt_printf(out, "Bad user UUID (got zeroes)");
+ return -BCH_ERR_invalid_sb_uuid;
+ }
+
+ if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) {
+ prt_printf(out, "Bad internal UUID (got zeroes)");
+ return -BCH_ERR_invalid_sb_uuid;
+ }
+
+ if (!sb->nr_devices ||
+ sb->nr_devices > BCH_SB_MEMBERS_MAX) {
+ prt_printf(out, "Bad number of member devices %u (max %u)",
+ sb->nr_devices, BCH_SB_MEMBERS_MAX);
+ return -BCH_ERR_invalid_sb_too_many_members;
+ }
+
+ if (sb->dev_idx >= sb->nr_devices) {
+ prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)",
+ sb->dev_idx, sb->nr_devices);
+ return -BCH_ERR_invalid_sb_dev_idx;
+ }
+
+ if (!sb->time_precision ||
+ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) {
+ prt_printf(out, "Invalid time precision: %u (min 1, max %lu)",
+ le32_to_cpu(sb->time_precision), NSEC_PER_SEC);
+ return -BCH_ERR_invalid_sb_time_precision;
+ }
+
+ if (rw == READ) {
+ /*
+ * Been seeing a bug where these are getting inexplicably
+ * zeroed, so we're now validating them, but we have to be
+		 * careful not to prevent people's filesystems from mounting:
+ */
+ if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
+ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
+ if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
+ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000);
+
+ if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb))
+ SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version));
+ }
+
+ for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
+ const struct bch_option *opt = bch2_opt_table + opt_id;
+
+ if (opt->get_sb != BCH2_NO_SB_OPT) {
+ u64 v = bch2_opt_from_sb(sb, opt_id);
+
+ prt_printf(out, "Invalid option ");
+ ret = bch2_opt_validate(opt, v, out);
+ if (ret)
+ return ret;
+
+ printbuf_reset(out);
+ }
+ }
+
+ /* validate layout */
+ ret = validate_sb_layout(&sb->layout, out);
+ if (ret)
+ return ret;
+
+ vstruct_for_each(sb, f) {
+ if (!f->u64s) {
+ prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)",
+ le32_to_cpu(f->type));
+ return -BCH_ERR_invalid_sb_field_size;
+ }
+
+ if (vstruct_next(f) > vstruct_last(sb)) {
+ prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)",
+ le32_to_cpu(f->type));
+ return -BCH_ERR_invalid_sb_field_size;
+ }
+ }
+
+ /* members must be validated first: */
+ mi = bch2_sb_field_get(sb, members_v1);
+ if (!mi) {
+ prt_printf(out, "Invalid superblock: member info area missing");
+ return -BCH_ERR_invalid_sb_members_missing;
+ }
+
+ ret = bch2_sb_field_validate(sb, &mi->field, out);
+ if (ret)
+ return ret;
+
+ vstruct_for_each(sb, f) {
+ if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1)
+ continue;
+
+ ret = bch2_sb_field_validate(sb, f, out);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* device open: */
+
+static unsigned long le_ulong_to_cpu(unsigned long v)
+{
+ return sizeof(unsigned long) == 8
+ ? le64_to_cpu(v)
+ : le32_to_cpu(v);
+}
+
+static void le_bitvector_to_cpu(unsigned long *dst, unsigned long *src, unsigned nr)
+{
+ BUG_ON(nr & (BITS_PER_TYPE(long) - 1));
+
+ for (unsigned i = 0; i < BITS_TO_LONGS(nr); i++)
+ dst[i] = le_ulong_to_cpu(src[i]);
+}
+
+static void bch2_sb_update(struct bch_fs *c)
+{
+ struct bch_sb *src = c->disk_sb.sb;
+ struct bch_dev *ca;
+ unsigned i;
+
+ lockdep_assert_held(&c->sb_lock);
+
+ c->sb.uuid = src->uuid;
+ c->sb.user_uuid = src->user_uuid;
+ c->sb.version = le16_to_cpu(src->version);
+ c->sb.version_min = le16_to_cpu(src->version_min);
+ c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src);
+ c->sb.nr_devices = src->nr_devices;
+ c->sb.clean = BCH_SB_CLEAN(src);
+ c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src);
+
+ c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision);
+ c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit;
+
+ /* XXX this is wrong, we need a 96 or 128 bit integer type */
+ c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo),
+ c->sb.nsec_per_time_unit);
+ c->sb.time_base_hi = le32_to_cpu(src->time_base_hi);
+
+ c->sb.features = le64_to_cpu(src->features[0]);
+ c->sb.compat = le64_to_cpu(src->compat[0]);
+
+ memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent));
+
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext);
+ if (ext)
+ le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
+ sizeof(c->sb.errors_silent) * 8);
+
+ for_each_member_device(ca, c, i) {
+ struct bch_member m = bch2_sb_member_get(src, ca->dev_idx);
+ ca->mi = bch2_mi_to_cpu(&m);
+ }
+}
+
+static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
+{
+ struct bch_sb_field *src_f, *dst_f;
+ struct bch_sb *dst = dst_handle->sb;
+ unsigned i;
+
+ dst->version = src->version;
+ dst->version_min = src->version_min;
+ dst->seq = src->seq;
+ dst->uuid = src->uuid;
+ dst->user_uuid = src->user_uuid;
+ memcpy(dst->label, src->label, sizeof(dst->label));
+
+ dst->block_size = src->block_size;
+ dst->nr_devices = src->nr_devices;
+
+ dst->time_base_lo = src->time_base_lo;
+ dst->time_base_hi = src->time_base_hi;
+ dst->time_precision = src->time_precision;
+
+ memcpy(dst->flags, src->flags, sizeof(dst->flags));
+ memcpy(dst->features, src->features, sizeof(dst->features));
+ memcpy(dst->compat, src->compat, sizeof(dst->compat));
+
+ for (i = 0; i < BCH_SB_FIELD_NR; i++) {
+ int d;
+
+ if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS)
+ continue;
+
+ src_f = bch2_sb_field_get_id(src, i);
+ dst_f = bch2_sb_field_get_id(dst, i);
+
+ d = (src_f ? le32_to_cpu(src_f->u64s) : 0) -
+ (dst_f ? le32_to_cpu(dst_f->u64s) : 0);
+ if (d > 0) {
+ int ret = bch2_sb_realloc(dst_handle,
+ le32_to_cpu(dst_handle->sb->u64s) + d);
+
+ if (ret)
+ return ret;
+
+ dst = dst_handle->sb;
+ dst_f = bch2_sb_field_get_id(dst, i);
+ }
+
+ dst_f = __bch2_sb_field_resize(dst_handle, dst_f,
+ src_f ? le32_to_cpu(src_f->u64s) : 0);
+
+ if (src_f)
+ memcpy(dst_f, src_f, vstruct_bytes(src_f));
+ }
+
+ return 0;
+}
+
+int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
+{
+ int ret;
+
+ lockdep_assert_held(&c->sb_lock);
+
+ ret = bch2_sb_realloc(&c->disk_sb, 0) ?:
+ __copy_super(&c->disk_sb, src) ?:
+ bch2_sb_replicas_to_cpu_replicas(c) ?:
+ bch2_sb_disk_groups_to_cpu(c);
+ if (ret)
+ return ret;
+
+ bch2_sb_update(c);
+ return 0;
+}
+
+int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
+{
+ return __copy_super(&ca->disk_sb, c->disk_sb.sb);
+}
+
+/* read superblock: */
+
+static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err)
+{
+ struct bch_csum csum;
+ size_t bytes;
+ int ret;
+reread:
+ bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
+ sb->bio->bi_iter.bi_sector = offset;
+ bch2_bio_map(sb->bio, sb->sb, sb->buffer_size);
+
+ ret = submit_bio_wait(sb->bio);
+ if (ret) {
+ prt_printf(err, "IO error: %i", ret);
+ return ret;
+ }
+
+ if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) &&
+ !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) {
+ prt_printf(err, "Not a bcachefs superblock");
+ return -BCH_ERR_invalid_sb_magic;
+ }
+
+ ret = bch2_sb_compatible(sb->sb, err);
+ if (ret)
+ return ret;
+
+ bytes = vstruct_bytes(sb->sb);
+
+ if (bytes > 512 << sb->sb->layout.sb_max_size_bits) {
+ prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)",
+ bytes, 512UL << sb->sb->layout.sb_max_size_bits);
+ return -BCH_ERR_invalid_sb_too_big;
+ }
+
+ if (bytes > sb->buffer_size) {
+ ret = bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s));
+ if (ret)
+ return ret;
+ goto reread;
+ }
+
+ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) {
+ prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
+ return -BCH_ERR_invalid_sb_csum_type;
+ }
+
+ /* XXX: verify MACs */
+ csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
+ null_nonce(), sb->sb);
+
+ if (bch2_crc_cmp(csum, sb->sb->csum)) {
+ prt_printf(err, "bad checksum");
+ return -BCH_ERR_invalid_sb_csum;
+ }
+
+ sb->seq = le64_to_cpu(sb->sb->seq);
+
+ return 0;
+}
+
+int bch2_read_super(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb)
+{
+ u64 offset = opt_get(*opts, sb);
+ struct bch_sb_layout layout;
+ struct printbuf err = PRINTBUF;
+ __le64 *i;
+ int ret;
+#ifndef __KERNEL__
+retry:
+#endif
+ memset(sb, 0, sizeof(*sb));
+ sb->mode = BLK_OPEN_READ;
+ sb->have_bio = true;
+ sb->holder = kmalloc(1, GFP_KERNEL);
+ if (!sb->holder)
+ return -ENOMEM;
+
+ sb->sb_name = kstrdup(path, GFP_KERNEL);
+ if (!sb->sb_name)
+ return -ENOMEM;
+
+#ifndef __KERNEL__
+ if (opt_get(*opts, direct_io) == false)
+ sb->mode |= BLK_OPEN_BUFFERED;
+#endif
+
+ if (!opt_get(*opts, noexcl))
+ sb->mode |= BLK_OPEN_EXCL;
+
+ if (!opt_get(*opts, nochanges))
+ sb->mode |= BLK_OPEN_WRITE;
+
+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (IS_ERR(sb->bdev) &&
+ PTR_ERR(sb->bdev) == -EACCES &&
+ opt_get(*opts, read_only)) {
+ sb->mode &= ~BLK_OPEN_WRITE;
+
+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (!IS_ERR(sb->bdev))
+ opt_set(*opts, nochanges, true);
+ }
+
+ if (IS_ERR(sb->bdev)) {
+ ret = PTR_ERR(sb->bdev);
+ goto out;
+ }
+
+ ret = bch2_sb_realloc(sb, 0);
+ if (ret) {
+ prt_printf(&err, "error allocating memory for superblock");
+ goto err;
+ }
+
+ if (bch2_fs_init_fault("read_super")) {
+ prt_printf(&err, "dynamic fault");
+ ret = -EFAULT;
+ goto err;
+ }
+
+ ret = read_one_super(sb, offset, &err);
+ if (!ret)
+ goto got_super;
+
+ if (opt_defined(*opts, sb))
+ goto err;
+
+ printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n",
+ path, err.buf);
+ printbuf_reset(&err);
+
+ /*
+ * Error reading primary superblock - read location of backup
+ * superblocks:
+ */
+ bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
+ sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
+ /*
+ * use sb buffer to read layout, since sb buffer is page aligned but
+ * layout won't be:
+ */
+ bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout));
+
+ ret = submit_bio_wait(sb->bio);
+ if (ret) {
+ prt_printf(&err, "IO error: %i", ret);
+ goto err;
+ }
+
+ memcpy(&layout, sb->sb, sizeof(layout));
+ ret = validate_sb_layout(&layout, &err);
+ if (ret)
+ goto err;
+
+ for (i = layout.sb_offset;
+ i < layout.sb_offset + layout.nr_superblocks; i++) {
+ offset = le64_to_cpu(*i);
+
+ if (offset == opt_get(*opts, sb))
+ continue;
+
+ ret = read_one_super(sb, offset, &err);
+ if (!ret)
+ goto got_super;
+ }
+
+ goto err;
+
+got_super:
+ if (le16_to_cpu(sb->sb->block_size) << 9 <
+ bdev_logical_block_size(sb->bdev) &&
+ opt_get(*opts, direct_io)) {
+#ifndef __KERNEL__
+ opt_set(*opts, direct_io, false);
+ bch2_free_super(sb);
+ goto retry;
+#endif
+ prt_printf(&err, "block size (%u) smaller than device block size (%u)",
+ le16_to_cpu(sb->sb->block_size) << 9,
+ bdev_logical_block_size(sb->bdev));
+ ret = -BCH_ERR_block_size_too_small;
+ goto err;
+ }
+
+ ret = 0;
+ sb->have_layout = true;
+
+ ret = bch2_sb_validate(sb, &err, READ);
+ if (ret) {
+ printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
+ path, err.buf);
+ goto err_no_print;
+ }
+out:
+ printbuf_exit(&err);
+ return ret;
+err:
+ printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
+ path, err.buf);
+err_no_print:
+ bch2_free_super(sb);
+ goto out;
+}
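
From the caller's side, the read path looks roughly like this (a sketch: the
device path is illustrative, and bch2_opts_empty() is assumed to be the usual
empty-options helper from opts.h):

	struct bch_sb_handle sb;
	struct bch_opts opts = bch2_opts_empty();
	int ret = bch2_read_super("/dev/sdb", &opts, &sb);

	if (ret)
		return ret;

	/* sb.sb is now validated and ready to hand to bch2_sb_to_fs() */
	bch2_free_super(&sb);
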
+
+/* write superblock: */
+
+static void write_super_endio(struct bio *bio)
+{
+ struct bch_dev *ca = bio->bi_private;
+
+ /* XXX: return errors directly */
+
+ if (bch2_dev_io_err_on(bio->bi_status, ca,
+ bio_data_dir(bio)
+ ? BCH_MEMBER_ERROR_write
+ : BCH_MEMBER_ERROR_read,
+ "superblock %s error: %s",
+ bio_data_dir(bio) ? "write" : "read",
+ bch2_blk_status_to_str(bio->bi_status)))
+ ca->sb_write_error = 1;
+
+ closure_put(&ca->fs->sb_write);
+ percpu_ref_put(&ca->io_ref);
+}
+
+static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct bch_sb *sb = ca->disk_sb.sb;
+ struct bio *bio = ca->disk_sb.bio;
+
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META);
+ bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]);
+ bio->bi_end_io = write_super_endio;
+ bio->bi_private = ca;
+ bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE);
+
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb],
+ bio_sectors(bio));
+
+ percpu_ref_get(&ca->io_ref);
+ closure_bio_submit(bio, &c->sb_write);
+}
+
+static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
+{
+ struct bch_sb *sb = ca->disk_sb.sb;
+ struct bio *bio = ca->disk_sb.bio;
+
+ sb->offset = sb->layout.sb_offset[idx];
+
+ SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false));
+ sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
+ null_nonce(), sb);
+
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
+ bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
+ bio->bi_end_io = write_super_endio;
+ bio->bi_private = ca;
+ bch2_bio_map(bio, sb,
+ roundup((size_t) vstruct_bytes(sb),
+ bdev_logical_block_size(ca->disk_sb.bdev)));
+
+ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb],
+ bio_sectors(bio));
+
+ percpu_ref_get(&ca->io_ref);
+ closure_bio_submit(bio, &c->sb_write);
+}
+
+int bch2_write_super(struct bch_fs *c)
+{
+ struct closure *cl = &c->sb_write;
+ struct bch_dev *ca;
+ struct printbuf err = PRINTBUF;
+ unsigned i, sb = 0, nr_wrote;
+ struct bch_devs_mask sb_written;
+ bool wrote, can_mount_without_written, can_mount_with_written;
+ unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
+ int ret = 0;
+
+ trace_and_count(c, write_super, c, _RET_IP_);
+
+ if (c->opts.very_degraded)
+ degraded_flags |= BCH_FORCE_IF_LOST;
+
+ lockdep_assert_held(&c->sb_lock);
+
+ closure_init_stack(cl);
+ memset(&sb_written, 0, sizeof(sb_written));
+
+ /* Make sure we're using the new magic numbers: */
+ c->disk_sb.sb->magic = BCHFS_MAGIC;
+ c->disk_sb.sb->layout.magic = BCHFS_MAGIC;
+
+ le64_add_cpu(&c->disk_sb.sb->seq, 1);
+
+ if (test_bit(BCH_FS_ERROR, &c->flags))
+ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
+ if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags))
+ SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1);
+
+ SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
+
+ bch2_sb_counters_from_cpu(c);
+ bch2_sb_members_from_cpu(c);
+ bch2_sb_members_cpy_v2_v1(&c->disk_sb);
+ bch2_sb_errors_from_cpu(c);
+ bch2_sb_downgrade_update(c);
+
+ for_each_online_member(ca, c, i)
+ bch2_sb_from_fs(c, ca);
+
+ for_each_online_member(ca, c, i) {
+ printbuf_reset(&err);
+
+ ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE);
+ if (ret) {
+ bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
+ percpu_ref_put(&ca->io_ref);
+ goto out;
+ }
+ }
+
+ if (c->opts.nochanges)
+ goto out;
+
+ /*
+ * Defer writing the superblock until filesystem initialization is
+ * complete - don't write out a partly initialized superblock:
+ */
+ if (!BCH_SB_INITIALIZED(c->disk_sb.sb))
+ goto out;
+
+ for_each_online_member(ca, c, i) {
+ __set_bit(ca->dev_idx, sb_written.d);
+ ca->sb_write_error = 0;
+ }
+
+ for_each_online_member(ca, c, i)
+ read_back_super(c, ca);
+ closure_sync(cl);
+
+ for_each_online_member(ca, c, i) {
+ if (ca->sb_write_error)
+ continue;
+
+ if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
+ bch2_fs_fatal_error(c,
+ "Superblock write was silently dropped! (seq %llu expected %llu)",
+ le64_to_cpu(ca->sb_read_scratch->seq),
+ ca->disk_sb.seq);
+ percpu_ref_put(&ca->io_ref);
+ ret = -BCH_ERR_erofs_sb_err;
+ goto out;
+ }
+
+ if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
+ bch2_fs_fatal_error(c,
+ "Superblock modified by another process (seq %llu expected %llu)",
+ le64_to_cpu(ca->sb_read_scratch->seq),
+ ca->disk_sb.seq);
+ percpu_ref_put(&ca->io_ref);
+ ret = -BCH_ERR_erofs_sb_err;
+ goto out;
+ }
+ }
+
+ do {
+ wrote = false;
+ for_each_online_member(ca, c, i)
+ if (!ca->sb_write_error &&
+ sb < ca->disk_sb.sb->layout.nr_superblocks) {
+ write_one_super(c, ca, sb);
+ wrote = true;
+ }
+ closure_sync(cl);
+ sb++;
+ } while (wrote);
+
+ for_each_online_member(ca, c, i) {
+ if (ca->sb_write_error)
+ __clear_bit(ca->dev_idx, sb_written.d);
+ else
+ ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq);
+ }
+
+ nr_wrote = dev_mask_nr(&sb_written);
+
+ can_mount_with_written =
+ bch2_have_enough_devs(c, sb_written, degraded_flags, false);
+
+ for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
+ sb_written.d[i] = ~sb_written.d[i];
+
+ can_mount_without_written =
+ bch2_have_enough_devs(c, sb_written, degraded_flags, false);
+
+ /*
+ * If we would be able to mount _without_ the devices we successfully
+ * wrote superblocks to, we weren't able to write to enough devices:
+ *
+	 * Exception: if we could mount without the successful writes only
+	 * because we haven't written anything yet (new filesystem), continue
+	 * as long as we'd be able to mount with the devices we did
+	 * successfully write to:
+ */
+ if (bch2_fs_fatal_err_on(!nr_wrote ||
+ !can_mount_with_written ||
+ (can_mount_without_written &&
+ !can_mount_with_written), c,
+ "Unable to write superblock to sufficient devices (from %ps)",
+ (void *) _RET_IP_))
+ ret = -1;
+out:
+ /* Make new options visible after they're persistent: */
+ bch2_sb_update(c);
+ printbuf_exit(&err);
+ return ret;
+}
+
+void __bch2_check_set_feature(struct bch_fs *c, unsigned feat)
+{
+ mutex_lock(&c->sb_lock);
+ if (!(c->sb.features & (1ULL << feat))) {
+ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat);
+
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+}
+
+/* Downgrade if superblock is at a higher version than currently supported: */
+bool bch2_check_version_downgrade(struct bch_fs *c)
+{
+ bool ret = bcachefs_metadata_version_current < c->sb.version;
+
+ lockdep_assert_held(&c->sb_lock);
+
+ /*
+ * Downgrade, if superblock is at a higher version than currently
+ * supported:
+ */
+ if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current)
+ SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
+ if (c->sb.version > bcachefs_metadata_version_current)
+ c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
+ if (c->sb.version_min > bcachefs_metadata_version_current)
+ c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current);
+ c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1);
+ return ret;
+}
+
+void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version)
+{
+ lockdep_assert_held(&c->sb_lock);
+
+ if (BCH_VERSION_MAJOR(new_version) >
+ BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
+ bch2_sb_field_resize(&c->disk_sb, downgrade, 0);
+
+ c->disk_sb.sb->version = cpu_to_le16(new_version);
+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
+}
+
+static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ if (vstruct_bytes(f) < 88) {
+ prt_printf(err, "field too small (%zu < %u)", vstruct_bytes(f), 88);
+ return -BCH_ERR_invalid_sb_ext;
+ }
+
+ return 0;
+}
+
+static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_ext *e = field_to_type(f, ext);
+
+ prt_printf(out, "Recovery passes required:");
+ prt_tab(out);
+ prt_bitflags(out, bch2_recovery_passes,
+ bch2_recovery_passes_from_stable(le64_to_cpu(e->recovery_passes_required[0])));
+ prt_newline(out);
+
+ unsigned long *errors_silent = kmalloc(sizeof(e->errors_silent), GFP_KERNEL);
+ if (errors_silent) {
+ le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8);
+
+ prt_printf(out, "Errors to silently fix:");
+ prt_tab(out);
+ prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, sizeof(e->errors_silent) * 8);
+ prt_newline(out);
+
+ kfree(errors_silent);
+ }
+}
+
+static const struct bch_sb_field_ops bch_sb_field_ops_ext = {
+ .validate = bch2_sb_ext_validate,
+ .to_text = bch2_sb_ext_to_text,
+};
+
+static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
+#define x(f, nr) \
+ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
+ BCH_SB_FIELDS()
+#undef x
+};
+
+static const struct bch_sb_field_ops bch2_sb_field_null_ops;
+
+static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type)
+{
+ return likely(type < ARRAY_SIZE(bch2_sb_field_ops))
+ ? bch2_sb_field_ops[type]
+ : &bch2_sb_field_null_ops;
+}
+
+static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ unsigned type = le32_to_cpu(f->type);
+ struct printbuf field_err = PRINTBUF;
+ const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
+ int ret;
+
+ ret = ops->validate ? ops->validate(sb, f, &field_err) : 0;
+ if (ret) {
+ prt_printf(err, "Invalid superblock section %s: %s",
+ bch2_sb_fields[type], field_err.buf);
+ prt_newline(err);
+ bch2_sb_field_to_text(err, sb, f);
+ }
+
+ printbuf_exit(&field_err);
+ return ret;
+}
+
+void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ unsigned type = le32_to_cpu(f->type);
+ const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
+
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
+
+ if (type < BCH_SB_FIELD_NR)
+ prt_printf(out, "%s", bch2_sb_fields[type]);
+ else
+ prt_printf(out, "(unknown field %u)", type);
+
+ prt_printf(out, " (size %zu):", vstruct_bytes(f));
+ prt_newline(out);
+
+ if (ops->to_text) {
+ printbuf_indent_add(out, 2);
+ ops->to_text(out, sb, f);
+ printbuf_indent_sub(out, 2);
+ }
+}
+
+void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
+{
+ unsigned i;
+
+ prt_printf(out, "Type: %u", l->layout_type);
+ prt_newline(out);
+
+ prt_str(out, "Superblock max size: ");
+ prt_units_u64(out, 512 << l->sb_max_size_bits);
+ prt_newline(out);
+
+ prt_printf(out, "Nr superblocks: %u", l->nr_superblocks);
+ prt_newline(out);
+
+ prt_str(out, "Offsets: ");
+ for (i = 0; i < l->nr_superblocks; i++) {
+ if (i)
+ prt_str(out, ", ");
+ prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i]));
+ }
+ prt_newline(out);
+}
+
+void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
+ bool print_layout, unsigned fields)
+{
+ struct bch_sb_field *f;
+ u64 fields_have = 0;
+ unsigned nr_devices = 0;
+
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 44);
+
+ for (int i = 0; i < sb->nr_devices; i++)
+ nr_devices += bch2_dev_exists(sb, i);
+
+ prt_printf(out, "External UUID:");
+ prt_tab(out);
+ pr_uuid(out, sb->user_uuid.b);
+ prt_newline(out);
+
+ prt_printf(out, "Internal UUID:");
+ prt_tab(out);
+ pr_uuid(out, sb->uuid.b);
+ prt_newline(out);
+
+ prt_str(out, "Device index:");
+ prt_tab(out);
+ prt_printf(out, "%u", sb->dev_idx);
+ prt_newline(out);
+
+ prt_str(out, "Label:");
+ prt_tab(out);
+ prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
+ prt_newline(out);
+
+ prt_str(out, "Version:");
+ prt_tab(out);
+ bch2_version_to_text(out, le16_to_cpu(sb->version));
+ prt_newline(out);
+
+ prt_str(out, "Version upgrade complete:");
+ prt_tab(out);
+ bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb));
+ prt_newline(out);
+
+ prt_printf(out, "Oldest version on disk:");
+ prt_tab(out);
+ bch2_version_to_text(out, le16_to_cpu(sb->version_min));
+ prt_newline(out);
+
+ prt_printf(out, "Created:");
+ prt_tab(out);
+ if (sb->time_base_lo)
+ bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
+ else
+ prt_printf(out, "(not set)");
+ prt_newline(out);
+
+ prt_printf(out, "Sequence number:");
+ prt_tab(out);
+ prt_printf(out, "%llu", le64_to_cpu(sb->seq));
+ prt_newline(out);
+
+ prt_printf(out, "Superblock size:");
+ prt_tab(out);
+ prt_printf(out, "%zu", vstruct_bytes(sb));
+ prt_newline(out);
+
+ prt_printf(out, "Clean:");
+ prt_tab(out);
+ prt_printf(out, "%llu", BCH_SB_CLEAN(sb));
+ prt_newline(out);
+
+ prt_printf(out, "Devices:");
+ prt_tab(out);
+ prt_printf(out, "%u", nr_devices);
+ prt_newline(out);
+
+ prt_printf(out, "Sections:");
+ vstruct_for_each(sb, f)
+ fields_have |= 1 << le32_to_cpu(f->type);
+ prt_tab(out);
+ prt_bitflags(out, bch2_sb_fields, fields_have);
+ prt_newline(out);
+
+ prt_printf(out, "Features:");
+ prt_tab(out);
+ prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0]));
+ prt_newline(out);
+
+ prt_printf(out, "Compat features:");
+ prt_tab(out);
+ prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0]));
+ prt_newline(out);
+
+ prt_newline(out);
+ prt_printf(out, "Options:");
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+ {
+ enum bch_opt_id id;
+
+ for (id = 0; id < bch2_opts_nr; id++) {
+ const struct bch_option *opt = bch2_opt_table + id;
+
+ if (opt->get_sb != BCH2_NO_SB_OPT) {
+ u64 v = bch2_opt_from_sb(sb, id);
+
+ prt_printf(out, "%s:", opt->attr.name);
+ prt_tab(out);
+ bch2_opt_to_text(out, NULL, sb, opt, v,
+ OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST);
+ prt_newline(out);
+ }
+ }
+ }
+
+ printbuf_indent_sub(out, 2);
+
+ if (print_layout) {
+ prt_newline(out);
+ prt_printf(out, "layout:");
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+ bch2_sb_layout_to_text(out, &sb->layout);
+ printbuf_indent_sub(out, 2);
+ }
+
+ vstruct_for_each(sb, f)
+ if (fields & (1 << le32_to_cpu(f->type))) {
+ prt_newline(out);
+ bch2_sb_field_to_text(out, sb, f);
+ }
+}
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
new file mode 100644
index 000000000000..e41e5de531a0
--- /dev/null
+++ b/fs/bcachefs/super-io.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUPER_IO_H
+#define _BCACHEFS_SUPER_IO_H
+
+#include "extents.h"
+#include "eytzinger.h"
+#include "super_types.h"
+#include "super.h"
+#include "sb-members.h"
+
+#include <asm/byteorder.h>
+
+static inline bool bch2_version_compatible(u16 version)
+{
+ return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) &&
+ version >= bcachefs_metadata_version_min;
+}
+
+void bch2_version_to_text(struct printbuf *, unsigned);
+unsigned bch2_latest_compatible_version(unsigned);
+
+u64 bch2_upgrade_recovery_passes(struct bch_fs *c,
+ unsigned,
+ unsigned);
+
+static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)
+{
+ return le32_to_cpu(f->u64s) * sizeof(u64);
+}
+
+#define field_to_type(_f, _name) \
+ container_of_or_null(_f, struct bch_sb_field_##_name, field)
+
+struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *, enum bch_sb_field_type);
+#define bch2_sb_field_get(_sb, _name) \
+ field_to_type(bch2_sb_field_get_id(_sb, BCH_SB_FIELD_##_name), _name)
+
+struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *,
+ enum bch_sb_field_type, unsigned);
+#define bch2_sb_field_resize(_sb, _name, _u64s) \
+ field_to_type(bch2_sb_field_resize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name)
+
+struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *,
+ enum bch_sb_field_type, unsigned);
+#define bch2_sb_field_get_minsize(_sb, _name, _u64s) \
+ field_to_type(bch2_sb_field_get_minsize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name)
+
+#define bch2_sb_field_nr_entries(_f) \
+ (_f ? ((bch2_sb_field_bytes(&_f->field) - sizeof(*_f)) / \
+ sizeof(_f->entries[0])) \
+ : 0)
+
+void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type);
+
+extern const char * const bch2_sb_fields[];
+
+struct bch_sb_field_ops {
+ int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *);
+ void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *);
+};
+
+static inline __le64 bch2_sb_magic(struct bch_fs *c)
+{
+ __le64 ret;
+
+ memcpy(&ret, &c->sb.uuid, sizeof(ret));
+ return ret;
+}
+
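+/*
+ * Journal and btree node magic numbers are derived from the filesystem UUID,
+ * so metadata belonging to one filesystem can't be mistaken for another's:
+ */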
+static inline __u64 jset_magic(struct bch_fs *c)
+{
+ return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC);
+}
+
+static inline __u64 bset_magic(struct bch_fs *c)
+{
+ return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC);
+}
+
+int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *);
+int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
+
+void bch2_free_super(struct bch_sb_handle *);
+int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
+
+int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
+int bch2_write_super(struct bch_fs *);
+void __bch2_check_set_feature(struct bch_fs *, unsigned);
+
+static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
+{
+ if (!(c->sb.features & (1ULL << feat)))
+ __bch2_check_set_feature(c, feat);
+}
+
+bool bch2_check_version_downgrade(struct bch_fs *);
+void bch2_sb_upgrade(struct bch_fs *, unsigned);
+
+void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
+ struct bch_sb_field *);
+void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *);
+void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned);
+
+#endif /* _BCACHEFS_SUPER_IO_H */
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
new file mode 100644
index 000000000000..818ec467a06b
--- /dev/null
+++ b/fs/bcachefs/super.c
@@ -0,0 +1,2030 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * bcachefs setup/teardown code, and some metadata io - read a superblock and
+ * figure out what to do with it.
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "bkey_sort.h"
+#include "btree_cache.h"
+#include "btree_gc.h"
+#include "btree_journal_iter.h"
+#include "btree_key_cache.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "btree_write_buffer.h"
+#include "buckets_waiting_for_journal.h"
+#include "chardev.h"
+#include "checksum.h"
+#include "clock.h"
+#include "compress.h"
+#include "counters.h"
+#include "debug.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "errcode.h"
+#include "error.h"
+#include "fs.h"
+#include "fs-io.h"
+#include "fs-io-buffered.h"
+#include "fs-io-direct.h"
+#include "fsck.h"
+#include "inode.h"
+#include "io_read.h"
+#include "io_write.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
+#include "move.h"
+#include "migrate.h"
+#include "movinggc.h"
+#include "nocow_locking.h"
+#include "quota.h"
+#include "rebalance.h"
+#include "recovery.h"
+#include "replicas.h"
+#include "sb-clean.h"
+#include "sb-errors.h"
+#include "sb-members.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "super.h"
+#include "super-io.h"
+#include "sysfs.h"
+#include "trace.h"
+
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/idr.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/random.h>
+#include <linux/sysfs.h>
+#include <crypto/hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+MODULE_DESCRIPTION("bcachefs filesystem");
+MODULE_SOFTDEP("pre: crc32c");
+MODULE_SOFTDEP("pre: crc64");
+MODULE_SOFTDEP("pre: sha256");
+MODULE_SOFTDEP("pre: chacha20");
+MODULE_SOFTDEP("pre: poly1305");
+MODULE_SOFTDEP("pre: xxhash");
+
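+/*
+ * KTYPE() stamps out the sysfs boilerplate for each kobject type: an
+ * attribute group, a NULL-terminated list of groups, and the kobj_type
+ * itself, all named after the type:
+ */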
+#define KTYPE(type) \
+static const struct attribute_group type ## _group = { \
+ .attrs = type ## _files \
+}; \
+ \
+static const struct attribute_group *type ## _groups[] = { \
+ &type ## _group, \
+ NULL \
+}; \
+ \
+static const struct kobj_type type ## _ktype = { \
+ .release = type ## _release, \
+ .sysfs_ops = &type ## _sysfs_ops, \
+ .default_groups = type ## _groups \
+}
+
+static void bch2_fs_release(struct kobject *);
+static void bch2_dev_release(struct kobject *);
+static void bch2_fs_counters_release(struct kobject *k)
+{
+}
+
+static void bch2_fs_internal_release(struct kobject *k)
+{
+}
+
+static void bch2_fs_opts_dir_release(struct kobject *k)
+{
+}
+
+static void bch2_fs_time_stats_release(struct kobject *k)
+{
+}
+
+KTYPE(bch2_fs);
+KTYPE(bch2_fs_counters);
+KTYPE(bch2_fs_internal);
+KTYPE(bch2_fs_opts_dir);
+KTYPE(bch2_fs_time_stats);
+KTYPE(bch2_dev);
+
+static struct kset *bcachefs_kset;
+static LIST_HEAD(bch_fs_list);
+static DEFINE_MUTEX(bch_fs_list_lock);
+
+DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);
+
+static void bch2_dev_free(struct bch_dev *);
+static int bch2_dev_alloc(struct bch_fs *, unsigned);
+static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
+static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
+
+struct bch_fs *bch2_dev_to_fs(dev_t dev)
+{
+ struct bch_fs *c;
+ struct bch_dev *ca;
+ unsigned i;
+
+ mutex_lock(&bch_fs_list_lock);
+ rcu_read_lock();
+
+ list_for_each_entry(c, &bch_fs_list, list)
+ for_each_member_device_rcu(ca, c, i, NULL)
+ if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
+ closure_get(&c->cl);
+ goto found;
+ }
+ c = NULL;
+found:
+ rcu_read_unlock();
+ mutex_unlock(&bch_fs_list_lock);
+
+ return c;
+}
+
+static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
+{
+ struct bch_fs *c;
+
+ lockdep_assert_held(&bch_fs_list_lock);
+
+ list_for_each_entry(c, &bch_fs_list, list)
+ if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid)))
+ return c;
+
+ return NULL;
+}
+
+struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
+{
+ struct bch_fs *c;
+
+ mutex_lock(&bch_fs_list_lock);
+ c = __bch2_uuid_to_fs(uuid);
+ if (c)
+ closure_get(&c->cl);
+ mutex_unlock(&bch_fs_list_lock);
+
+ return c;
+}
+
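+/*
+ * Every journal write carries a usage entry per member device; make sure the
+ * journal entry reservation is sized for all of them:
+ */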
+static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i, nr = 0, u64s =
+ ((sizeof(struct jset_entry_dev_usage) +
+ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) /
+ sizeof(u64);
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL)
+ nr++;
+ rcu_read_unlock();
+
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->dev_usage_journal_res, u64s * nr);
+}
+
+/* Filesystem RO/RW: */
+
+/*
+ * For startup/shutdown of RW stuff, the dependencies are:
+ *
+ * - foreground writes depend on copygc and rebalance (to free up space)
+ *
+ * - copygc and rebalance depend on mark and sweep gc (they actually probably
+ * don't because they either reserve ahead of time or don't block if
+ * allocations fail, but allocations can require mark and sweep gc to run
+ * because of generation number wraparound)
+ *
+ * - all of the above depends on the allocator threads
+ *
+ * - allocator depends on the journal (when it rewrites prios and gens)
+ */
+
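+/*
+ * Tear down in the reverse order of the dependencies above: stop rebalance
+ * and copygc, then gc, then flush and stop the journal, and only then remove
+ * devices from the allocator:
+ */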
+static void __bch2_fs_read_only(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i, clean_passes = 0;
+ u64 seq = 0;
+
+ bch2_fs_ec_stop(c);
+ bch2_open_buckets_stop(c, NULL, true);
+ bch2_rebalance_stop(c);
+ bch2_copygc_stop(c);
+ bch2_gc_thread_stop(c);
+ bch2_fs_ec_flush(c);
+
+ bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
+ journal_cur_seq(&c->journal));
+
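+ /*
+ * Loop until two consecutive passes complete without flushing anything
+ * and without the journal sequence number advancing - i.e. until
+ * everything has quiesced:
+ */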
+ do {
+ clean_passes++;
+
+ if (bch2_btree_interior_updates_flush(c) ||
+ bch2_journal_flush_all_pins(&c->journal) ||
+ bch2_btree_flush_all_writes(c) ||
+ seq != atomic64_read(&c->journal.seq)) {
+ seq = atomic64_read(&c->journal.seq);
+ clean_passes = 0;
+ }
+ } while (clean_passes < 2);
+
+ bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu",
+ journal_cur_seq(&c->journal));
+
+ if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
+ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+ set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
+ bch2_fs_journal_stop(&c->journal);
+
+ /*
+ * After the journal has been stopped, remove devices from the
+ * allocator:
+ */
+ for_each_member_device(ca, c, i)
+ bch2_dev_allocator_remove(c, ca);
+}
+
+#ifndef BCH_WRITE_REF_DEBUG
+static void bch2_writes_disabled(struct percpu_ref *writes)
+{
+ struct bch_fs *c = container_of(writes, struct bch_fs, writes);
+
+ set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+ wake_up(&bch2_read_only_wait);
+}
+#endif
+
+void bch2_fs_read_only(struct bch_fs *c)
+{
+ if (!test_bit(BCH_FS_RW, &c->flags)) {
+ bch2_journal_reclaim_stop(&c->journal);
+ return;
+ }
+
+ BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+
+ /*
+ * Block new foreground-end write operations from starting - any new
+ * writes will return -EROFS:
+ */
+ set_bit(BCH_FS_GOING_RO, &c->flags);
+#ifndef BCH_WRITE_REF_DEBUG
+ percpu_ref_kill(&c->writes);
+#else
+ for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
+ bch2_write_ref_put(c, i);
+#endif
+
+ /*
+ * If we're not doing an emergency shutdown, we want to wait on
+ * outstanding writes to complete so they don't see spurious errors due
+ * to shutting down the allocator:
+ *
+ * If we are doing an emergency shutdown, outstanding writes may hang
+ * until we shut down the allocator, so we don't want to wait on
+ * outstanding writes before shutting everything down - but we do need
+ * to wait on them before returning and signalling that going RO is
+ * complete:
+ */
+ wait_event(bch2_read_only_wait,
+ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
+ test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
+
+ __bch2_fs_read_only(c);
+
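+ /*
+ * Even on emergency shutdown, don't return until the write ref has
+ * actually hit zero - going RO isn't complete until all outstanding
+ * writes have finished:
+ */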
+ wait_event(bch2_read_only_wait,
+ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+
+ clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+ clear_bit(BCH_FS_GOING_RO, &c->flags);
+
+ if (!bch2_journal_error(&c->journal) &&
+ !test_bit(BCH_FS_ERROR, &c->flags) &&
+ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) &&
+ test_bit(BCH_FS_STARTED, &c->flags) &&
+ test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) &&
+ !c->opts.norecovery) {
+ BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
+ BUG_ON(atomic_read(&c->btree_cache.dirty));
+ BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
+ BUG_ON(c->btree_write_buffer.state.nr);
+
+ bch_verbose(c, "marking filesystem clean");
+ bch2_fs_mark_clean(c);
+ }
+
+ clear_bit(BCH_FS_RW, &c->flags);
+}
+
+static void bch2_fs_read_only_work(struct work_struct *work)
+{
+ struct bch_fs *c =
+ container_of(work, struct bch_fs, read_only_work);
+
+ down_write(&c->state_lock);
+ bch2_fs_read_only(c);
+ up_write(&c->state_lock);
+}
+
+static void bch2_fs_read_only_async(struct bch_fs *c)
+{
+ queue_work(system_long_wq, &c->read_only_work);
+}
+
+bool bch2_fs_emergency_read_only(struct bch_fs *c)
+{
+ bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
+
+ bch2_journal_halt(&c->journal);
+ bch2_fs_read_only_async(c);
+
+ wake_up(&bch2_read_only_wait);
+ return ret;
+}
+
+static int bch2_fs_read_write_late(struct bch_fs *c)
+{
+ int ret;
+
+ /*
+ * Data move operations can't run until after check_snapshots has
+ * completed, and bch2_snapshot_is_ancestor() is available.
+ *
+ * Ideally we'd start copygc/rebalance earlier instead of waiting for
+ * all of recovery/fsck to complete:
+ */
+ ret = bch2_copygc_start(c);
+ if (ret) {
+ bch_err(c, "error starting copygc thread");
+ return ret;
+ }
+
+ ret = bch2_rebalance_start(c);
+ if (ret) {
+ bch_err(c, "error starting rebalance thread");
+ return ret;
+ }
+
+ return 0;
+}
+
+static int __bch2_fs_read_write(struct bch_fs *c, bool early)
+{
+ struct bch_dev *ca;
+ unsigned i;
+ int ret;
+
+ if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) {
+ bch_err(c, "cannot go rw, unfixed btree errors");
+ return -BCH_ERR_erofs_unfixed_errors;
+ }
+
+ if (test_bit(BCH_FS_RW, &c->flags))
+ return 0;
+
+ if (c->opts.norecovery)
+ return -BCH_ERR_erofs_norecovery;
+
+ /*
+ * nochanges is used for fsck -n mode - we have to allow going rw
+ * during recovery for that to work:
+ */
+ if (c->opts.nochanges && (!early || c->opts.read_only))
+ return -BCH_ERR_erofs_nochanges;
+
+ bch_info(c, "going read-write");
+
+ ret = bch2_sb_members_v2_init(c);
+ if (ret)
+ goto err;
+
+ ret = bch2_fs_mark_dirty(c);
+ if (ret)
+ goto err;
+
+ clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
+
+ /*
+ * First journal write must be a flush write: after a clean shutdown we
+ * don't read the journal, so the first journal write may end up
+ * overwriting whatever was there previously, and there must always be
+ * at least one non-flush write in the journal or recovery will fail:
+ */
+ set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags);
+
+ for_each_rw_member(ca, c, i)
+ bch2_dev_allocator_add(c, ca);
+ bch2_recalc_capacity(c);
+
+ set_bit(BCH_FS_RW, &c->flags);
+ set_bit(BCH_FS_WAS_RW, &c->flags);
+
+#ifndef BCH_WRITE_REF_DEBUG
+ percpu_ref_reinit(&c->writes);
+#else
+ for (i = 0; i < BCH_WRITE_REF_NR; i++) {
+ BUG_ON(atomic_long_read(&c->writes[i]));
+ atomic_long_inc(&c->writes[i]);
+ }
+#endif
+
+ ret = bch2_gc_thread_start(c);
+ if (ret) {
+ bch_err(c, "error starting gc thread");
+ return ret;
+ }
+
+ ret = bch2_journal_reclaim_start(&c->journal);
+ if (ret)
+ goto err;
+
+ if (!early) {
+ ret = bch2_fs_read_write_late(c);
+ if (ret)
+ goto err;
+ }
+
+ bch2_do_discards(c);
+ bch2_do_invalidates(c);
+ bch2_do_stripe_deletes(c);
+ bch2_do_pending_node_rewrites(c);
+ return 0;
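+ /*
+ * Error path: if we got far enough to set BCH_FS_RW, go through the
+ * full read-only path; otherwise just tear down what was started:
+ */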
+err:
+ if (test_bit(BCH_FS_RW, &c->flags))
+ bch2_fs_read_only(c);
+ else
+ __bch2_fs_read_only(c);
+ return ret;
+}
+
+int bch2_fs_read_write(struct bch_fs *c)
+{
+ return __bch2_fs_read_write(c, false);
+}
+
+int bch2_fs_read_write_early(struct bch_fs *c)
+{
+ lockdep_assert_held(&c->state_lock);
+
+ return __bch2_fs_read_write(c, true);
+}
+
+/* Filesystem startup/shutdown: */
+
+static void __bch2_fs_free(struct bch_fs *c)
+{
+ unsigned i;
+
+ for (i = 0; i < BCH_TIME_STAT_NR; i++)
+ bch2_time_stats_exit(&c->times[i]);
+
+ bch2_free_pending_node_rewrites(c);
+ bch2_fs_sb_errors_exit(c);
+ bch2_fs_counters_exit(c);
+ bch2_fs_snapshots_exit(c);
+ bch2_fs_quota_exit(c);
+ bch2_fs_fs_io_direct_exit(c);
+ bch2_fs_fs_io_buffered_exit(c);
+ bch2_fs_fsio_exit(c);
+ bch2_fs_ec_exit(c);
+ bch2_fs_encryption_exit(c);
+ bch2_fs_nocow_locking_exit(c);
+ bch2_fs_io_write_exit(c);
+ bch2_fs_io_read_exit(c);
+ bch2_fs_buckets_waiting_for_journal_exit(c);
+ bch2_fs_btree_interior_update_exit(c);
+ bch2_fs_btree_iter_exit(c);
+ bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
+ bch2_fs_btree_cache_exit(c);
+ bch2_fs_replicas_exit(c);
+ bch2_fs_journal_exit(&c->journal);
+ bch2_io_clock_exit(&c->io_clock[WRITE]);
+ bch2_io_clock_exit(&c->io_clock[READ]);
+ bch2_fs_compress_exit(c);
+ bch2_journal_keys_put_initial(c);
+ BUG_ON(atomic_read(&c->journal_keys.ref));
+ bch2_fs_btree_write_buffer_exit(c);
+ percpu_free_rwsem(&c->mark_lock);
+ free_percpu(c->online_reserved);
+
+ darray_exit(&c->btree_roots_extra);
+ free_percpu(c->pcpu);
+ mempool_exit(&c->large_bkey_pool);
+ mempool_exit(&c->btree_bounce_pool);
+ bioset_exit(&c->btree_bio);
+ mempool_exit(&c->fill_iter);
+#ifndef BCH_WRITE_REF_DEBUG
+ percpu_ref_exit(&c->writes);
+#endif
+ kfree(rcu_dereference_protected(c->disk_groups, 1));
+ kfree(c->journal_seq_blacklist_table);
+ kfree(c->unused_inode_hints);
+
+ if (c->write_ref_wq)
+ destroy_workqueue(c->write_ref_wq);
+ if (c->io_complete_wq)
+ destroy_workqueue(c->io_complete_wq);
+ if (c->copygc_wq)
+ destroy_workqueue(c->copygc_wq);
+ if (c->btree_io_complete_wq)
+ destroy_workqueue(c->btree_io_complete_wq);
+ if (c->btree_update_wq)
+ destroy_workqueue(c->btree_update_wq);
+
+ bch2_free_super(&c->disk_sb);
+ kvpfree(c, sizeof(*c));
+ module_put(THIS_MODULE);
+}
+
+static void bch2_fs_release(struct kobject *kobj)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
+
+ __bch2_fs_free(c);
+}
+
+void __bch2_fs_stop(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ bch_verbose(c, "shutting down");
+
+ set_bit(BCH_FS_STOPPING, &c->flags);
+
+ cancel_work_sync(&c->journal_seq_blacklist_gc_work);
+
+ down_write(&c->state_lock);
+ bch2_fs_read_only(c);
+ up_write(&c->state_lock);
+
+ for_each_member_device(ca, c, i)
+ if (ca->kobj.state_in_sysfs &&
+ ca->disk_sb.bdev)
+ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+
+ if (c->kobj.state_in_sysfs)
+ kobject_del(&c->kobj);
+
+ bch2_fs_debug_exit(c);
+ bch2_fs_chardev_exit(c);
+
+ kobject_put(&c->counters_kobj);
+ kobject_put(&c->time_stats);
+ kobject_put(&c->opts_dir);
+ kobject_put(&c->internal);
+
+ /* btree prefetch might have kicked off reads in the background: */
+ bch2_btree_flush_all_reads(c);
+
+ for_each_member_device(ca, c, i)
+ cancel_work_sync(&ca->io_error_work);
+
+ cancel_work_sync(&c->read_only_work);
+}
+
+void bch2_fs_free(struct bch_fs *c)
+{
+ unsigned i;
+
+ mutex_lock(&bch_fs_list_lock);
+ list_del(&c->list);
+ mutex_unlock(&bch_fs_list_lock);
+
+ closure_sync(&c->cl);
+ closure_debug_destroy(&c->cl);
+
+ for (i = 0; i < c->sb.nr_devices; i++) {
+ struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
+
+ if (ca) {
+ bch2_free_super(&ca->disk_sb);
+ bch2_dev_free(ca);
+ }
+ }
+
+ bch_verbose(c, "shutdown complete");
+
+ kobject_put(&c->kobj);
+}
+
+void bch2_fs_stop(struct bch_fs *c)
+{
+ __bch2_fs_stop(c);
+ bch2_fs_free(c);
+}
+
+static int bch2_fs_online(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+ int ret = 0;
+
+ lockdep_assert_held(&bch_fs_list_lock);
+
+ if (__bch2_uuid_to_fs(c->sb.uuid)) {
+ bch_err(c, "filesystem UUID already open");
+ return -EINVAL;
+ }
+
+ ret = bch2_fs_chardev_init(c);
+ if (ret) {
+ bch_err(c, "error creating character device");
+ return ret;
+ }
+
+ bch2_fs_debug_init(c);
+
+ ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
+ kobject_add(&c->internal, &c->kobj, "internal") ?:
+ kobject_add(&c->opts_dir, &c->kobj, "options") ?:
+ kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
+ kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
+ bch2_opts_create_sysfs_files(&c->opts_dir);
+ if (ret) {
+ bch_err(c, "error creating sysfs objects");
+ return ret;
+ }
+
+ down_write(&c->state_lock);
+
+ for_each_member_device(ca, c, i) {
+ ret = bch2_dev_sysfs_online(c, ca);
+ if (ret) {
+ bch_err(c, "error creating sysfs objects");
+ percpu_ref_put(&ca->ref);
+ goto err;
+ }
+ }
+
+ BUG_ON(!list_empty(&c->list));
+ list_add(&c->list, &bch_fs_list);
+err:
+ up_write(&c->state_lock);
+ return ret;
+}
+
+static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
+{
+ struct bch_fs *c;
+ struct printbuf name = PRINTBUF;
+ unsigned i, iter_size;
+ int ret = 0;
+
+ c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
+ if (!c) {
+ c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
+ goto out;
+ }
+
+ __module_get(THIS_MODULE);
+
+ closure_init(&c->cl, NULL);
+
+ c->kobj.kset = bcachefs_kset;
+ kobject_init(&c->kobj, &bch2_fs_ktype);
+ kobject_init(&c->internal, &bch2_fs_internal_ktype);
+ kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
+ kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
+ kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype);
+
+ c->minor = -1;
+ c->disk_sb.fs_sb = true;
+
+ init_rwsem(&c->state_lock);
+ mutex_init(&c->sb_lock);
+ mutex_init(&c->replicas_gc_lock);
+ mutex_init(&c->btree_root_lock);
+ INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
+
+ init_rwsem(&c->gc_lock);
+ mutex_init(&c->gc_gens_lock);
+ atomic_set(&c->journal_keys.ref, 1);
+ c->journal_keys.initial_ref_held = true;
+
+ for (i = 0; i < BCH_TIME_STAT_NR; i++)
+ bch2_time_stats_init(&c->times[i]);
+
+ bch2_fs_copygc_init(c);
+ bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
+ bch2_fs_btree_iter_init_early(c);
+ bch2_fs_btree_interior_update_init_early(c);
+ bch2_fs_allocator_background_init(c);
+ bch2_fs_allocator_foreground_init(c);
+ bch2_fs_rebalance_init(c);
+ bch2_fs_quota_init(c);
+ bch2_fs_ec_init_early(c);
+ bch2_fs_move_init(c);
+ bch2_fs_sb_errors_init_early(c);
+
+ INIT_LIST_HEAD(&c->list);
+
+ mutex_init(&c->usage_scratch_lock);
+
+ mutex_init(&c->bio_bounce_pages_lock);
+ mutex_init(&c->snapshot_table_lock);
+ init_rwsem(&c->snapshot_create_lock);
+
+ spin_lock_init(&c->btree_write_error_lock);
+
+ INIT_WORK(&c->journal_seq_blacklist_gc_work,
+ bch2_blacklist_entries_gc);
+
+ INIT_LIST_HEAD(&c->journal_iters);
+
+ INIT_LIST_HEAD(&c->fsck_error_msgs);
+ mutex_init(&c->fsck_error_msgs_lock);
+
+ seqcount_init(&c->gc_pos_lock);
+
+ seqcount_init(&c->usage_lock);
+
+ sema_init(&c->io_in_flight, 128);
+
+ INIT_LIST_HEAD(&c->vfs_inodes_list);
+ mutex_init(&c->vfs_inodes_lock);
+
+ c->copy_gc_enabled = 1;
+ c->rebalance.enabled = 1;
+ c->promote_whole_extents = true;
+
+ c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];
+ c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
+ c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal];
+ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
+
+ bch2_fs_btree_cache_init_early(&c->btree_cache);
+
+ mutex_init(&c->sectors_available_lock);
+
+ ret = percpu_init_rwsem(&c->mark_lock);
+ if (ret)
+ goto err;
+
+ mutex_lock(&c->sb_lock);
+ ret = bch2_sb_to_fs(c, sb);
+ mutex_unlock(&c->sb_lock);
+
+ if (ret)
+ goto err;
+
+ pr_uuid(&name, c->sb.user_uuid.b);
+ strscpy(c->name, name.buf, sizeof(c->name));
+ printbuf_exit(&name);
+
+ ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
+ if (ret)
+ goto err;
+
+ /* Compat: */
+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
+ !BCH_SB_JOURNAL_FLUSH_DELAY(sb))
+ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
+
+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
+ !BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
+ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100);
+
+ c->opts = bch2_opts_default;
+ ret = bch2_opts_from_sb(&c->opts, sb);
+ if (ret)
+ goto err;
+
+ bch2_opts_apply(&c->opts, opts);
+
+ c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
+ if (c->opts.inodes_use_key_cache)
+ c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
+ c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops;
+
+ c->block_bits = ilog2(block_sectors(c));
+ c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
+
+ if (bch2_fs_init_fault("fs_alloc")) {
+ bch_err(c, "fs_alloc fault injected");
+ ret = -EFAULT;
+ goto err;
+ }
+
+ iter_size = sizeof(struct sort_iter) +
+ (btree_blocks(c) + 1) * 2 *
+ sizeof(struct sort_iter_set);
+
+ c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
+
+ if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
+ WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) ||
+ !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
+ WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
+ !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+ !(c->io_complete_wq = alloc_workqueue("bcachefs_io",
+ WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) ||
+ !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
+ WQ_FREEZABLE, 0)) ||
+#ifndef BCH_WRITE_REF_DEBUG
+ percpu_ref_init(&c->writes, bch2_writes_disabled,
+ PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
+#endif
+ mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
+ bioset_init(&c->btree_bio, 1,
+ max(offsetof(struct btree_read_bio, bio),
+ offsetof(struct btree_write_bio, wbio.bio)),
+ BIOSET_NEED_BVECS) ||
+ !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
+ !(c->online_reserved = alloc_percpu(u64)) ||
+ mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
+ btree_bytes(c)) ||
+ mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
+ !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
+ sizeof(u64), GFP_KERNEL))) {
+ ret = -BCH_ERR_ENOMEM_fs_other_alloc;
+ goto err;
+ }
+
+ ret = bch2_fs_counters_init(c) ?:
+ bch2_fs_sb_errors_init(c) ?:
+ bch2_io_clock_init(&c->io_clock[READ]) ?:
+ bch2_io_clock_init(&c->io_clock[WRITE]) ?:
+ bch2_fs_journal_init(&c->journal) ?:
+ bch2_fs_replicas_init(c) ?:
+ bch2_fs_btree_cache_init(c) ?:
+ bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
+ bch2_fs_btree_iter_init(c) ?:
+ bch2_fs_btree_interior_update_init(c) ?:
+ bch2_fs_buckets_waiting_for_journal_init(c) ?:
+ bch2_fs_btree_write_buffer_init(c) ?:
+ bch2_fs_subvolumes_init(c) ?:
+ bch2_fs_io_read_init(c) ?:
+ bch2_fs_io_write_init(c) ?:
+ bch2_fs_nocow_locking_init(c) ?:
+ bch2_fs_encryption_init(c) ?:
+ bch2_fs_compress_init(c) ?:
+ bch2_fs_ec_init(c) ?:
+ bch2_fs_fsio_init(c) ?:
+ bch2_fs_fs_io_buffered_init(c) ?:
+ bch2_fs_fs_io_direct_init(c);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < c->sb.nr_devices; i++)
+ if (bch2_dev_exists(c->disk_sb.sb, i) &&
+ bch2_dev_alloc(c, i)) {
+ ret = -EEXIST;
+ goto err;
+ }
+
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->btree_root_journal_res,
+ BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX));
+ bch2_dev_usage_journal_reserve(c);
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->clock_journal_res,
+ (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);
+
+ mutex_lock(&bch_fs_list_lock);
+ ret = bch2_fs_online(c);
+ mutex_unlock(&bch_fs_list_lock);
+
+ if (ret)
+ goto err;
+out:
+ return c;
+err:
+ bch2_fs_free(c);
+ c = ERR_PTR(ret);
+ goto out;
+}
+
+noinline_for_stack
+static void print_mount_opts(struct bch_fs *c)
+{
+ enum bch_opt_id i;
+ struct printbuf p = PRINTBUF;
+ bool first = true;
+
+ prt_str(&p, "mounting version ");
+ bch2_version_to_text(&p, c->sb.version);
+
+ if (c->opts.read_only) {
+ prt_str(&p, " opts=");
+ first = false;
+ prt_printf(&p, "ro");
+ }
+
+ for (i = 0; i < bch2_opts_nr; i++) {
+ const struct bch_option *opt = &bch2_opt_table[i];
+ u64 v = bch2_opt_get_by_id(&c->opts, i);
+
+ if (!(opt->flags & OPT_MOUNT))
+ continue;
+
+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
+ continue;
+
+ prt_str(&p, first ? " opts=" : ",");
+ first = false;
+ bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
+ }
+
+ bch_info(c, "%s", p.buf);
+ printbuf_exit(&p);
+}
+
+int bch2_fs_start(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ time64_t now = ktime_get_real_seconds();
+ unsigned i;
+ int ret;
+
+ print_mount_opts(c);
+
+ down_write(&c->state_lock);
+
+ BUG_ON(test_bit(BCH_FS_STARTED, &c->flags));
+
+ mutex_lock(&c->sb_lock);
+
+ ret = bch2_sb_members_v2_init(c);
+ if (ret) {
+ mutex_unlock(&c->sb_lock);
+ goto err;
+ }
+
+ for_each_online_member(ca, c, i)
+ bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now);
+
+ mutex_unlock(&c->sb_lock);
+
+ for_each_rw_member(ca, c, i)
+ bch2_dev_allocator_add(c, ca);
+ bch2_recalc_capacity(c);
+
+ ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
+ ? bch2_fs_recovery(c)
+ : bch2_fs_initialize(c);
+ if (ret)
+ goto err;
+
+ ret = bch2_opts_check_may_set(c);
+ if (ret)
+ goto err;
+
+ if (bch2_fs_init_fault("fs_start")) {
+ bch_err(c, "fs_start fault injected");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ set_bit(BCH_FS_STARTED, &c->flags);
+
+ if (c->opts.read_only || c->opts.nochanges) {
+ bch2_fs_read_only(c);
+ } else {
+ ret = !test_bit(BCH_FS_RW, &c->flags)
+ ? bch2_fs_read_write(c)
+ : bch2_fs_read_write_late(c);
+ if (ret)
+ goto err;
+ }
+
+ ret = 0;
+out:
+ up_write(&c->state_lock);
+ return ret;
+err:
+ bch_err_msg(c, ret, "starting filesystem");
+ goto out;
+}
+
+static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
+{
+ struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
+
+ if (le16_to_cpu(sb->block_size) != block_sectors(c))
+ return -BCH_ERR_mismatched_block_size;
+
+ if (le16_to_cpu(m.bucket_size) <
+ BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
+ return -BCH_ERR_bucket_size_too_small;
+
+ return 0;
+}
+
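+/*
+ * Check that a device belongs to the filesystem we're assembling: the member
+ * superblock with the higher sequence number is authoritative for whether
+ * the device is still a member:
+ */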
+static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
+{
+ struct bch_sb *newest =
+ le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb;
+
+ if (!uuid_equal(&fs->uuid, &sb->uuid))
+ return -BCH_ERR_device_not_a_member_of_filesystem;
+
+ if (!bch2_dev_exists(newest, sb->dev_idx))
+ return -BCH_ERR_device_has_been_removed;
+
+ if (fs->block_size != sb->block_size)
+ return -BCH_ERR_mismatched_block_size;
+
+ return 0;
+}
+
+/* Device startup/shutdown: */
+
+static void bch2_dev_release(struct kobject *kobj)
+{
+ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
+
+ kfree(ca);
+}
+
+static void bch2_dev_free(struct bch_dev *ca)
+{
+ cancel_work_sync(&ca->io_error_work);
+
+ if (ca->kobj.state_in_sysfs &&
+ ca->disk_sb.bdev)
+ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+
+ if (ca->kobj.state_in_sysfs)
+ kobject_del(&ca->kobj);
+
+ bch2_free_super(&ca->disk_sb);
+ bch2_dev_journal_exit(ca);
+
+ free_percpu(ca->io_done);
+ bioset_exit(&ca->replica_set);
+ bch2_dev_buckets_free(ca);
+ free_page((unsigned long) ca->sb_read_scratch);
+
+ bch2_time_stats_exit(&ca->io_latency[WRITE]);
+ bch2_time_stats_exit(&ca->io_latency[READ]);
+
+ percpu_ref_exit(&ca->io_ref);
+ percpu_ref_exit(&ca->ref);
+ kobject_put(&ca->kobj);
+}
+
+static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
+{
+ lockdep_assert_held(&c->state_lock);
+
+ if (percpu_ref_is_zero(&ca->io_ref))
+ return;
+
+ __bch2_dev_read_only(c, ca);
+
+ reinit_completion(&ca->io_ref_completion);
+ percpu_ref_kill(&ca->io_ref);
+ wait_for_completion(&ca->io_ref_completion);
+
+ if (ca->kobj.state_in_sysfs) {
+ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+ sysfs_remove_link(&ca->kobj, "block");
+ }
+
+ bch2_free_super(&ca->disk_sb);
+ bch2_dev_journal_exit(ca);
+}
+
+static void bch2_dev_ref_complete(struct percpu_ref *ref)
+{
+ struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
+
+ complete(&ca->ref_completion);
+}
+
+static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
+{
+ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
+
+ complete(&ca->io_ref_completion);
+}
+
+static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
+{
+ int ret;
+
+ if (!c->kobj.state_in_sysfs)
+ return 0;
+
+ if (!ca->kobj.state_in_sysfs) {
+ ret = kobject_add(&ca->kobj, &c->kobj,
+ "dev-%u", ca->dev_idx);
+ if (ret)
+ return ret;
+ }
+
+ if (ca->disk_sb.bdev) {
+ struct kobject *block = bdev_kobj(ca->disk_sb.bdev);
+
+ ret = sysfs_create_link(block, &ca->kobj, "bcachefs");
+ if (ret)
+ return ret;
+
+ ret = sysfs_create_link(&ca->kobj, block, "block");
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
+ struct bch_member *member)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ if (!ca)
+ return NULL;
+
+ kobject_init(&ca->kobj, &bch2_dev_ktype);
+ init_completion(&ca->ref_completion);
+ init_completion(&ca->io_ref_completion);
+
+ init_rwsem(&ca->bucket_lock);
+
+ INIT_WORK(&ca->io_error_work, bch2_io_error_work);
+
+ bch2_time_stats_init(&ca->io_latency[READ]);
+ bch2_time_stats_init(&ca->io_latency[WRITE]);
+
+ ca->mi = bch2_mi_to_cpu(member);
+
+ for (i = 0; i < ARRAY_SIZE(member->errors); i++)
+ atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i]));
+
+ ca->uuid = member->uuid;
+
+ ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
+ ca->mi.bucket_size / btree_sectors(c));
+
+ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
+ 0, GFP_KERNEL) ||
+ percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
+ PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
+ !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) ||
+ bch2_dev_buckets_alloc(c, ca) ||
+ bioset_init(&ca->replica_set, 4,
+ offsetof(struct bch_write_bio, bio), 0) ||
+ !(ca->io_done = alloc_percpu(*ca->io_done)))
+ goto err;
+
+ return ca;
+err:
+ bch2_dev_free(ca);
+ return NULL;
+}
+
+static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
+ unsigned dev_idx)
+{
+ ca->dev_idx = dev_idx;
+ __set_bit(ca->dev_idx, ca->self.d);
+ scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
+
+ ca->fs = c;
+ rcu_assign_pointer(c->devs[ca->dev_idx], ca);
+
+ if (bch2_dev_sysfs_online(c, ca))
+ pr_warn("error creating sysfs objects");
+}
+
+static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
+{
+ struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
+ struct bch_dev *ca = NULL;
+ int ret = 0;
+
+ if (bch2_fs_init_fault("dev_alloc"))
+ goto err;
+
+ ca = __bch2_dev_alloc(c, &member);
+ if (!ca)
+ goto err;
+
+ ca->fs = c;
+
+ bch2_dev_attach(c, ca, dev_idx);
+ return ret;
+err:
+ if (ca)
+ bch2_dev_free(ca);
+ return -BCH_ERR_ENOMEM_dev_alloc;
+}
+
+static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
+{
+ unsigned ret;
+
+ if (bch2_dev_is_online(ca)) {
+ bch_err(ca, "already have device online in slot %u",
+ sb->sb->dev_idx);
+ return -BCH_ERR_device_already_online;
+ }
+
+ if (get_capacity(sb->bdev->bd_disk) <
+ ca->mi.bucket_size * ca->mi.nbuckets) {
+ bch_err(ca, "cannot online: device too small");
+ return -BCH_ERR_device_size_too_small;
+ }
+
+ BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
+
+ ret = bch2_dev_journal_init(ca, sb->sb);
+ if (ret)
+ return ret;
+
+ /* Commit: */
+ ca->disk_sb = *sb;
+ memset(sb, 0, sizeof(*sb));
+
+ ca->dev = ca->disk_sb.bdev->bd_dev;
+
+ percpu_ref_reinit(&ca->io_ref);
+
+ return 0;
+}
+
+static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
+{
+ struct bch_dev *ca;
+ int ret;
+
+ lockdep_assert_held(&c->state_lock);
+
+ if (le64_to_cpu(sb->sb->seq) >
+ le64_to_cpu(c->disk_sb.sb->seq))
+ bch2_sb_to_fs(c, sb->sb);
+
+ BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
+ !c->devs[sb->sb->dev_idx]);
+
+ ca = bch_dev_locked(c, sb->sb->dev_idx);
+
+ ret = __bch2_dev_attach_bdev(ca, sb);
+ if (ret)
+ return ret;
+
+ bch2_dev_sysfs_online(c, ca);
+
+ if (c->sb.nr_devices == 1)
+ snprintf(c->name, sizeof(c->name), "%pg", ca->disk_sb.bdev);
+ snprintf(ca->name, sizeof(ca->name), "%pg", ca->disk_sb.bdev);
+
+ rebalance_wakeup(c);
+ return 0;
+}
+
+/* Device management: */
+
+/*
+ * Note: this function is also used by the error paths - when a particular
+ * device sees an error, we call it to determine whether we can just set the
+ * device RO, or - if this function returns false - we'll set the whole
+ * filesystem RO:
+ *
+ * XXX: maybe we should be more explicit about whether we're changing state
+ * because we got an error or what have you?
+ */
+bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
+ enum bch_member_state new_state, int flags)
+{
+ struct bch_devs_mask new_online_devs;
+ struct bch_dev *ca2;
+ int i, nr_rw = 0, required;
+
+ lockdep_assert_held(&c->state_lock);
+
+ switch (new_state) {
+ case BCH_MEMBER_STATE_rw:
+ return true;
+ case BCH_MEMBER_STATE_ro:
+ if (ca->mi.state != BCH_MEMBER_STATE_rw)
+ return true;
+
+ /* do we have enough devices to write to? */
+ for_each_member_device(ca2, c, i)
+ if (ca2 != ca)
+ nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;
+
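+ /*
+ * If the caller allows degraded metadata/data we only need the
+ * configured minimum number of replicas; otherwise we need the full
+ * replication level:
+ */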
+ required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
+ ? c->opts.metadata_replicas
+ : c->opts.metadata_replicas_required,
+ !(flags & BCH_FORCE_IF_DATA_DEGRADED)
+ ? c->opts.data_replicas
+ : c->opts.data_replicas_required);
+
+ return nr_rw >= required;
+ case BCH_MEMBER_STATE_failed:
+ case BCH_MEMBER_STATE_spare:
+ if (ca->mi.state != BCH_MEMBER_STATE_rw &&
+ ca->mi.state != BCH_MEMBER_STATE_ro)
+ return true;
+
+ /* do we have enough devices to read from? */
+ new_online_devs = bch2_online_devs(c);
+ __clear_bit(ca->dev_idx, new_online_devs.d);
+
+ return bch2_have_enough_devs(c, new_online_devs, flags, false);
+ default:
+ BUG();
+ }
+}
+
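+/*
+ * Can we start with the devices currently available? Unless a degraded mount
+ * was requested, every member marked rw or ro must be online:
+ */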
+static bool bch2_fs_may_start(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i, flags = 0;
+
+ if (c->opts.very_degraded)
+ flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
+
+ if (c->opts.degraded)
+ flags |= BCH_FORCE_IF_DEGRADED;
+
+ if (!c->opts.degraded &&
+ !c->opts.very_degraded) {
+ mutex_lock(&c->sb_lock);
+
+ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
+ if (!bch2_dev_exists(c->disk_sb.sb, i))
+ continue;
+
+ ca = bch_dev_locked(c, i);
+
+ if (!bch2_dev_is_online(ca) &&
+ (ca->mi.state == BCH_MEMBER_STATE_rw ||
+ ca->mi.state == BCH_MEMBER_STATE_ro)) {
+ mutex_unlock(&c->sb_lock);
+ return false;
+ }
+ }
+ mutex_unlock(&c->sb_lock);
+ }
+
+ return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true);
+}
+
+static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
+{
+ /*
+ * The allocator thread itself allocates btree nodes, so stop it first:
+ */
+ bch2_dev_allocator_remove(c, ca);
+ bch2_dev_journal_stop(&c->journal, ca);
+}
+
+static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
+{
+ lockdep_assert_held(&c->state_lock);
+
+ BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw);
+
+ bch2_dev_allocator_add(c, ca);
+ bch2_recalc_capacity(c);
+}
+
+int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
+ enum bch_member_state new_state, int flags)
+{
+ struct bch_member *m;
+ int ret = 0;
+
+ if (ca->mi.state == new_state)
+ return 0;
+
+ if (!bch2_dev_state_allowed(c, ca, new_state, flags))
+ return -BCH_ERR_device_state_not_allowed;
+
+ if (new_state != BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_only(c, ca);
+
+ bch_notice(ca, "%s", bch2_member_states[new_state]);
+
+ mutex_lock(&c->sb_lock);
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ SET_BCH_MEMBER_STATE(m, new_state);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ if (new_state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
+
+ rebalance_wakeup(c);
+
+ return ret;
+}
+
+int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
+ enum bch_member_state new_state, int flags)
+{
+ int ret;
+
+ down_write(&c->state_lock);
+ ret = __bch2_dev_set_state(c, ca, new_state, flags);
+ up_write(&c->state_lock);
+
+ return ret;
+}
+
+/* Device add/removal: */
+
+static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct bpos start = POS(ca->dev_idx, 0);
+ struct bpos end = POS(ca->dev_idx, U64_MAX);
+ int ret;
+
+ /*
+ * We clear the LRU and need_discard btrees first so that we don't race
+ * with bch2_do_invalidates() and bch2_do_discards()
+ */
+ ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
+ BTREE_TRIGGER_NORUN, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
+ BTREE_TRIGGER_NORUN, NULL);
+ if (ret)
+ bch_err_msg(c, ret, "removing dev alloc info");
+
+ return ret;
+}
+
+int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
+{
+ struct bch_member *m;
+ unsigned dev_idx = ca->dev_idx, data;
+ int ret;
+
+ down_write(&c->state_lock);
+
+ /*
+ * We consume a reference to ca->ref, regardless of whether we succeed
+ * or fail:
+ */
+ percpu_ref_put(&ca->ref);
+
+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
+ bch_err(ca, "Cannot remove without losing data");
+ ret = -BCH_ERR_device_state_not_allowed;
+ goto err;
+ }
+
+ __bch2_dev_read_only(c, ca);
+
+ ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
+ if (ret) {
+ bch_err_msg(ca, ret, "dropping data");
+ goto err;
+ }
+
+ ret = bch2_dev_remove_alloc(c, ca);
+ if (ret) {
+ bch_err_msg(ca, ret, "deleting alloc info");
+ goto err;
+ }
+
+ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
+ if (ret) {
+ bch_err_msg(ca, ret, "flushing journal");
+ goto err;
+ }
+
+ ret = bch2_journal_flush(&c->journal);
+ if (ret) {
+ bch_err(ca, "journal error");
+ goto err;
+ }
+
+ ret = bch2_replicas_gc2(c);
+ if (ret) {
+ bch_err_msg(ca, ret, "in replicas_gc2()");
+ goto err;
+ }
+
+ data = bch2_dev_has_data(c, ca);
+ if (data) {
+ struct printbuf data_has = PRINTBUF;
+
+ prt_bitflags(&data_has, bch2_data_types, data);
+ bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
+ printbuf_exit(&data_has);
+ ret = -EBUSY;
+ goto err;
+ }
+
+ __bch2_dev_offline(c, ca);
+
+ mutex_lock(&c->sb_lock);
+ rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
+ mutex_unlock(&c->sb_lock);
+
+ percpu_ref_kill(&ca->ref);
+ wait_for_completion(&ca->ref_completion);
+
+ bch2_dev_free(ca);
+
+ /*
+ * At this point the device object has been removed in-core, but the
+ * on-disk journal might still refer to the device index via sb device
+ * usage entries. Recovery fails if it sees usage information for an
+ * invalid device. Flush journal pins to push the back of the journal
+ * past now invalid device index references before we update the
+ * superblock, but after the device object has been removed so any
+ * further journal writes elide usage info for the device.
+ */
+ bch2_journal_flush_all_pins(&c->journal);
+
+ /*
+ * Free this device's slot in the bch_member array - all pointers to
+ * this device must be gone:
+ */
+ mutex_lock(&c->sb_lock);
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
+ memset(&m->uuid, 0, sizeof(m->uuid));
+
+ bch2_write_super(c);
+
+ mutex_unlock(&c->sb_lock);
+ up_write(&c->state_lock);
+
+ bch2_dev_usage_journal_reserve(c);
+ return 0;
+err:
+ if (ca->mi.state == BCH_MEMBER_STATE_rw &&
+ !percpu_ref_is_zero(&ca->io_ref))
+ __bch2_dev_read_write(c, ca);
+ up_write(&c->state_lock);
+ return ret;
+}
+
+/* Add new device to running filesystem: */
+int bch2_dev_add(struct bch_fs *c, const char *path)
+{
+ struct bch_opts opts = bch2_opts_empty();
+ struct bch_sb_handle sb;
+ struct bch_dev *ca = NULL;
+ struct bch_sb_field_members_v2 *mi;
+ struct bch_member dev_mi;
+ unsigned dev_idx, nr_devices, u64s;
+ struct printbuf errbuf = PRINTBUF;
+ struct printbuf label = PRINTBUF;
+ int ret;
+
+ ret = bch2_read_super(path, &opts, &sb);
+ if (ret) {
+ bch_err_msg(c, ret, "reading super");
+ goto err;
+ }
+
+ dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
+
+ if (BCH_MEMBER_GROUP(&dev_mi)) {
+ bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
+ if (label.allocation_failure) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ }
+
+ ret = bch2_dev_may_add(sb.sb, c);
+ if (ret) {
+ bch_err_fn(c, ret);
+ goto err;
+ }
+
+ ca = __bch2_dev_alloc(c, &dev_mi);
+ if (!ca) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ bch2_dev_usage_init(ca);
+
+ ret = __bch2_dev_attach_bdev(ca, &sb);
+ if (ret)
+ goto err;
+
+ ret = bch2_dev_journal_alloc(ca);
+ if (ret) {
+ bch_err_msg(c, ret, "allocating journal");
+ goto err;
+ }
+
+ down_write(&c->state_lock);
+ mutex_lock(&c->sb_lock);
+
+ ret = bch2_sb_from_fs(c, ca);
+ if (ret) {
+ bch_err_msg(c, ret, "setting up new superblock");
+ goto err_unlock;
+ }
+
+ if (dynamic_fault("bcachefs:add:no_slot"))
+ goto no_slot;
+
+ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
+ if (!bch2_dev_exists(c->disk_sb.sb, dev_idx))
+ goto have_slot;
+no_slot:
+ ret = -BCH_ERR_ENOSPC_sb_members;
+ bch_err_msg(c, ret, "setting up new superblock");
+ goto err_unlock;
+
+have_slot:
+ nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
+
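+ /*
+ * Resize the members_v2 section to hold nr_devices entries; superblock
+ * field sizes are in units of u64s, hence the round up:
+ */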
+ mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
+ le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
+
+ mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
+ if (!mi) {
+ ret = -BCH_ERR_ENOSPC_sb_members;
+ bch_err_msg(c, ret, "setting up new superblock");
+ goto err_unlock;
+ }
+ struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
+
+ /* success: */
+
+ *m = dev_mi;
+ m->last_mount = cpu_to_le64(ktime_get_real_seconds());
+ c->disk_sb.sb->nr_devices = nr_devices;
+
+ ca->disk_sb.sb->dev_idx = dev_idx;
+ bch2_dev_attach(c, ca, dev_idx);
+
+ if (BCH_MEMBER_GROUP(&dev_mi)) {
+ ret = __bch2_dev_group_set(c, ca, label.buf);
+ if (ret) {
+ bch_err_msg(c, ret, "creating new label");
+ goto err_unlock;
+ }
+ }
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ bch2_dev_usage_journal_reserve(c);
+
+ ret = bch2_trans_mark_dev_sb(c, ca);
+ if (ret) {
+ bch_err_msg(ca, ret, "marking new superblock");
+ goto err_late;
+ }
+
+ ret = bch2_fs_freespace_init(c);
+ if (ret) {
+ bch_err_msg(ca, ret, "initializing free space");
+ goto err_late;
+ }
+
+ ca->new_fs_bucket_idx = 0;
+
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
+
+ up_write(&c->state_lock);
+ return 0;
+
+err_unlock:
+ mutex_unlock(&c->sb_lock);
+ up_write(&c->state_lock);
+err:
+ if (ca)
+ bch2_dev_free(ca);
+ bch2_free_super(&sb);
+ printbuf_exit(&label);
+ printbuf_exit(&errbuf);
+ return ret;
+err_late:
+ up_write(&c->state_lock);
+ ca = NULL;
+ goto err;
+}
+
+/* Hot add existing device to running filesystem: */
+int bch2_dev_online(struct bch_fs *c, const char *path)
+{
+ struct bch_opts opts = bch2_opts_empty();
+ struct bch_sb_handle sb = { NULL };
+ struct bch_dev *ca;
+ unsigned dev_idx;
+ int ret;
+
+ down_write(&c->state_lock);
+
+ ret = bch2_read_super(path, &opts, &sb);
+ if (ret) {
+ up_write(&c->state_lock);
+ return ret;
+ }
+
+ dev_idx = sb.sb->dev_idx;
+
+ ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
+ if (ret) {
+ bch_err_msg(c, ret, "bringing %s online", path);
+ goto err;
+ }
+
+ ret = bch2_dev_attach_bdev(c, &sb);
+ if (ret)
+ goto err;
+
+ ca = bch_dev_locked(c, dev_idx);
+
+ ret = bch2_trans_mark_dev_sb(c, ca);
+ if (ret) {
+ bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
+ goto err;
+ }
+
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
+
+ if (!ca->mi.freespace_initialized) {
+ ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
+ bch_err_msg(ca, ret, "initializing free space");
+ if (ret)
+ goto err;
+ }
+
+ if (!ca->journal.nr) {
+ ret = bch2_dev_journal_alloc(ca);
+ bch_err_msg(ca, ret, "allocating journal");
+ if (ret)
+ goto err;
+ }
+
+ mutex_lock(&c->sb_lock);
+ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
+ cpu_to_le64(ktime_get_real_seconds());
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ up_write(&c->state_lock);
+ return 0;
+err:
+ up_write(&c->state_lock);
+ bch2_free_super(&sb);
+ return ret;
+}
+
+int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
+{
+ down_write(&c->state_lock);
+
+ if (!bch2_dev_is_online(ca)) {
+ bch_err(ca, "Already offline");
+ up_write(&c->state_lock);
+ return 0;
+ }
+
+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
+ bch_err(ca, "Cannot offline required disk");
+ up_write(&c->state_lock);
+ return -BCH_ERR_device_state_not_allowed;
+ }
+
+ __bch2_dev_offline(c, ca);
+
+ up_write(&c->state_lock);
+ return 0;
+}
+
+int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
+{
+ struct bch_member *m;
+ u64 old_nbuckets;
+ int ret = 0;
+
+ down_write(&c->state_lock);
+ old_nbuckets = ca->mi.nbuckets;
+
+ if (nbuckets < ca->mi.nbuckets) {
+ bch_err(ca, "Cannot shrink yet");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (bch2_dev_is_online(ca) &&
+ get_capacity(ca->disk_sb.bdev->bd_disk) <
+ ca->mi.bucket_size * nbuckets) {
+ bch_err(ca, "New size larger than device");
+ ret = -BCH_ERR_device_size_too_small;
+ goto err;
+ }
+
+ ret = bch2_dev_buckets_resize(c, ca, nbuckets);
+ if (ret) {
+ bch_err_msg(ca, ret, "resizing buckets");
+ goto err;
+ }
+
+ ret = bch2_trans_mark_dev_sb(c, ca);
+ if (ret)
+ goto err;
+
+ mutex_lock(&c->sb_lock);
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ m->nbuckets = cpu_to_le64(nbuckets);
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ if (ca->mi.freespace_initialized) {
+ ret = bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
+ if (ret)
+ goto err;
+
+ /*
+ * XXX: this is all wrong transactionally - we'll be able to do
+ * this correctly after the disk space accounting rewrite
+ */
+ ca->usage_base->d[BCH_DATA_free].buckets += nbuckets - old_nbuckets;
+ }
+
+ bch2_recalc_capacity(c);
+err:
+ up_write(&c->state_lock);
+ return ret;
+}
+
+/* return with ref on ca->ref: */
+struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL)
+ if (!strcmp(name, ca->name))
+ goto found;
+ ca = ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
+found:
+ rcu_read_unlock();
+
+ return ca;
+}
+
+/* Filesystem open: */
+
+struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
+ struct bch_opts opts)
+{
+ DARRAY(struct bch_sb_handle) sbs = { 0 };
+ struct bch_fs *c = NULL;
+ struct bch_sb_handle *sb, *best = NULL;
+ struct printbuf errbuf = PRINTBUF;
+ int ret = 0;
+
+ if (!try_module_get(THIS_MODULE))
+ return ERR_PTR(-ENODEV);
+
+ if (!nr_devices) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = darray_make_room(&sbs, nr_devices);
+ if (ret)
+ goto err;
+
+ for (unsigned i = 0; i < nr_devices; i++) {
+ struct bch_sb_handle sb = { NULL };
+
+ ret = bch2_read_super(devices[i], &opts, &sb);
+ if (ret)
+ goto err;
+
+ BUG_ON(darray_push(&sbs, sb));
+ }
+
+ darray_for_each(sbs, sb)
+ if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq))
+ best = sb;
+
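+ /*
+ * Drop any devices the newest superblock says have been removed;
+ * darray_remove_item() shifts the array down, so adjust best if it
+ * pointed past the removed entry:
+ */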
+ darray_for_each_reverse(sbs, sb) {
+ if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) {
+ pr_info("%pg has been removed, skipping", sb->bdev);
+ bch2_free_super(sb);
+ darray_remove_item(&sbs, sb);
+ best -= best > sb;
+ continue;
+ }
+
+ ret = bch2_dev_in_fs(best->sb, sb->sb);
+ if (ret)
+ goto err_print;
+ }
+
+ c = bch2_fs_alloc(best->sb, opts);
+ ret = PTR_ERR_OR_ZERO(c);
+ if (ret)
+ goto err;
+
+ down_write(&c->state_lock);
+ darray_for_each(sbs, sb) {
+ ret = bch2_dev_attach_bdev(c, sb);
+ if (ret) {
+ up_write(&c->state_lock);
+ goto err;
+ }
+ }
+ up_write(&c->state_lock);
+
+ if (!bch2_fs_may_start(c)) {
+ ret = -BCH_ERR_insufficient_devices_to_start;
+ goto err_print;
+ }
+
+ if (!c->opts.nostart) {
+ ret = bch2_fs_start(c);
+ if (ret)
+ goto err;
+ }
+out:
+ darray_for_each(sbs, sb)
+ bch2_free_super(sb);
+ darray_exit(&sbs);
+ printbuf_exit(&errbuf);
+ module_put(THIS_MODULE);
+ return c;
+err_print:
+ pr_err("bch_fs_open err opening %s: %s",
+ devices[0], bch2_err_str(ret));
+err:
+ if (!IS_ERR_OR_NULL(c))
+ bch2_fs_stop(c);
+ c = ERR_PTR(ret);
+ goto out;
+}
+
+/* Global interfaces/init */
+
+static void bcachefs_exit(void)
+{
+ bch2_debug_exit();
+ bch2_vfs_exit();
+ bch2_chardev_exit();
+ bch2_btree_key_cache_exit();
+ if (bcachefs_kset)
+ kset_unregister(bcachefs_kset);
+}
+
+static int __init bcachefs_init(void)
+{
+ bch2_bkey_pack_test();
+
+ if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
+ bch2_btree_key_cache_init() ||
+ bch2_chardev_init() ||
+ bch2_vfs_init() ||
+ bch2_debug_init())
+ goto err;
+
+ return 0;
+err:
+ bcachefs_exit();
+ return -ENOMEM;
+}
+
+#define BCH_DEBUG_PARAM(name, description) \
+ bool bch2_##name; \
+ module_param_named(name, bch2_##name, bool, 0644); \
+ MODULE_PARM_DESC(name, description);
+BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+__maybe_unused
+static unsigned bch2_metadata_version = bcachefs_metadata_version_current;
+module_param_named(version, bch2_metadata_version, uint, 0400);
+
+module_exit(bcachefs_exit);
+module_init(bcachefs_init);
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
new file mode 100644
index 000000000000..bf762df18012
--- /dev/null
+++ b/fs/bcachefs/super.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUPER_H
+#define _BCACHEFS_SUPER_H
+
+#include "extents.h"
+
+#include "bcachefs_ioctl.h"
+
+#include <linux/math64.h>
+
+struct bch_fs *bch2_dev_to_fs(dev_t);
+struct bch_fs *bch2_uuid_to_fs(__uuid_t);
+
+bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
+ enum bch_member_state, int);
+int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
+ enum bch_member_state, int);
+int bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
+ enum bch_member_state, int);
+
+int bch2_dev_fail(struct bch_dev *, int);
+int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int);
+int bch2_dev_add(struct bch_fs *, const char *);
+int bch2_dev_online(struct bch_fs *, const char *);
+int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
+int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
+struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
+
+bool bch2_fs_emergency_read_only(struct bch_fs *);
+void bch2_fs_read_only(struct bch_fs *);
+
+int bch2_fs_read_write(struct bch_fs *);
+int bch2_fs_read_write_early(struct bch_fs *);
+
+/*
+ * Only for use in the recovery/fsck path:
+ */
+static inline void bch2_fs_lazy_rw(struct bch_fs *c)
+{
+ if (!test_bit(BCH_FS_RW, &c->flags) &&
+ !test_bit(BCH_FS_WAS_RW, &c->flags))
+ bch2_fs_read_write_early(c);
+}
+
+void __bch2_fs_stop(struct bch_fs *);
+void bch2_fs_free(struct bch_fs *);
+void bch2_fs_stop(struct bch_fs *);
+
+int bch2_fs_start(struct bch_fs *);
+struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
+
+#endif /* _BCACHEFS_SUPER_H */
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
new file mode 100644
index 000000000000..9c1fd4ca2b10
--- /dev/null
+++ b/fs/bcachefs/super_types.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUPER_TYPES_H
+#define _BCACHEFS_SUPER_TYPES_H
+
+struct bch_sb_handle {
+ struct bch_sb *sb;
+ struct block_device *bdev;
+ char *sb_name;
+ struct bio *bio;
+ void *holder;
+ size_t buffer_size;
+ blk_mode_t mode;
+ unsigned have_layout:1;
+ unsigned have_bio:1;
+ unsigned fs_sb:1;
+ u64 seq;
+};
+
+struct bch_devs_mask {
+ unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
+};
+
+struct bch_devs_list {
+ u8 nr;
+ u8 devs[BCH_BKEY_PTRS_MAX];
+};
+
+struct bch_member_cpu {
+ u64 nbuckets; /* device size */
+ u16 first_bucket; /* index of first bucket used */
+ u16 bucket_size; /* sectors */
+ u16 group;
+ u8 state;
+ u8 discard;
+ u8 data_allowed;
+ u8 durability;
+ u8 freespace_initialized;
+ u8 valid;
+};
+
+#endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
new file mode 100644
index 000000000000..f3cb7115b530
--- /dev/null
+++ b/fs/bcachefs/sysfs.c
@@ -0,0 +1,1034 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * bcache sysfs interfaces
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#ifndef NO_BCACHEFS_SYSFS
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "sysfs.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_gc.h"
+#include "buckets.h"
+#include "clock.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "inode.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "movinggc.h"
+#include "nocow_locking.h"
+#include "opts.h"
+#include "rebalance.h"
+#include "replicas.h"
+#include "super-io.h"
+#include "tests.h"
+
+#include <linux/blkdev.h>
+#include <linux/sort.h>
+#include <linux/sched/clock.h>
+
+#include "util.h"
+
+#define SYSFS_OPS(type) \
+const struct sysfs_ops type ## _sysfs_ops = { \
+ .show = type ## _show, \
+ .store = type ## _store \
+}
+
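+/*
+ * SHOW() and STORE() generate the sysfs show/store methods: the show path
+ * renders into a printbuf and copies at most one page out, and both paths
+ * map internal error codes to standard errnos via bch2_err_class():
+ */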
+#define SHOW(fn) \
+static ssize_t fn ## _to_text(struct printbuf *, \
+ struct kobject *, struct attribute *); \
+ \
+static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
+ char *buf) \
+{ \
+ struct printbuf out = PRINTBUF; \
+ ssize_t ret = fn ## _to_text(&out, kobj, attr); \
+ \
+ if (out.pos && out.buf[out.pos - 1] != '\n') \
+ prt_newline(&out); \
+ \
+ if (!ret && out.allocation_failure) \
+ ret = -ENOMEM; \
+ \
+ if (!ret) { \
+ ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \
+ memcpy(buf, out.buf, ret); \
+ } \
+ printbuf_exit(&out); \
+ return bch2_err_class(ret); \
+} \
+ \
+static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\
+ struct attribute *attr)
+
+#define STORE(fn) \
+static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\
+ const char *, size_t); \
+ \
+static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
+ const char *buf, size_t size) \
+{ \
+ return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \
+} \
+ \
+static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\
+ const char *buf, size_t size)
+
+#define __sysfs_attribute(_name, _mode) \
+ static struct attribute sysfs_##_name = \
+ { .name = #_name, .mode = _mode }
+
+#define write_attribute(n) __sysfs_attribute(n, 0200)
+#define read_attribute(n) __sysfs_attribute(n, 0444)
+#define rw_attribute(n) __sysfs_attribute(n, 0644)
+
+#define sysfs_printf(file, fmt, ...) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ prt_printf(out, fmt "\n", __VA_ARGS__); \
+} while (0)
+
+#define sysfs_print(file, var) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ snprint(out, var); \
+} while (0)
+
+#define sysfs_hprint(file, val) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ prt_human_readable_s64(out, val); \
+} while (0)
+
+#define sysfs_strtoul(file, var) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ return strtoul_safe(buf, var) ?: (ssize_t) size; \
+} while (0)
+
+#define sysfs_strtoul_clamp(file, var, min, max) \
+do { \
+ if (attr == &sysfs_ ## file) \
+ return strtoul_safe_clamp(buf, var, min, max) \
+ ?: (ssize_t) size; \
+} while (0)
+
+#define strtoul_or_return(cp) \
+({ \
+ unsigned long _v; \
+ int _r = kstrtoul(cp, 10, &_v); \
+ if (_r) \
+ return _r; \
+ _v; \
+})
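
Editorial note on the helpers above: SHOW()/STORE() paste the kobject type name onto a generated show/store wrapper plus the _to_text()/_store_inner() worker it forwards to, and strtoul_or_return() is a GNU statement expression that returns from the enclosing store function when parsing fails. A minimal userspace sketch of the same two tricks follows; the names are hypothetical, and only the standard C library plus GNU extensions (as used throughout the kernel) are assumed.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Generate fn_show() plus the fn_to_text() worker it forwards to: */
    #define SHOW(fn)                                                        \
    static int fn ## _to_text(char *buf, size_t len, const char *attr);    \
                                                                            \
    static int fn ## _show(const char *attr, char *buf, size_t len)        \
    {                                                                       \
        int ret = fn ## _to_text(buf, len, attr);                           \
        return ret < 0 ? ret : (int) strlen(buf);                           \
    }                                                                       \
                                                                            \
    static int fn ## _to_text(char *buf, size_t len, const char *attr)

    /* Parse a decimal value or return early from the *calling* function: */
    #define strtoul_or_return(cp)                                           \
    ({                                                                      \
        char *_end;                                                         \
        unsigned long _v = strtoul(cp, &_end, 10);                          \
        if (_end == (cp))                                                   \
            return -1;                                                      \
        _v;                                                                 \
    })

    SHOW(demo)
    {
        if (!strcmp(attr, "answer"))
            return snprintf(buf, len, "42\n");
        return -1;
    }

    static int demo_store(const char *buf)
    {
        unsigned long v = strtoul_or_return(buf);

        printf("stored %lu\n", v);
        return 0;
    }

    int main(void)
    {
        char buf[64];

        if (demo_show("answer", buf, sizeof(buf)) > 0)
            fputs(buf, stdout);             /* prints "42" */

        demo_store("123");                  /* prints "stored 123" */
        if (demo_store("junk") < 0)         /* parse failure returns early */
            fputs("parse error\n", stderr);
        return 0;
    }
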
+
+write_attribute(trigger_gc);
+write_attribute(trigger_discards);
+write_attribute(trigger_invalidates);
+write_attribute(prune_cache);
+write_attribute(btree_wakeup);
+rw_attribute(btree_gc_periodic);
+rw_attribute(gc_gens_pos);
+
+read_attribute(uuid);
+read_attribute(minor);
+read_attribute(bucket_size);
+read_attribute(first_bucket);
+read_attribute(nbuckets);
+rw_attribute(durability);
+read_attribute(io_done);
+read_attribute(io_errors);
+write_attribute(io_errors_reset);
+
+read_attribute(io_latency_read);
+read_attribute(io_latency_write);
+read_attribute(io_latency_stats_read);
+read_attribute(io_latency_stats_write);
+read_attribute(congested);
+
+read_attribute(btree_write_stats);
+
+read_attribute(btree_cache_size);
+read_attribute(compression_stats);
+read_attribute(journal_debug);
+read_attribute(btree_updates);
+read_attribute(btree_cache);
+read_attribute(btree_key_cache);
+read_attribute(stripes_heap);
+read_attribute(open_buckets);
+read_attribute(open_buckets_partial);
+read_attribute(write_points);
+read_attribute(nocow_lock_table);
+
+#ifdef BCH_WRITE_REF_DEBUG
+read_attribute(write_refs);
+
+static const char * const bch2_write_refs[] = {
+#define x(n) #n,
+ BCH_WRITE_REFS()
+#undef x
+ NULL
+};
+
+static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ bch2_printbuf_tabstop_push(out, 24);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) {
+ prt_str(out, bch2_write_refs[i]);
+ prt_tab(out);
+ prt_printf(out, "%li", atomic_long_read(&c->writes[i]));
+ prt_newline(out);
+ }
+}
+#endif
+
+read_attribute(internal_uuid);
+read_attribute(disk_groups);
+
+read_attribute(has_data);
+read_attribute(alloc_debug);
+
+#define x(t, n, ...) read_attribute(t);
+BCH_PERSISTENT_COUNTERS()
+#undef x
+
+rw_attribute(discard);
+rw_attribute(label);
+
+rw_attribute(copy_gc_enabled);
+read_attribute(copy_gc_wait);
+
+rw_attribute(rebalance_enabled);
+sysfs_pd_controller_attribute(rebalance);
+read_attribute(rebalance_status);
+rw_attribute(promote_whole_extents);
+
+read_attribute(new_stripes);
+
+read_attribute(io_timers_read);
+read_attribute(io_timers_write);
+
+read_attribute(moving_ctxts);
+
+#ifdef CONFIG_BCACHEFS_TESTS
+write_attribute(perf_test);
+#endif /* CONFIG_BCACHEFS_TESTS */
+
+#define x(_name) \
+ static struct attribute sysfs_time_stat_##_name = \
+ { .name = #_name, .mode = 0444 };
+ BCH_TIME_STATS()
+#undef x
+
+static struct attribute sysfs_state_rw = {
+ .name = "state",
+ .mode = 0444,
+};
+
+static size_t bch2_btree_cache_size(struct bch_fs *c)
+{
+ size_t ret = 0;
+ struct btree *b;
+
+ mutex_lock(&c->btree_cache.lock);
+ list_for_each_entry(b, &c->btree_cache.live, list)
+ ret += btree_bytes(c);
+
+ mutex_unlock(&c->btree_cache.lock);
+ return ret;
+}
+
+static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ enum btree_id id;
+ u64 nr_uncompressed_extents = 0,
+ nr_compressed_extents = 0,
+ nr_incompressible_extents = 0,
+ uncompressed_sectors = 0,
+ incompressible_sectors = 0,
+ compressed_sectors_compressed = 0,
+ compressed_sectors_uncompressed = 0;
+ int ret = 0;
+
+ if (!test_bit(BCH_FS_STARTED, &c->flags))
+ return -EPERM;
+
+ trans = bch2_trans_get(c);
+
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ if (!btree_type_has_ptrs(id))
+ continue;
+
+ ret = for_each_btree_key2(trans, iter, id, POS_MIN,
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ bool compressed = false, uncompressed = false, incompressible = false;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ switch (p.crc.compression_type) {
+ case BCH_COMPRESSION_TYPE_none:
+ uncompressed = true;
+ uncompressed_sectors += k.k->size;
+ break;
+ case BCH_COMPRESSION_TYPE_incompressible:
+ incompressible = true;
+ incompressible_sectors += k.k->size;
+ break;
+ default:
+ compressed_sectors_compressed +=
+ p.crc.compressed_size;
+ compressed_sectors_uncompressed +=
+ p.crc.uncompressed_size;
+ compressed = true;
+ break;
+ }
+ }
+
+ if (incompressible)
+ nr_incompressible_extents++;
+ else if (uncompressed)
+ nr_uncompressed_extents++;
+ else if (compressed)
+ nr_compressed_extents++;
+ 0;
+ }));
+ }
+
+ bch2_trans_put(trans);
+
+ if (ret)
+ return ret;
+
+ prt_printf(out, "uncompressed:\n");
+ prt_printf(out, " nr extents: %llu\n", nr_uncompressed_extents);
+ prt_printf(out, " size: ");
+ prt_human_readable_u64(out, uncompressed_sectors << 9);
+ prt_printf(out, "\n");
+
+ prt_printf(out, "compressed:\n");
+ prt_printf(out, " nr extents: %llu\n", nr_compressed_extents);
+ prt_printf(out, " compressed size: ");
+ prt_human_readable_u64(out, compressed_sectors_compressed << 9);
+ prt_printf(out, "\n");
+ prt_printf(out, " uncompressed size: ");
+ prt_human_readable_u64(out, compressed_sectors_uncompressed << 9);
+ prt_printf(out, "\n");
+
+ prt_printf(out, "incompressible:\n");
+ prt_printf(out, " nr extents: %llu\n", nr_incompressible_extents);
+ prt_printf(out, " size: ");
+ prt_human_readable_u64(out, incompressible_sectors << 9);
+ prt_printf(out, "\n");
+ return 0;
+}
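
A note on the accounting above: each extent is classified by the compression type of its pointers, and sector counts are kept both as stored on disk (crc.compressed_size) and as logical data (crc.uncompressed_size); the report then shifts by 9 to turn 512-byte sectors into bytes. A standalone sketch of that arithmetic, using made-up totals rather than anything read from a filesystem:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* Hypothetical totals, in 512-byte sectors: */
        uint64_t compressed_sectors_compressed   = 1500;    /* on-disk size */
        uint64_t compressed_sectors_uncompressed = 4096;    /* logical size */

        printf("compressed size:   %llu bytes\n",
               (unsigned long long)(compressed_sectors_compressed << 9));
        printf("uncompressed size: %llu bytes\n",
               (unsigned long long)(compressed_sectors_uncompressed << 9));
        printf("compression ratio: %.2fx\n",
               (double)compressed_sectors_uncompressed /
               (double)compressed_sectors_compressed);
        return 0;
    }
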
+
+static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ prt_printf(out, "%s: ", bch2_btree_id_str(c->gc_gens_btree));
+ bch2_bpos_to_text(out, c->gc_gens_pos);
+ prt_printf(out, "\n");
+}
+
+static void bch2_btree_wakeup_all(struct bch_fs *c)
+{
+ struct btree_trans *trans;
+
+ seqmutex_lock(&c->btree_trans_lock);
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ struct btree_bkey_cached_common *b = READ_ONCE(trans->locking);
+
+ if (b)
+ six_lock_wakeup_all(&b->lock);
+
+ }
+ seqmutex_unlock(&c->btree_trans_lock);
+}
+
+SHOW(bch2_fs)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
+
+ sysfs_print(minor, c->minor);
+ sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
+
+ sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
+
+ if (attr == &sysfs_btree_write_stats)
+ bch2_btree_write_stats_to_text(out, c);
+
+ sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
+
+ if (attr == &sysfs_gc_gens_pos)
+ bch2_gc_gens_pos_to_text(out, c);
+
+ sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
+
+ sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled);
+ sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */
+
+ if (attr == &sysfs_copy_gc_wait)
+ bch2_copygc_wait_to_text(out, c);
+
+ if (attr == &sysfs_rebalance_status)
+ bch2_rebalance_status_to_text(out, c);
+
+ sysfs_print(promote_whole_extents, c->promote_whole_extents);
+
+ /* Debugging: */
+
+ if (attr == &sysfs_journal_debug)
+ bch2_journal_debug_to_text(out, &c->journal);
+
+ if (attr == &sysfs_btree_updates)
+ bch2_btree_updates_to_text(out, c);
+
+ if (attr == &sysfs_btree_cache)
+ bch2_btree_cache_to_text(out, c);
+
+ if (attr == &sysfs_btree_key_cache)
+ bch2_btree_key_cache_to_text(out, &c->btree_key_cache);
+
+ if (attr == &sysfs_stripes_heap)
+ bch2_stripes_heap_to_text(out, c);
+
+ if (attr == &sysfs_open_buckets)
+ bch2_open_buckets_to_text(out, c);
+
+ if (attr == &sysfs_open_buckets_partial)
+ bch2_open_buckets_partial_to_text(out, c);
+
+ if (attr == &sysfs_write_points)
+ bch2_write_points_to_text(out, c);
+
+ if (attr == &sysfs_compression_stats)
+ bch2_compression_stats_to_text(out, c);
+
+ if (attr == &sysfs_new_stripes)
+ bch2_new_stripes_to_text(out, c);
+
+ if (attr == &sysfs_io_timers_read)
+ bch2_io_timers_to_text(out, &c->io_clock[READ]);
+
+ if (attr == &sysfs_io_timers_write)
+ bch2_io_timers_to_text(out, &c->io_clock[WRITE]);
+
+ if (attr == &sysfs_moving_ctxts)
+ bch2_fs_moving_ctxts_to_text(out, c);
+
+#ifdef BCH_WRITE_REF_DEBUG
+ if (attr == &sysfs_write_refs)
+ bch2_write_refs_to_text(out, c);
+#endif
+
+ if (attr == &sysfs_nocow_lock_table)
+ bch2_nocow_locks_to_text(out, &c->nocow_locks);
+
+ if (attr == &sysfs_disk_groups)
+ bch2_disk_groups_to_text(out, c);
+
+ return 0;
+}
+
+STORE(bch2_fs)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
+
+ if (attr == &sysfs_btree_gc_periodic) {
+ ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
+ ?: (ssize_t) size;
+
+ wake_up_process(c->gc_thread);
+ return ret;
+ }
+
+ if (attr == &sysfs_copy_gc_enabled) {
+ ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled)
+ ?: (ssize_t) size;
+
+ if (c->copygc_thread)
+ wake_up_process(c->copygc_thread);
+ return ret;
+ }
+
+ if (attr == &sysfs_rebalance_enabled) {
+ ssize_t ret = strtoul_safe(buf, c->rebalance.enabled)
+ ?: (ssize_t) size;
+
+ rebalance_wakeup(c);
+ return ret;
+ }
+
+ sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
+
+ sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);
+
+ /* Debugging: */
+
+ if (!test_bit(BCH_FS_STARTED, &c->flags))
+ return -EPERM;
+
+ if (!test_bit(BCH_FS_RW, &c->flags))
+ return -EROFS;
+
+ if (attr == &sysfs_prune_cache) {
+ struct shrink_control sc;
+
+ sc.gfp_mask = GFP_KERNEL;
+ sc.nr_to_scan = strtoul_or_return(buf);
+ c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
+ }
+
+ if (attr == &sysfs_btree_wakeup)
+ bch2_btree_wakeup_all(c);
+
+ if (attr == &sysfs_trigger_gc) {
+ /*
+ * Full gc is currently incompatible with btree key cache:
+ */
+#if 0
+ down_read(&c->state_lock);
+ bch2_gc(c, false, false);
+ up_read(&c->state_lock);
+#else
+ bch2_gc_gens(c);
+#endif
+ }
+
+ if (attr == &sysfs_trigger_discards)
+ bch2_do_discards(c);
+
+ if (attr == &sysfs_trigger_invalidates)
+ bch2_do_invalidates(c);
+
+#ifdef CONFIG_BCACHEFS_TESTS
+ if (attr == &sysfs_perf_test) {
+ char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
+ char *test = strsep(&p, " \t\n");
+ char *nr_str = strsep(&p, " \t\n");
+ char *threads_str = strsep(&p, " \t\n");
+ unsigned threads;
+ u64 nr;
+ int ret = -EINVAL;
+
+ if (threads_str &&
+ !(ret = kstrtouint(threads_str, 10, &threads)) &&
+ !(ret = bch2_strtoull_h(nr_str, &nr)))
+ ret = bch2_btree_perf_test(c, test, nr, threads);
+ kfree(tmp);
+
+ if (ret)
+ size = ret;
+ }
+#endif
+ return size;
+}
+SYSFS_OPS(bch2_fs);
+
+struct attribute *bch2_fs_files[] = {
+ &sysfs_minor,
+ &sysfs_btree_cache_size,
+ &sysfs_btree_write_stats,
+
+ &sysfs_promote_whole_extents,
+
+ &sysfs_compression_stats,
+
+#ifdef CONFIG_BCACHEFS_TESTS
+ &sysfs_perf_test,
+#endif
+ NULL
+};
+
+/* counters dir */
+
+SHOW(bch2_fs_counters)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj);
+ u64 counter = 0;
+ u64 counter_since_mount = 0;
+
+ printbuf_tabstop_push(out, 32);
+
+ #define x(t, ...) \
+ if (attr == &sysfs_##t) { \
+ counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\
+ counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\
+ prt_printf(out, "since mount:"); \
+ prt_tab(out); \
+ prt_human_readable_u64(out, counter_since_mount); \
+ prt_newline(out); \
+ \
+ prt_printf(out, "since filesystem creation:"); \
+ prt_tab(out); \
+ prt_human_readable_u64(out, counter); \
+ prt_newline(out); \
+ }
+ BCH_PERSISTENT_COUNTERS()
+ #undef x
+ return 0;
+}
+
+STORE(bch2_fs_counters) {
+ return 0;
+}
+
+SYSFS_OPS(bch2_fs_counters);
+
+struct attribute *bch2_fs_counters_files[] = {
+#define x(t, ...) \
+ &sysfs_##t,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ NULL
+};
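
Everything in this counters directory is generated from the BCH_PERSISTENT_COUNTERS() x-macro: the same list expands into the read_attribute() declarations earlier in the file, into the show dispatch above, and into this attribute array, so adding a counter is a one-line change to the list. A self-contained illustration of the x-macro technique, using a generic list rather than the real bcachefs counter names:

    #include <stdio.h>

    #define DEMO_COUNTERS()         \
        x(reads)                    \
        x(writes)                   \
        x(flushes)

    /* Expansion 1: an enum of counter indices */
    enum demo_counter {
    #define x(n)    DEMO_COUNTER_##n,
        DEMO_COUNTERS()
    #undef x
        DEMO_COUNTER_NR,
    };

    /* Expansion 2: a matching table of names */
    static const char * const demo_counter_names[] = {
    #define x(n)    #n,
        DEMO_COUNTERS()
    #undef x
    };

    int main(void)
    {
        unsigned long counters[DEMO_COUNTER_NR] = { 10, 20, 3 };

        for (int i = 0; i < DEMO_COUNTER_NR; i++)
            printf("%-10s %lu\n", demo_counter_names[i], counters[i]);
        return 0;
    }

Because both expansions come from the one list, the enum and the name table cannot drift out of sync.
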
+/* internal dir - just a wrapper */
+
+SHOW(bch2_fs_internal)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
+
+ return bch2_fs_to_text(out, &c->kobj, attr);
+}
+
+STORE(bch2_fs_internal)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, internal);
+
+ return bch2_fs_store(&c->kobj, attr, buf, size);
+}
+SYSFS_OPS(bch2_fs_internal);
+
+struct attribute *bch2_fs_internal_files[] = {
+ &sysfs_journal_debug,
+ &sysfs_btree_updates,
+ &sysfs_btree_cache,
+ &sysfs_btree_key_cache,
+ &sysfs_new_stripes,
+ &sysfs_stripes_heap,
+ &sysfs_open_buckets,
+ &sysfs_open_buckets_partial,
+ &sysfs_write_points,
+#ifdef BCH_WRITE_REF_DEBUG
+ &sysfs_write_refs,
+#endif
+ &sysfs_nocow_lock_table,
+ &sysfs_io_timers_read,
+ &sysfs_io_timers_write,
+
+ &sysfs_trigger_gc,
+ &sysfs_trigger_discards,
+ &sysfs_trigger_invalidates,
+ &sysfs_prune_cache,
+ &sysfs_btree_wakeup,
+
+ &sysfs_gc_gens_pos,
+
+ &sysfs_copy_gc_enabled,
+ &sysfs_copy_gc_wait,
+
+ &sysfs_rebalance_enabled,
+ &sysfs_rebalance_status,
+ sysfs_pd_controller_files(rebalance),
+
+ &sysfs_moving_ctxts,
+
+ &sysfs_internal_uuid,
+
+ &sysfs_disk_groups,
+ NULL
+};
+
+/* options */
+
+SHOW(bch2_fs_opts_dir)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
+ const struct bch_option *opt = container_of(attr, struct bch_option, attr);
+ int id = opt - bch2_opt_table;
+ u64 v = bch2_opt_get_by_id(&c->opts, id);
+
+ bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST);
+ prt_char(out, '\n');
+
+ return 0;
+}
+
+STORE(bch2_fs_opts_dir)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
+ const struct bch_option *opt = container_of(attr, struct bch_option, attr);
+ int ret, id = opt - bch2_opt_table;
+ char *tmp;
+ u64 v;
+
+ /*
+ * We don't need to take c->writes for correctness, but it eliminates an
+ * unsightly error message in the dmesg log when we're RO:
+ */
+ if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)))
+ return -EROFS;
+
+ tmp = kstrdup(buf, GFP_KERNEL);
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL);
+ kfree(tmp);
+
+ if (ret < 0)
+ goto err;
+
+ ret = bch2_opt_check_may_set(c, id, v);
+ if (ret < 0)
+ goto err;
+
+ bch2_opt_set_sb(c, opt, v);
+ bch2_opt_set_by_id(&c->opts, id, v);
+
+ if ((id == Opt_background_target ||
+ id == Opt_background_compression) && v)
+ bch2_set_rebalance_needs_scan(c, 0);
+
+ ret = size;
+err:
+ bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
+ return ret;
+}
+SYSFS_OPS(bch2_fs_opts_dir);
+
+struct attribute *bch2_fs_opts_dir_files[] = { NULL };
+
+int bch2_opts_create_sysfs_files(struct kobject *kobj)
+{
+ const struct bch_option *i;
+ int ret;
+
+ for (i = bch2_opt_table;
+ i < bch2_opt_table + bch2_opts_nr;
+ i++) {
+ if (!(i->flags & OPT_FS))
+ continue;
+
+ ret = sysfs_create_file(kobj, &i->attr);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* time stats */
+
+SHOW(bch2_fs_time_stats)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
+
+#define x(name) \
+ if (attr == &sysfs_time_stat_##name) \
+ bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]);
+ BCH_TIME_STATS()
+#undef x
+
+ return 0;
+}
+
+STORE(bch2_fs_time_stats)
+{
+ return size;
+}
+SYSFS_OPS(bch2_fs_time_stats);
+
+struct attribute *bch2_fs_time_stats_files[] = {
+#define x(name) \
+ &sysfs_time_stat_##name,
+ BCH_TIME_STATS()
+#undef x
+ NULL
+};
+
+static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+ struct bch_dev_usage stats = bch2_dev_usage_read(ca);
+ unsigned i, nr[BCH_DATA_NR];
+
+ memset(nr, 0, sizeof(nr));
+
+ for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
+ nr[c->open_buckets[i].data_type]++;
+
+ printbuf_tabstop_push(out, 8);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+
+ prt_tab(out);
+ prt_str(out, "buckets");
+ prt_tab_rjust(out);
+ prt_str(out, "sectors");
+ prt_tab_rjust(out);
+ prt_str(out, "fragmented");
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ prt_str(out, bch2_data_types[i]);
+ prt_tab(out);
+ prt_u64(out, stats.d[i].buckets);
+ prt_tab_rjust(out);
+ prt_u64(out, stats.d[i].sectors);
+ prt_tab_rjust(out);
+ prt_u64(out, stats.d[i].fragmented);
+ prt_tab_rjust(out);
+ prt_newline(out);
+ }
+
+ prt_str(out, "ec");
+ prt_tab(out);
+ prt_u64(out, stats.buckets_ec);
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ prt_newline(out);
+
+ prt_printf(out, "reserves:");
+ prt_newline(out);
+ for (i = 0; i < BCH_WATERMARK_NR; i++) {
+ prt_str(out, bch2_watermarks[i]);
+ prt_tab(out);
+ prt_u64(out, bch2_dev_buckets_reserved(ca, i));
+ prt_tab_rjust(out);
+ prt_newline(out);
+ }
+
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 24);
+
+ prt_str(out, "freelist_wait");
+ prt_tab(out);
+ prt_str(out, c->freelist_wait.list.first ? "waiting" : "empty");
+ prt_newline(out);
+
+ prt_str(out, "open buckets allocated");
+ prt_tab(out);
+ prt_u64(out, OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
+ prt_newline(out);
+
+ prt_str(out, "open buckets this dev");
+ prt_tab(out);
+ prt_u64(out, ca->nr_open_buckets);
+ prt_newline(out);
+
+ prt_str(out, "open buckets total");
+ prt_tab(out);
+ prt_u64(out, OPEN_BUCKETS_COUNT);
+ prt_newline(out);
+
+ prt_str(out, "open_buckets_wait");
+ prt_tab(out);
+ prt_str(out, c->open_buckets_wait.list.first ? "waiting" : "empty");
+ prt_newline(out);
+
+ prt_str(out, "open_buckets_btree");
+ prt_tab(out);
+ prt_u64(out, nr[BCH_DATA_btree]);
+ prt_newline(out);
+
+ prt_str(out, "open_buckets_user");
+ prt_tab(out);
+ prt_u64(out, nr[BCH_DATA_user]);
+ prt_newline(out);
+
+ prt_str(out, "buckets_to_invalidate");
+ prt_tab(out);
+ prt_u64(out, should_invalidate_buckets(ca, stats));
+ prt_newline(out);
+
+ prt_str(out, "btree reserve cache");
+ prt_tab(out);
+ prt_u64(out, c->btree_reserve_cache_nr);
+ prt_newline(out);
+}
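
dev_alloc_debug_to_text() lays its table out with printbuf tabstops: printbuf_tabstop_push() fixes the column positions once, prt_tab() advances to the next stop, and prt_tab_rjust() right-justifies what was just emitted against that stop. Outside the kernel roughly the same layout falls out of printf field widths; a sketch with invented numbers, not the printbuf API:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        static const char *const types[] = { "free", "sb", "journal", "btree", "user" };
        /* Hypothetical per-data-type usage: buckets, sectors, fragmented */
        static const uint64_t d[][3] = {
            { 1000,       0,    0 },
            {    2,     512,    0 },
            {    8,    4096,    0 },
            {  120,   61440,  128 },
            { 3400, 1740800, 9216 },
        };

        printf("%-8s%16s%16s%16s\n", "", "buckets", "sectors", "fragmented");
        for (unsigned i = 0; i < sizeof(types) / sizeof(types[0]); i++)
            printf("%-8s%16llu%16llu%16llu\n", types[i],
                   (unsigned long long)d[i][0],
                   (unsigned long long)d[i][1],
                   (unsigned long long)d[i][2]);
        return 0;
    }
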
+
+static const char * const bch2_rw[] = {
+ "read",
+ "write",
+ NULL
+};
+
+static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca)
+{
+ int rw, i;
+
+ for (rw = 0; rw < 2; rw++) {
+ prt_printf(out, "%s:\n", bch2_rw[rw]);
+
+ for (i = 1; i < BCH_DATA_NR; i++)
+ prt_printf(out, "%-12s:%12llu\n",
+ bch2_data_types[i],
+ percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
+ }
+}
+
+SHOW(bch2_dev)
+{
+ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
+ struct bch_fs *c = ca->fs;
+
+ sysfs_printf(uuid, "%pU\n", ca->uuid.b);
+
+ sysfs_print(bucket_size, bucket_bytes(ca));
+ sysfs_print(first_bucket, ca->mi.first_bucket);
+ sysfs_print(nbuckets, ca->mi.nbuckets);
+ sysfs_print(durability, ca->mi.durability);
+ sysfs_print(discard, ca->mi.discard);
+
+ if (attr == &sysfs_label) {
+ if (ca->mi.group)
+ bch2_disk_path_to_text(out, c, ca->mi.group - 1);
+ prt_char(out, '\n');
+ }
+
+ if (attr == &sysfs_has_data) {
+ prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca));
+ prt_char(out, '\n');
+ }
+
+ if (attr == &sysfs_state_rw) {
+ prt_string_option(out, bch2_member_states, ca->mi.state);
+ prt_char(out, '\n');
+ }
+
+ if (attr == &sysfs_io_done)
+ dev_io_done_to_text(out, ca);
+
+ if (attr == &sysfs_io_errors)
+ bch2_dev_io_errors_to_text(out, ca);
+
+ sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ]));
+ sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
+
+ if (attr == &sysfs_io_latency_stats_read)
+ bch2_time_stats_to_text(out, &ca->io_latency[READ]);
+
+ if (attr == &sysfs_io_latency_stats_write)
+ bch2_time_stats_to_text(out, &ca->io_latency[WRITE]);
+
+ sysfs_printf(congested, "%u%%",
+ clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
+ * 100 / CONGESTED_MAX);
+
+ if (attr == &sysfs_alloc_debug)
+ dev_alloc_debug_to_text(out, ca);
+
+ return 0;
+}
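
The congested attribute above reports the device's congestion counter as a percentage: the raw atomic is clamped to [0, CONGESTED_MAX] and scaled by 100 / CONGESTED_MAX. A tiny check of that arithmetic; the CONGESTED_MAX value below is only a stand-in, the real constant is defined elsewhere in bcachefs:

    #include <stdio.h>

    #define DEMO_CONGESTED_MAX 1024     /* stand-in value for illustration */

    static unsigned congested_pct(int raw)
    {
        if (raw < 0)
            raw = 0;
        if (raw > DEMO_CONGESTED_MAX)
            raw = DEMO_CONGESTED_MAX;
        return raw * 100 / DEMO_CONGESTED_MAX;
    }

    int main(void)
    {
        /* prints "0% 25% 100%" */
        printf("%u%% %u%% %u%%\n",
               congested_pct(-5), congested_pct(256), congested_pct(5000));
        return 0;
    }
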
+
+STORE(bch2_dev)
+{
+ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
+ struct bch_fs *c = ca->fs;
+ struct bch_member *mi;
+
+ if (attr == &sysfs_discard) {
+ bool v = strtoul_or_return(buf);
+
+ mutex_lock(&c->sb_lock);
+ mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+
+ if (v != BCH_MEMBER_DISCARD(mi)) {
+ SET_BCH_MEMBER_DISCARD(mi, v);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+ }
+
+ if (attr == &sysfs_durability) {
+ u64 v = strtoul_or_return(buf);
+
+ mutex_lock(&c->sb_lock);
+ mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+
+ if (v + 1 != BCH_MEMBER_DURABILITY(mi)) {
+ SET_BCH_MEMBER_DURABILITY(mi, v + 1);
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+ }
+
+ if (attr == &sysfs_label) {
+ char *tmp;
+ int ret;
+
+ tmp = kstrdup(buf, GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ ret = bch2_dev_group_set(c, ca, strim(tmp));
+ kfree(tmp);
+ if (ret)
+ return ret;
+ }
+
+ if (attr == &sysfs_io_errors_reset)
+ bch2_dev_errors_reset(ca);
+
+ return size;
+}
+SYSFS_OPS(bch2_dev);
+
+struct attribute *bch2_dev_files[] = {
+ &sysfs_uuid,
+ &sysfs_bucket_size,
+ &sysfs_first_bucket,
+ &sysfs_nbuckets,
+ &sysfs_durability,
+
+ /* settings: */
+ &sysfs_discard,
+ &sysfs_state_rw,
+ &sysfs_label,
+
+ &sysfs_has_data,
+ &sysfs_io_done,
+ &sysfs_io_errors,
+ &sysfs_io_errors_reset,
+
+ &sysfs_io_latency_read,
+ &sysfs_io_latency_write,
+ &sysfs_io_latency_stats_read,
+ &sysfs_io_latency_stats_write,
+ &sysfs_congested,
+
+ /* debug: */
+ &sysfs_alloc_debug,
+ NULL
+};
+
+#endif /* NO_BCACHEFS_SYSFS */
diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h
new file mode 100644
index 000000000000..222cd5062702
--- /dev/null
+++ b/fs/bcachefs/sysfs.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SYSFS_H_
+#define _BCACHEFS_SYSFS_H_
+
+#include <linux/sysfs.h>
+
+#ifndef NO_BCACHEFS_SYSFS
+
+struct attribute;
+struct sysfs_ops;
+
+extern struct attribute *bch2_fs_files[];
+extern struct attribute *bch2_fs_counters_files[];
+extern struct attribute *bch2_fs_internal_files[];
+extern struct attribute *bch2_fs_opts_dir_files[];
+extern struct attribute *bch2_fs_time_stats_files[];
+extern struct attribute *bch2_dev_files[];
+
+extern const struct sysfs_ops bch2_fs_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_counters_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_internal_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
+extern const struct sysfs_ops bch2_dev_sysfs_ops;
+
+int bch2_opts_create_sysfs_files(struct kobject *);
+
+#else
+
+static struct attribute *bch2_fs_files[] = {};
+static struct attribute *bch2_fs_counters_files[] = {};
+static struct attribute *bch2_fs_internal_files[] = {};
+static struct attribute *bch2_fs_opts_dir_files[] = {};
+static struct attribute *bch2_fs_time_stats_files[] = {};
+static struct attribute *bch2_dev_files[] = {};
+
+static const struct sysfs_ops bch2_fs_sysfs_ops;
+static const struct sysfs_ops bch2_fs_counters_sysfs_ops;
+static const struct sysfs_ops bch2_fs_internal_sysfs_ops;
+static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
+static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
+static const struct sysfs_ops bch2_dev_sysfs_ops;
+
+static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; }
+
+#endif /* NO_BCACHEFS_SYSFS */
+
+#endif /* _BCACHEFS_SYSFS_H_ */
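
The header keeps callers oblivious to whether sysfs support is built: with NO_BCACHEFS_SYSFS defined, the attribute arrays become empty static definitions and bch2_opts_create_sysfs_files() becomes an inline no-op, so the rest of the filesystem compiles unchanged either way. The conditional-stub pattern in isolation, with hypothetical names:

    /* Build with -DNO_DEMO_SYSFS to get the no-op variant. */
    #include <stdio.h>

    #ifndef NO_DEMO_SYSFS

    static int demo_create_files(const char *dir)
    {
        printf("creating files under %s\n", dir);
        return 0;
    }

    #else

    static inline int demo_create_files(const char *dir) { return 0; }

    #endif

    int main(void)
    {
        return demo_create_files("/sys/fs/demo");
    }
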
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
new file mode 100644
index 000000000000..2fc9e60c754b
--- /dev/null
+++ b/fs/bcachefs/tests.c
@@ -0,0 +1,919 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifdef CONFIG_BCACHEFS_TESTS
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "journal_reclaim.h"
+#include "snapshot.h"
+#include "tests.h"
+
+#include <linux/kthread.h>
+#include <linux/random.h>
+
+static void delete_test_keys(struct bch_fs *c)
+{
+ int ret;
+
+ ret = bch2_btree_delete_range(c, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX),
+ POS(0, U64_MAX),
+ 0, NULL);
+ BUG_ON(ret);
+
+ ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX),
+ POS(0, U64_MAX),
+ 0, NULL);
+ BUG_ON(ret);
+}
+
+/* unit tests */
+
+static int test_delete(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_i_cookie k;
+ int ret;
+
+ bkey_cookie_init(&k.k_i);
+ k.k.p.snapshot = U32_MAX;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
+ BTREE_ITER_INTENT);
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, &k.k_i, 0));
+ bch_err_msg(c, ret, "update error");
+ if (ret)
+ goto err;
+
+ pr_info("deleting once");
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(trans, &iter, 0));
+ bch_err_msg(c, ret, "delete error (first)");
+ if (ret)
+ goto err;
+
+ pr_info("deleting twice");
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(trans, &iter, 0));
+ bch_err_msg(c, ret, "delete error (second)");
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
+}
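
test_delete() and the tests that follow wrap every btree update in commit_do(), which re-evaluates the expression it is given whenever the commit asks for a transaction restart and only surfaces other errors to the caller. Stripped of the btree machinery the idiom is a plain retry loop; a sketch with a made-up restart code, not the real bcachefs error numbers:

    #include <stdio.h>

    #define DEMO_RESTART 1024       /* hypothetical "please retry" code */

    static int attempts;

    static int do_update(void)
    {
        /* Fail twice with a restart, then succeed. */
        return ++attempts < 3 ? -DEMO_RESTART : 0;
    }

    static int commit_do_demo(int (*op)(void))
    {
        int ret;

        do {
            ret = op();
        } while (ret == -DEMO_RESTART);

        return ret;
    }

    int main(void)
    {
        int ret = commit_do_demo(do_update);

        printf("commit_do_demo: %d after %d attempts\n", ret, attempts);
        return 0;
    }
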
+
+static int test_delete_written(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_i_cookie k;
+ int ret;
+
+ bkey_cookie_init(&k.k_i);
+ k.k.p.snapshot = U32_MAX;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
+ BTREE_ITER_INTENT);
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, &k.k_i, 0));
+ bch_err_msg(c, ret, "update error");
+ if (ret)
+ goto err;
+
+ bch2_trans_unlock(trans);
+ bch2_journal_flush_all_pins(&c->journal);
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(trans, &iter, 0));
+ bch_err_msg(c, ret, "delete error");
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int test_iterate(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter = { NULL };
+ struct bkey_s_c k;
+ u64 i;
+ int ret = 0;
+
+ delete_test_keys(c);
+
+ pr_info("inserting test keys");
+
+ for (i = 0; i < nr; i++) {
+ struct bkey_i_cookie ck;
+
+ bkey_cookie_init(&ck.k_i);
+ ck.k.p.offset = i;
+ ck.k.p.snapshot = U32_MAX;
+
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
+ bch_err_msg(c, ret, "insert error");
+ if (ret)
+ goto err;
+ }
+
+ pr_info("iterating forwards");
+
+ i = 0;
+
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(k.k->p.offset != i++);
+ 0;
+ }));
+ bch_err_msg(c, ret, "error iterating forwards");
+ if (ret)
+ goto err;
+
+ BUG_ON(i != nr);
+
+ pr_info("iterating backwards");
+
+ ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, U64_MAX, U32_MAX), 0, k,
+ ({
+ BUG_ON(k.k->p.offset != --i);
+ 0;
+ }));
+ bch_err_msg(c, ret, "error iterating backwards");
+ if (ret)
+ goto err;
+
+ BUG_ON(i);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int test_iterate_extents(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter = { NULL };
+ struct bkey_s_c k;
+ u64 i;
+ int ret = 0;
+
+ delete_test_keys(c);
+
+ pr_info("inserting test extents");
+
+ for (i = 0; i < nr; i += 8) {
+ struct bkey_i_cookie ck;
+
+ bkey_cookie_init(&ck.k_i);
+ ck.k.p.offset = i + 8;
+ ck.k.p.snapshot = U32_MAX;
+ ck.k.size = 8;
+
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
+ bch_err_msg(c, ret, "insert error");
+ if (ret)
+ goto err;
+ }
+
+ pr_info("iterating forwards");
+
+ i = 0;
+
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(bkey_start_offset(k.k) != i);
+ i = k.k->p.offset;
+ 0;
+ }));
+ bch_err_msg(c, ret, "error iterating forwards");
+ if (ret)
+ goto err;
+
+ BUG_ON(i != nr);
+
+ pr_info("iterating backwards");
+
+ ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_extents,
+ SPOS(0, U64_MAX, U32_MAX), 0, k,
+ ({
+ BUG_ON(k.k->p.offset != i);
+ i = bkey_start_offset(k.k);
+ 0;
+ }));
+ bch_err_msg(c, ret, "error iterating backwards");
+ if (ret)
+ goto err;
+
+ BUG_ON(i);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int test_iterate_slots(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter = { NULL };
+ struct bkey_s_c k;
+ u64 i;
+ int ret = 0;
+
+ delete_test_keys(c);
+
+ pr_info("inserting test keys");
+
+ for (i = 0; i < nr; i++) {
+ struct bkey_i_cookie ck;
+
+ bkey_cookie_init(&ck.k_i);
+ ck.k.p.offset = i * 2;
+ ck.k.p.snapshot = U32_MAX;
+
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
+ bch_err_msg(c, ret, "insert error");
+ if (ret)
+ goto err;
+ }
+
+ pr_info("iterating forwards");
+
+ i = 0;
+
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(k.k->p.offset != i);
+ i += 2;
+ 0;
+ }));
+ bch_err_msg(c, ret, "error iterating forwards");
+ if (ret)
+ goto err;
+
+ BUG_ON(i != nr * 2);
+
+ pr_info("iterating forwards by slots");
+
+ i = 0;
+
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ BTREE_ITER_SLOTS, k, ({
+ if (i >= nr * 2)
+ break;
+
+ BUG_ON(k.k->p.offset != i);
+ BUG_ON(bkey_deleted(k.k) != (i & 1));
+
+ i++;
+ 0;
+ }));
+ if (ret < 0) {
+ bch_err_msg(c, ret, "error iterating forwards by slots");
+ goto err;
+ }
+ ret = 0;
+err:
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter = { NULL };
+ struct bkey_s_c k;
+ u64 i;
+ int ret = 0;
+
+ delete_test_keys(c);
+
+ pr_info("inserting test keys");
+
+ for (i = 0; i < nr; i += 16) {
+ struct bkey_i_cookie ck;
+
+ bkey_cookie_init(&ck.k_i);
+ ck.k.p.offset = i + 16;
+ ck.k.p.snapshot = U32_MAX;
+ ck.k.size = 8;
+
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
+ bch_err_msg(c, ret, "insert error");
+ if (ret)
+ goto err;
+ }
+
+ pr_info("iterating forwards");
+
+ i = 0;
+
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(bkey_start_offset(k.k) != i + 8);
+ BUG_ON(k.k->size != 8);
+ i += 16;
+ 0;
+ }));
+ bch_err_msg(c, ret, "error iterating forwards");
+ if (ret)
+ goto err;
+
+ BUG_ON(i != nr);
+
+ pr_info("iterating forwards by slots");
+
+ i = 0;
+
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ BTREE_ITER_SLOTS, k, ({
+ if (i == nr)
+ break;
+ BUG_ON(bkey_deleted(k.k) != !(i % 16));
+
+ BUG_ON(bkey_start_offset(k.k) != i);
+ BUG_ON(k.k->size != 8);
+ i = k.k->p.offset;
+ 0;
+ }));
+ bch_err_msg(c, ret, "error iterating forwards by slots");
+ if (ret)
+ goto err;
+ ret = 0;
+err:
+ bch2_trans_put(trans);
+ return ret;
+}
+
+/*
+ * XXX: we really want to make sure we've got a btree with depth > 0 for these
+ * tests
+ */
+static int test_peek_end(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), 0);
+
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ BUG_ON(k.k);
+
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ BUG_ON(k.k);
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return 0;
+}
+
+static int test_peek_end_extents(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), 0);
+
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ BUG_ON(k.k);
+
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ BUG_ON(k.k);
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return 0;
+}
+
+/* extent unit tests */
+
+static u64 test_version;
+
+static int insert_test_extent(struct bch_fs *c,
+ u64 start, u64 end)
+{
+ struct bkey_i_cookie k;
+ int ret;
+
+ bkey_cookie_init(&k.k_i);
+ k.k_i.k.p.offset = end;
+ k.k_i.k.p.snapshot = U32_MAX;
+ k.k_i.k.size = end - start;
+ k.k_i.k.version.lo = test_version++;
+
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int __test_extent_overwrite(struct bch_fs *c,
+ u64 e1_start, u64 e1_end,
+ u64 e2_start, u64 e2_end)
+{
+ int ret;
+
+ ret = insert_test_extent(c, e1_start, e1_end) ?:
+ insert_test_extent(c, e2_start, e2_end);
+
+ delete_test_keys(c);
+ return ret;
+}
+
+static int test_extent_overwrite_front(struct bch_fs *c, u64 nr)
+{
+ return __test_extent_overwrite(c, 0, 64, 0, 32) ?:
+ __test_extent_overwrite(c, 8, 64, 0, 32);
+}
+
+static int test_extent_overwrite_back(struct bch_fs *c, u64 nr)
+{
+ return __test_extent_overwrite(c, 0, 64, 32, 64) ?:
+ __test_extent_overwrite(c, 0, 64, 32, 72);
+}
+
+static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr)
+{
+ return __test_extent_overwrite(c, 0, 64, 32, 40);
+}
+
+static int test_extent_overwrite_all(struct bch_fs *c, u64 nr)
+{
+ return __test_extent_overwrite(c, 32, 64, 0, 64) ?:
+ __test_extent_overwrite(c, 32, 64, 0, 128) ?:
+ __test_extent_overwrite(c, 32, 64, 32, 64) ?:
+ __test_extent_overwrite(c, 32, 64, 32, 128);
+}
+
+static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, u32 len, u32 snapid)
+{
+ struct bkey_i_cookie k;
+ int ret;
+
+ bkey_cookie_init(&k.k_i);
+ k.k_i.k.p.inode = inum;
+ k.k_i.k.p.offset = start + len;
+ k.k_i.k.p.snapshot = snapid;
+ k.k_i.k.size = len;
+
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int test_extent_create_overlapping(struct bch_fs *c, u64 inum)
+{
+ return insert_test_overlapping_extent(c, inum, 0, 16, U32_MAX - 2) ?: /* overwrite entire */
+ insert_test_overlapping_extent(c, inum, 2, 8, U32_MAX - 2) ?:
+ insert_test_overlapping_extent(c, inum, 4, 4, U32_MAX) ?:
+ insert_test_overlapping_extent(c, inum, 32, 8, U32_MAX - 2) ?: /* overwrite front/back */
+ insert_test_overlapping_extent(c, inum, 36, 8, U32_MAX) ?:
+ insert_test_overlapping_extent(c, inum, 60, 8, U32_MAX - 2) ?:
+ insert_test_overlapping_extent(c, inum, 64, 8, U32_MAX);
+}
+
+/* snapshot unit tests */
+
+/* Test skipping over keys in unrelated snapshots: */
+static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
+{
+ struct btree_trans *trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_cookie cookie;
+ int ret;
+
+ bkey_cookie_init(&cookie.k_i);
+ cookie.k.p.snapshot = snapid_hi;
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0);
+ if (ret)
+ return ret;
+
+ trans = bch2_trans_get(c);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
+ SPOS(0, 0, snapid_lo), 0);
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+
+ BUG_ON(k.k->p.snapshot != U32_MAX);
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int test_snapshots(struct bch_fs *c, u64 nr)
+{
+ struct bkey_i_cookie cookie;
+ u32 snapids[2];
+ u32 snapid_subvols[2] = { 1, 1 };
+ int ret;
+
+ bkey_cookie_init(&cookie.k_i);
+ cookie.k.p.snapshot = U32_MAX;
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0);
+ if (ret)
+ return ret;
+
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_snapshot_node_create(trans, U32_MAX,
+ snapids,
+ snapid_subvols,
+ 2));
+ if (ret)
+ return ret;
+
+ if (snapids[0] > snapids[1])
+ swap(snapids[0], snapids[1]);
+
+ ret = test_snapshot_filter(c, snapids[0], snapids[1]);
+ bch_err_msg(c, ret, "from test_snapshot_filter");
+ return ret;
+}
+
+/* perf tests */
+
+static u64 test_rand(void)
+{
+ u64 v;
+
+ get_random_bytes(&v, sizeof(v));
+ return v;
+}
+
+static int rand_insert(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bkey_i_cookie k;
+ int ret = 0;
+ u64 i;
+
+ for (i = 0; i < nr; i++) {
+ bkey_cookie_init(&k.k_i);
+ k.k.p.offset = test_rand();
+ k.k.p.snapshot = U32_MAX;
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0));
+ if (ret)
+ break;
+ }
+
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int rand_insert_multi(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bkey_i_cookie k[8];
+ int ret = 0;
+ unsigned j;
+ u64 i;
+
+ for (i = 0; i < nr; i += ARRAY_SIZE(k)) {
+ for (j = 0; j < ARRAY_SIZE(k); j++) {
+ bkey_cookie_init(&k[j].k_i);
+ k[j].k.p.offset = test_rand();
+ k[j].k.p.snapshot = U32_MAX;
+ }
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[7].k_i, 0));
+ if (ret)
+ break;
+ }
+
+ bch2_trans_put(trans);
+ return ret;
+}
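
rand_insert_multi() strings its eight updates together with the GNU "a ?: b" operator: evaluation stops at the first call that returns nonzero, so the chain yields either 0 or the first error, and all eight updates land in one commit. The chaining idiom by itself, with hypothetical step functions (GNU extension, as used throughout the kernel):

    #include <stdio.h>

    static int step(int n, int fail_at)
    {
        printf("step %d\n", n);
        return n == fail_at ? -1 : 0;
    }

    static int run_all(int fail_at)
    {
        /* GNU ?: keeps the first nonzero result and skips the rest */
        return  step(1, fail_at) ?:
            step(2, fail_at) ?:
            step(3, fail_at);
    }

    int main(void)
    {
        /* steps 1 and 2 run, step 3 is skipped */
        printf("ret = %d\n", run_all(2));
        return 0;
    }
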
+
+static int rand_lookup(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+ u64 i;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), 0);
+
+ for (i = 0; i < nr; i++) {
+ bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX));
+
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
+ ret = bkey_err(k);
+ if (ret)
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int rand_mixed_trans(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i_cookie *cookie,
+ u64 i, u64 pos)
+{
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX));
+
+ k = bch2_btree_iter_peek(iter);
+ ret = bkey_err(k);
+ bch_err_msg(trans->c, ret, "lookup error");
+ if (ret)
+ return ret;
+
+ if (!(i & 3) && k.k) {
+ bkey_cookie_init(&cookie->k_i);
+ cookie->k.p = iter->pos;
+ ret = bch2_trans_update(trans, iter, &cookie->k_i, 0);
+ }
+
+ return ret;
+}
+
+static int rand_mixed(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_i_cookie cookie;
+ int ret = 0;
+ u64 i, rand;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), 0);
+
+ for (i = 0; i < nr; i++) {
+ rand = test_rand();
+ ret = commit_do(trans, NULL, NULL, 0,
+ rand_mixed_trans(trans, &iter, &cookie, i, rand));
+ if (ret)
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int __do_delete(struct btree_trans *trans, struct bpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k)
+ goto err;
+
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int rand_delete(struct bch_fs *c, u64 nr)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ int ret = 0;
+ u64 i;
+
+ for (i = 0; i < nr; i++) {
+ struct bpos pos = SPOS(0, test_rand(), U32_MAX);
+
+ ret = commit_do(trans, NULL, NULL, 0,
+ __do_delete(trans, pos));
+ if (ret)
+ break;
+ }
+
+ bch2_trans_put(trans);
+ return ret;
+}
+
+static int seq_insert(struct bch_fs *c, u64 nr)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_cookie insert;
+
+ bkey_cookie_init(&insert.k_i);
+
+ return bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k,
+ NULL, NULL, 0, ({
+ if (iter.pos.offset >= nr)
+ break;
+ insert.k.p = iter.pos;
+ bch2_trans_update(trans, &iter, &insert.k_i, 0);
+ })));
+}
+
+static int seq_lookup(struct bch_fs *c, u64 nr)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ return bch2_trans_run(c,
+ for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k,
+ 0));
+}
+
+static int seq_overwrite(struct bch_fs *c, u64 nr)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ return bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX),
+ BTREE_ITER_INTENT, k,
+ NULL, NULL, 0, ({
+ struct bkey_i_cookie u;
+
+ bkey_reassemble(&u.k_i, k);
+ bch2_trans_update(trans, &iter, &u.k_i, 0);
+ })));
+}
+
+static int seq_delete(struct bch_fs *c, u64 nr)
+{
+ return bch2_btree_delete_range(c, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX),
+ POS(0, U64_MAX),
+ 0, NULL);
+}
+
+typedef int (*perf_test_fn)(struct bch_fs *, u64);
+
+struct test_job {
+ struct bch_fs *c;
+ u64 nr;
+ unsigned nr_threads;
+ perf_test_fn fn;
+
+ atomic_t ready;
+ wait_queue_head_t ready_wait;
+
+ atomic_t done;
+ struct completion done_completion;
+
+ u64 start;
+ u64 finish;
+ int ret;
+};
+
+static int btree_perf_test_thread(void *data)
+{
+ struct test_job *j = data;
+ int ret;
+
+ if (atomic_dec_and_test(&j->ready)) {
+ wake_up(&j->ready_wait);
+ j->start = sched_clock();
+ } else {
+ wait_event(j->ready_wait, !atomic_read(&j->ready));
+ }
+
+ ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads));
+ if (ret) {
+ bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret));
+ j->ret = ret;
+ }
+
+ if (atomic_dec_and_test(&j->done)) {
+ j->finish = sched_clock();
+ complete(&j->done_completion);
+ }
+
+ return 0;
+}
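
btree_perf_test_thread() times the run with two countdown counters: the last worker to decrement ->ready stamps the start time and wakes the others, and the last to decrement ->done stamps the finish time and completes the job, so thread setup and teardown never pollute the measurement. A portable C11/pthreads sketch of the same rendezvous (illustrative only, not kernel code):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <time.h>

    #define NR_THREADS 4

    static atomic_int ready = NR_THREADS;
    static atomic_int done  = NR_THREADS;
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  go   = PTHREAD_COND_INITIALIZER;
    static struct timespec start_ts, finish_ts;

    static void *worker(void *arg)
    {
        (void)arg;

        pthread_mutex_lock(&lock);
        if (atomic_fetch_sub(&ready, 1) == 1) {
            /* last thread to arrive starts the clock and releases everyone */
            clock_gettime(CLOCK_MONOTONIC, &start_ts);
            pthread_cond_broadcast(&go);
        } else {
            while (atomic_load(&ready))
                pthread_cond_wait(&go, &lock);
        }
        pthread_mutex_unlock(&lock);

        /* ... run this thread's share of the benchmark here ... */

        if (atomic_fetch_sub(&done, 1) == 1)
            /* last thread to finish stops the clock */
            clock_gettime(CLOCK_MONOTONIC, &finish_ts);
        return NULL;
    }

    int main(void)
    {
        pthread_t t[NR_THREADS];

        for (int i = 0; i < NR_THREADS; i++)
            pthread_create(&t[i], NULL, worker, NULL);
        for (int i = 0; i < NR_THREADS; i++)
            pthread_join(t[i], NULL);

        printf("elapsed: %lld ns\n",
               (long long)(finish_ts.tv_sec - start_ts.tv_sec) * 1000000000LL +
               (finish_ts.tv_nsec - start_ts.tv_nsec));
        return 0;
    }
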
+
+int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
+ u64 nr, unsigned nr_threads)
+{
+ struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };
+ char name_buf[20];
+ struct printbuf nr_buf = PRINTBUF;
+ struct printbuf per_sec_buf = PRINTBUF;
+ unsigned i;
+ u64 time;
+
+ atomic_set(&j.ready, nr_threads);
+ init_waitqueue_head(&j.ready_wait);
+
+ atomic_set(&j.done, nr_threads);
+ init_completion(&j.done_completion);
+
+#define perf_test(_test) \
+ if (!strcmp(testname, #_test)) j.fn = _test
+
+ perf_test(rand_insert);
+ perf_test(rand_insert_multi);
+ perf_test(rand_lookup);
+ perf_test(rand_mixed);
+ perf_test(rand_delete);
+
+ perf_test(seq_insert);
+ perf_test(seq_lookup);
+ perf_test(seq_overwrite);
+ perf_test(seq_delete);
+
+ /* a unit test, not a perf test: */
+ perf_test(test_delete);
+ perf_test(test_delete_written);
+ perf_test(test_iterate);
+ perf_test(test_iterate_extents);
+ perf_test(test_iterate_slots);
+ perf_test(test_iterate_slots_extents);
+ perf_test(test_peek_end);
+ perf_test(test_peek_end_extents);
+
+ perf_test(test_extent_overwrite_front);
+ perf_test(test_extent_overwrite_back);
+ perf_test(test_extent_overwrite_middle);
+ perf_test(test_extent_overwrite_all);
+ perf_test(test_extent_create_overlapping);
+
+ perf_test(test_snapshots);
+
+ if (!j.fn) {
+ pr_err("unknown test %s", testname);
+ return -EINVAL;
+ }
+
+ //pr_info("running test %s:", testname);
+
+ if (nr_threads == 1)
+ btree_perf_test_thread(&j);
+ else
+ for (i = 0; i < nr_threads; i++)
+ kthread_run(btree_perf_test_thread, &j,
+ "bcachefs perf test[%u]", i);
+
+ while (wait_for_completion_interruptible(&j.done_completion))
+ ;
+
+ time = j.finish - j.start;
+
+ scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
+ prt_human_readable_u64(&nr_buf, nr);
+ prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time));
+ printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
+ name_buf, nr_buf.buf, nr_threads,
+ div_u64(time, NSEC_PER_SEC),
+ div_u64(time * nr_threads, nr),
+ per_sec_buf.buf);
+ printbuf_exit(&per_sec_buf);
+ printbuf_exit(&nr_buf);
+ return j.ret;
+}
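
The summary printk derives its three figures from the elapsed nanoseconds: whole seconds as time / NSEC_PER_SEC, nanoseconds per iteration as time * nr_threads / nr (each thread only runs nr / nr_threads iterations, so the per-thread cost per key is scaled back up), and keys per second as nr * NSEC_PER_SEC / time. For example, 1,000,000 keys on 4 threads in 2.5 s of wall time works out to 10,000 ns per iteration and 400,000 keys per second; a minimal check of the arithmetic with those made-up numbers:

    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC 1000000000ULL

    int main(void)
    {
        uint64_t nr = 1000000, nr_threads = 4, time_ns = 2500000000ULL;

        printf("%llu sec, %llu nsec per iter, %llu per sec\n",
               (unsigned long long)(time_ns / NSEC_PER_SEC),
               (unsigned long long)(time_ns * nr_threads / nr),
               (unsigned long long)(nr * NSEC_PER_SEC / time_ns));
        return 0;
    }
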
+
+#endif /* CONFIG_BCACHEFS_TESTS */
diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h
new file mode 100644
index 000000000000..c73b18aea7e0
--- /dev/null
+++ b/fs/bcachefs/tests.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_TEST_H
+#define _BCACHEFS_TEST_H
+
+struct bch_fs;
+
+#ifdef CONFIG_BCACHEFS_TESTS
+
+int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
+
+#else
+
+#endif /* CONFIG_BCACHEFS_TESTS */
+
+#endif /* _BCACHEFS_TEST_H */
diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c
new file mode 100644
index 000000000000..dc48b52b01b4
--- /dev/null
+++ b/fs/bcachefs/trace.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "alloc_types.h"
+#include "buckets.h"
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "btree_locking.h"
+#include "btree_update_interior.h"
+#include "keylist.h"
+#include "move_types.h"
+#include "opts.h"
+#include "six.h"
+
+#include <linux/blktrace_api.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
new file mode 100644
index 000000000000..fd49b63562c3
--- /dev/null
+++ b/fs/bcachefs/trace.h
@@ -0,0 +1,1327 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM bcachefs
+
+#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BCACHEFS_H
+
+#include <linux/tracepoint.h>
+
+#define TRACE_BPOS_entries(name) \
+ __field(u64, name##_inode ) \
+ __field(u64, name##_offset ) \
+ __field(u32, name##_snapshot )
+
+#define TRACE_BPOS_assign(dst, src) \
+ __entry->dst##_inode = (src).inode; \
+ __entry->dst##_offset = (src).offset; \
+ __entry->dst##_snapshot = (src).snapshot
+
+DECLARE_EVENT_CLASS(bpos,
+ TP_PROTO(const struct bpos *p),
+ TP_ARGS(p),
+
+ TP_STRUCT__entry(
+ TRACE_BPOS_entries(p)
+ ),
+
+ TP_fast_assign(
+ TRACE_BPOS_assign(p, *p);
+ ),
+
+ TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot)
+);
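
TRACE_BPOS_entries()/TRACE_BPOS_assign() bundle a struct bpos into tracepoints: a single prefix argument fans out into three prefixed fields and the three matching assignments, so every event that logs a position declares and fills them identically. The same field-bundling trick outside the tracepoint machinery, with a hypothetical event struct:

    #include <stdio.h>
    #include <stdint.h>

    struct bpos { uint64_t inode, offset; uint32_t snapshot; };

    #define BPOS_FIELDS(name)           \
        uint64_t name##_inode;          \
        uint64_t name##_offset;         \
        uint32_t name##_snapshot;

    #define BPOS_ASSIGN(dst, ev, src)                       \
        do {                                                \
            (ev)->dst##_inode    = (src).inode;             \
            (ev)->dst##_offset   = (src).offset;            \
            (ev)->dst##_snapshot = (src).snapshot;          \
        } while (0)

    struct demo_event {
        BPOS_FIELDS(pos)
    };

    int main(void)
    {
        struct bpos p = { .inode = 4096, .offset = 8, .snapshot = 1 };
        struct demo_event ev;

        BPOS_ASSIGN(pos, &ev, p);
        printf("%llu:%llu:%u\n",
               (unsigned long long)ev.pos_inode,
               (unsigned long long)ev.pos_offset,
               ev.pos_snapshot);
        return 0;
    }
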
+
+DECLARE_EVENT_CLASS(bkey,
+ TP_PROTO(struct bch_fs *c, const char *k),
+ TP_ARGS(c, k),
+
+ TP_STRUCT__entry(
+ __string(k, k )
+ ),
+
+ TP_fast_assign(
+ __assign_str(k, k);
+ ),
+
+ TP_printk("%s", __get_str(k))
+);
+
+DECLARE_EVENT_CLASS(btree_node,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u8, level )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->level = b->c.level;
+ __entry->btree_id = b->c.btree_id;
+ TRACE_BPOS_assign(pos, b->key.k.p);
+ ),
+
+ TP_printk("%d,%d %u %s %llu:%llu:%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->level,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
+);
+
+DECLARE_EVENT_CLASS(bch_fs,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ ),
+
+ TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
+);
+
+DECLARE_EVENT_CLASS(bio,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(sector_t, sector )
+ __field(unsigned int, nr_sector )
+ __array(char, rwbs, 6 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = bio->bi_bdev ? bio_dev(bio) : 0;
+ __entry->sector = bio->bi_iter.bi_sector;
+ __entry->nr_sector = bio->bi_iter.bi_size >> 9;
+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
+ ),
+
+ TP_printk("%d,%d %s %llu + %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+ (unsigned long long)__entry->sector, __entry->nr_sector)
+);
+
+/* super-io.c: */
+TRACE_EVENT(write_super,
+ TP_PROTO(struct bch_fs *c, unsigned long ip),
+ TP_ARGS(c, ip),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(unsigned long, ip )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->ip = ip;
+ ),
+
+ TP_printk("%d,%d for %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (void *) __entry->ip)
+);
+
+/* io.c: */
+
+DEFINE_EVENT(bio, read_promote,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+TRACE_EVENT(read_nopromote,
+ TP_PROTO(struct bch_fs *c, int ret),
+ TP_ARGS(c, ret),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, ret, 32 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
+ ),
+
+ TP_printk("%d,%d ret %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ret)
+);
+
+DEFINE_EVENT(bio, read_bounce,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bio, read_split,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bio, read_retry,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bio, read_reuse_race,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+/* Journal */
+
+DEFINE_EVENT(bch_fs, journal_full,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, journal_entry_full,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bio, journal_write,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+TRACE_EVENT(journal_reclaim_start,
+ TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
+ u64 min_nr, u64 min_key_cache,
+ u64 btree_cache_dirty, u64 btree_cache_total,
+ u64 btree_key_cache_dirty, u64 btree_key_cache_total),
+ TP_ARGS(c, direct, kicked, min_nr, min_key_cache,
+ btree_cache_dirty, btree_cache_total,
+ btree_key_cache_dirty, btree_key_cache_total),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(bool, direct )
+ __field(bool, kicked )
+ __field(u64, min_nr )
+ __field(u64, min_key_cache )
+ __field(u64, btree_cache_dirty )
+ __field(u64, btree_cache_total )
+ __field(u64, btree_key_cache_dirty )
+ __field(u64, btree_key_cache_total )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->direct = direct;
+ __entry->kicked = kicked;
+ __entry->min_nr = min_nr;
+ __entry->min_key_cache = min_key_cache;
+ __entry->btree_cache_dirty = btree_cache_dirty;
+ __entry->btree_cache_total = btree_cache_total;
+ __entry->btree_key_cache_dirty = btree_key_cache_dirty;
+ __entry->btree_key_cache_total = btree_key_cache_total;
+ ),
+
+ TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->direct,
+ __entry->kicked,
+ __entry->min_nr,
+ __entry->min_key_cache,
+ __entry->btree_cache_dirty,
+ __entry->btree_cache_total,
+ __entry->btree_key_cache_dirty,
+ __entry->btree_key_cache_total)
+);
+
+TRACE_EVENT(journal_reclaim_finish,
+ TP_PROTO(struct bch_fs *c, u64 nr_flushed),
+ TP_ARGS(c, nr_flushed),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, nr_flushed )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->nr_flushed = nr_flushed;
+ ),
+
+ TP_printk("%d,%d flushed %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->nr_flushed)
+);
+
+/* bset.c: */
+
+DEFINE_EVENT(bpos, bkey_pack_pos_fail,
+ TP_PROTO(const struct bpos *p),
+ TP_ARGS(p)
+);
+
+/* Btree cache: */
+
+TRACE_EVENT(btree_cache_scan,
+ TP_PROTO(long nr_to_scan, long can_free, long ret),
+ TP_ARGS(nr_to_scan, can_free, ret),
+
+ TP_STRUCT__entry(
+ __field(long, nr_to_scan )
+ __field(long, can_free )
+ __field(long, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->nr_to_scan = nr_to_scan;
+ __entry->can_free = can_free;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("scanned for %li nodes, can free %li, ret %li",
+ __entry->nr_to_scan, __entry->can_free, __entry->ret)
+);
+
+DEFINE_EVENT(btree_node, btree_cache_reap,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+/* Btree */
+
+DEFINE_EVENT(btree_node, btree_node_read,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+TRACE_EVENT(btree_node_write,
+ TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors),
+ TP_ARGS(b, bytes, sectors),
+
+ TP_STRUCT__entry(
+ __field(enum btree_node_type, type)
+ __field(unsigned, bytes )
+ __field(unsigned, sectors )
+ ),
+
+ TP_fast_assign(
+ __entry->type = btree_node_type(b);
+ __entry->bytes = bytes;
+ __entry->sectors = sectors;
+ ),
+
+ TP_printk("bkey type %u bytes %u sectors %u",
+ __entry->type , __entry->bytes, __entry->sectors)
+);
+
+DEFINE_EVENT(btree_node, btree_node_alloc,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_free,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+TRACE_EVENT(btree_reserve_get_fail,
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip,
+ size_t required,
+ int ret),
+ TP_ARGS(trans_fn, caller_ip, required, ret),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(size_t, required )
+ __array(char, ret, 32 )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->required = required;
+ strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
+ ),
+
+ TP_printk("%s %pS required %zu ret %s",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ __entry->required,
+ __entry->ret)
+);
+
+DEFINE_EVENT(btree_node, btree_node_compact,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_merge,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_split,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_rewrite,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+DEFINE_EVENT(btree_node, btree_node_set_root,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+TRACE_EVENT(btree_path_relock_fail,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path,
+ unsigned level),
+ TP_ARGS(trans, caller_ip, path, level),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ __field(u8, level )
+ TRACE_BPOS_entries(pos)
+ __array(char, node, 24 )
+ __field(u8, self_read_count )
+ __field(u8, self_intent_count)
+ __field(u8, read_count )
+ __field(u8, intent_count )
+ __field(u32, iter_lock_seq )
+ __field(u32, node_lock_seq )
+ ),
+
+ TP_fast_assign(
+ struct btree *b = btree_path_node(path, level);
+ struct six_lock_count c;
+
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = path->btree_id;
+ __entry->level = path->level;
+ TRACE_BPOS_assign(pos, path->pos);
+
+ c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
+ __entry->self_read_count = c.n[SIX_LOCK_read];
+ __entry->self_intent_count = c.n[SIX_LOCK_intent];
+
+ if (IS_ERR(b)) {
+ strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node));
+ } else {
+ c = six_lock_counts(&path->l[level].b->c.lock);
+ __entry->read_count = c.n[SIX_LOCK_read];
+ __entry->intent_count = c.n[SIX_LOCK_intent];
+ scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
+ }
+ __entry->iter_lock_seq = path->l[level].lock_seq;
+ __entry->node_lock_seq = is_btree_node(path, level)
+ ? six_lock_seq(&path->l[level].b->c.lock)
+ : 0;
+ ),
+
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->level,
+ __entry->node,
+ __entry->self_read_count,
+ __entry->self_intent_count,
+ __entry->read_count,
+ __entry->intent_count,
+ __entry->iter_lock_seq,
+ __entry->node_lock_seq)
+);
+
+TRACE_EVENT(btree_path_upgrade_fail,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path,
+ unsigned level),
+ TP_ARGS(trans, caller_ip, path, level),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ __field(u8, level )
+ TRACE_BPOS_entries(pos)
+ __field(u8, locked )
+ __field(u8, self_read_count )
+ __field(u8, self_intent_count)
+ __field(u8, read_count )
+ __field(u8, intent_count )
+ __field(u32, iter_lock_seq )
+ __field(u32, node_lock_seq )
+ ),
+
+ TP_fast_assign(
+ struct six_lock_count c;
+
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = path->btree_id;
+ __entry->level = level;
+ TRACE_BPOS_assign(pos, path->pos);
+ __entry->locked = btree_node_locked(path, level);
+
+ c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
+ __entry->self_read_count = c.n[SIX_LOCK_read];
+ __entry->self_intent_count = c.n[SIX_LOCK_intent];
+ c = six_lock_counts(&path->l[level].b->c.lock);
+ __entry->read_count = c.n[SIX_LOCK_read];
+ __entry->intent_count = c.n[SIX_LOCK_intent];
+ __entry->iter_lock_seq = path->l[level].lock_seq;
+ __entry->node_lock_seq = is_btree_node(path, level)
+ ? six_lock_seq(&path->l[level].b->c.lock)
+ : 0;
+ ),
+
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->level,
+ __entry->locked,
+ __entry->self_read_count,
+ __entry->self_intent_count,
+ __entry->read_count,
+ __entry->intent_count,
+ __entry->iter_lock_seq,
+ __entry->node_lock_seq)
+);
+
+/* Garbage collection */
+
+DEFINE_EVENT(bch_fs, gc_gens_start,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, gc_gens_end,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+/* Allocator */
+
+DECLARE_EVENT_CLASS(bucket_alloc,
+ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+ u64 bucket,
+ u64 free,
+ u64 avail,
+ u64 copygc_wait_amount,
+ s64 copygc_waiting_for,
+ struct bucket_alloc_state *s,
+ bool nonblocking,
+ const char *err),
+ TP_ARGS(ca, alloc_reserve, bucket, free, avail,
+ copygc_wait_amount, copygc_waiting_for,
+ s, nonblocking, err),
+
+ TP_STRUCT__entry(
+ __field(u8, dev )
+ __array(char, reserve, 16 )
+ __field(u64, bucket )
+ __field(u64, free )
+ __field(u64, avail )
+ __field(u64, copygc_wait_amount )
+ __field(s64, copygc_waiting_for )
+ __field(u64, seen )
+ __field(u64, open )
+ __field(u64, need_journal_commit )
+ __field(u64, nouse )
+ __field(bool, nonblocking )
+ __field(u64, nocow )
+ __array(char, err, 32 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = ca->dev_idx;
+ strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve));
+ __entry->bucket = bucket;
+ __entry->free = free;
+ __entry->avail = avail;
+ __entry->copygc_wait_amount = copygc_wait_amount;
+ __entry->copygc_waiting_for = copygc_waiting_for;
+ __entry->seen = s->buckets_seen;
+ __entry->open = s->skipped_open;
+ __entry->need_journal_commit = s->skipped_need_journal_commit;
+ __entry->nouse = s->skipped_nouse;
+ __entry->nonblocking = nonblocking;
+ __entry->nocow = s->skipped_nocow;
+ strscpy(__entry->err, err, sizeof(__entry->err));
+ ),
+
+ TP_printk("reserve %s bucket %u:%llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s",
+ __entry->reserve,
+ __entry->dev,
+ __entry->bucket,
+ __entry->free,
+ __entry->avail,
+ __entry->copygc_wait_amount,
+ __entry->copygc_waiting_for,
+ __entry->seen,
+ __entry->open,
+ __entry->need_journal_commit,
+ __entry->nouse,
+ __entry->nocow,
+ __entry->nonblocking,
+ __entry->err)
+);
+
+DEFINE_EVENT(bucket_alloc, bucket_alloc,
+ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+ u64 bucket,
+ u64 free,
+ u64 avail,
+ u64 copygc_wait_amount,
+ s64 copygc_waiting_for,
+ struct bucket_alloc_state *s,
+ bool nonblocking,
+ const char *err),
+ TP_ARGS(ca, alloc_reserve, bucket, free, avail,
+ copygc_wait_amount, copygc_waiting_for,
+ s, nonblocking, err)
+);
+
+DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
+ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
+ u64 bucket,
+ u64 free,
+ u64 avail,
+ u64 copygc_wait_amount,
+ s64 copygc_waiting_for,
+ struct bucket_alloc_state *s,
+ bool nonblocking,
+ const char *err),
+ TP_ARGS(ca, alloc_reserve, bucket, free, avail,
+ copygc_wait_amount, copygc_waiting_for,
+ s, nonblocking, err)
+);
+
+TRACE_EVENT(discard_buckets,
+ TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
+ u64 need_journal_commit, u64 discarded, const char *err),
+ TP_ARGS(c, seen, open, need_journal_commit, discarded, err),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, seen )
+ __field(u64, open )
+ __field(u64, need_journal_commit )
+ __field(u64, discarded )
+ __array(char, err, 16 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->seen = seen;
+ __entry->open = open;
+ __entry->need_journal_commit = need_journal_commit;
+ __entry->discarded = discarded;
+ strscpy(__entry->err, err, sizeof(__entry->err));
+ ),
+
+ TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->seen,
+ __entry->open,
+ __entry->need_journal_commit,
+ __entry->discarded,
+ __entry->err)
+);
+
+TRACE_EVENT(bucket_invalidate,
+ TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors),
+ TP_ARGS(c, dev, bucket, sectors),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u32, dev_idx )
+ __field(u32, sectors )
+ __field(u64, bucket )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->dev_idx = dev;
+ __entry->sectors = sectors;
+ __entry->bucket = bucket;
+ ),
+
+ TP_printk("%d:%d invalidated %u:%llu cached sectors %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dev_idx, __entry->bucket,
+ __entry->sectors)
+);
+
+/* Moving IO */
+
+TRACE_EVENT(bucket_evacuate,
+ TP_PROTO(struct bch_fs *c, struct bpos *bucket),
+ TP_ARGS(c, bucket),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u32, dev_idx )
+ __field(u64, bucket )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->dev_idx = bucket->inode;
+ __entry->bucket = bucket->offset;
+ ),
+
+ TP_printk("%d:%d %u:%llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dev_idx, __entry->bucket)
+);
+
+DEFINE_EVENT(bkey, move_extent,
+ TP_PROTO(struct bch_fs *c, const char *k),
+ TP_ARGS(c, k)
+);
+
+DEFINE_EVENT(bkey, move_extent_read,
+ TP_PROTO(struct bch_fs *c, const char *k),
+ TP_ARGS(c, k)
+);
+
+DEFINE_EVENT(bkey, move_extent_write,
+ TP_PROTO(struct bch_fs *c, const char *k),
+ TP_ARGS(c, k)
+);
+
+DEFINE_EVENT(bkey, move_extent_finish,
+ TP_PROTO(struct bch_fs *c, const char *k),
+ TP_ARGS(c, k)
+);
+
+TRACE_EVENT(move_extent_fail,
+ TP_PROTO(struct bch_fs *c, const char *msg),
+ TP_ARGS(c, msg),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __string(msg, msg )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __assign_str(msg, msg);
+ ),
+
+ TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg))
+);
+
+DEFINE_EVENT(bkey, move_extent_start_fail,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+TRACE_EVENT(move_data,
+ TP_PROTO(struct bch_fs *c,
+ struct bch_move_stats *stats),
+ TP_ARGS(c, stats),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, keys_moved )
+ __field(u64, keys_raced )
+ __field(u64, sectors_seen )
+ __field(u64, sectors_moved )
+ __field(u64, sectors_raced )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->keys_moved = atomic64_read(&stats->keys_moved);
+ __entry->keys_raced = atomic64_read(&stats->keys_raced);
+ __entry->sectors_seen = atomic64_read(&stats->sectors_seen);
+ __entry->sectors_moved = atomic64_read(&stats->sectors_moved);
+ __entry->sectors_raced = atomic64_read(&stats->sectors_raced);
+ ),
+
+ TP_printk("%d,%d keys moved %llu raced %llu"
+ "sectors seen %llu moved %llu raced %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->keys_moved,
+ __entry->keys_raced,
+ __entry->sectors_seen,
+ __entry->sectors_moved,
+ __entry->sectors_raced)
+);
+
+TRACE_EVENT(evacuate_bucket,
+ TP_PROTO(struct bch_fs *c, struct bpos *bucket,
+ unsigned sectors, unsigned bucket_size,
+ u64 fragmentation, int ret),
+ TP_ARGS(c, bucket, sectors, bucket_size, fragmentation, ret),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, member )
+ __field(u64, bucket )
+ __field(u32, sectors )
+ __field(u32, bucket_size )
+ __field(u64, fragmentation )
+ __field(int, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->member = bucket->inode;
+ __entry->bucket = bucket->offset;
+ __entry->sectors = sectors;
+ __entry->bucket_size = bucket_size;
+ __entry->fragmentation = fragmentation;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("%d,%d %llu:%llu sectors %u/%u fragmentation %llu ret %i",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->member, __entry->bucket,
+ __entry->sectors, __entry->bucket_size,
+ __entry->fragmentation, __entry->ret)
+);
+
+TRACE_EVENT(copygc,
+ TP_PROTO(struct bch_fs *c,
+ u64 sectors_moved, u64 sectors_not_moved,
+ u64 buckets_moved, u64 buckets_not_moved),
+ TP_ARGS(c,
+ sectors_moved, sectors_not_moved,
+ buckets_moved, buckets_not_moved),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, sectors_moved )
+ __field(u64, sectors_not_moved )
+ __field(u64, buckets_moved )
+ __field(u64, buckets_not_moved )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->sectors_moved = sectors_moved;
+ __entry->sectors_not_moved = sectors_not_moved;
+ __entry->buckets_moved = buckets_moved;
+ __entry->buckets_not_moved = buckets_not_moved;
+ ),
+
+ TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->sectors_moved, __entry->sectors_not_moved,
+ __entry->buckets_moved, __entry->buckets_not_moved)
+);
+
+TRACE_EVENT(copygc_wait,
+ TP_PROTO(struct bch_fs *c,
+ u64 wait_amount, u64 until),
+ TP_ARGS(c, wait_amount, until),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, wait_amount )
+ __field(u64, until )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->wait_amount = wait_amount;
+ __entry->until = until;
+ ),
+
+ TP_printk("%d,%u waiting for %llu sectors until %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->wait_amount, __entry->until)
+);
+
+/* btree transactions: */
+
+DECLARE_EVENT_CLASS(transaction_event,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ ),
+
+ TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
+);
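+
+/*
+ * Each DEFINE_EVENT() of this class gets a generated trace_<name>() helper
+ * (standard TRACE_EVENT machinery) taking the same arguments as TP_PROTO,
+ * so a call site looks roughly like:
+ *
+ *	trace_transaction_commit(trans, _RET_IP_);
+ *
+ * where _RET_IP_ supplies the caller_ip recorded above. (Illustrative
+ * sketch only; actual call sites may wrap this in their own helpers.)
+ */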
+
+DEFINE_EVENT(transaction_event, transaction_commit,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_injected,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+TRACE_EVENT(trans_restart_split_race,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree *b),
+ TP_ARGS(trans, caller_ip, b),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, level )
+ __field(u16, written )
+ __field(u16, blocks )
+ __field(u16, u64s_remaining )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->level = b->c.level;
+ __entry->written = b->written;
+ __entry->blocks = btree_blocks(trans->c);
+ __entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b);
+ ),
+
+ TP_printk("%s %pS l=%u written %u/%u u64s remaining %u",
+ __entry->trans_fn, (void *) __entry->caller_ip,
+ __entry->level,
+ __entry->written, __entry->blocks,
+ __entry->u64s_remaining)
+);
+
+DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+TRACE_EVENT(trans_restart_journal_preres_get,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ unsigned flags),
+ TP_ARGS(trans, caller_ip, flags),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(unsigned, flags )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->flags = flags;
+ ),
+
+ TP_printk("%s %pS %x", __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ __entry->flags)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_fault_inject,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event, trans_traverse_all,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_too_many_iters,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DECLARE_EVENT_CLASS(transaction_restart_iter,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos)
+ ),
+
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+struct get_locks_fail;
+
+TRACE_EVENT(trans_restart_upgrade,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path,
+ unsigned old_locks_want,
+ unsigned new_locks_want,
+ struct get_locks_fail *f),
+ TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ __field(u8, old_locks_want )
+ __field(u8, new_locks_want )
+ __field(u8, level )
+ __field(u32, path_seq )
+ __field(u32, node_seq )
+ __field(u32, path_alloc_seq )
+ __field(u32, downgrade_seq)
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = path->btree_id;
+ __entry->old_locks_want = old_locks_want;
+ __entry->new_locks_want = new_locks_want;
+ __entry->level = f->l;
+ __entry->path_seq = path->l[f->l].lock_seq;
+ __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq;
+ __entry->path_alloc_seq = path->alloc_seq;
+ __entry->downgrade_seq = path->downgrade_seq;
+ TRACE_BPOS_assign(pos, path->pos)
+ ),
+
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u alloc_seq %u downgrade_seq %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->old_locks_want,
+ __entry->new_locks_want,
+ __entry->level,
+ __entry->path_seq,
+ __entry->node_seq,
+ __entry->path_alloc_seq,
+ __entry->downgrade_seq)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_would_deadlock,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+TRACE_EVENT(trans_restart_would_deadlock_write,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ ),
+
+ TP_printk("%s", __entry->trans_fn)
+);
+
+TRACE_EVENT(trans_restart_mem_realloced,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ unsigned long bytes),
+ TP_ARGS(trans, caller_ip, bytes),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(unsigned long, bytes )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->bytes = bytes;
+ ),
+
+ TP_printk("%s %pS bytes %lu",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ __entry->bytes)
+);
+
+TRACE_EVENT(trans_restart_key_cache_key_realloced,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path,
+ unsigned old_u64s,
+ unsigned new_u64s),
+ TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(enum btree_id, btree_id )
+ TRACE_BPOS_entries(pos)
+ __field(u32, old_u64s )
+ __field(u32, new_u64s )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
+ __entry->old_u64s = old_u64s;
+ __entry->new_u64s = new_u64s;
+ ),
+
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->old_u64s,
+ __entry->new_u64s)
+);
+
+TRACE_EVENT(path_downgrade,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path),
+ TP_ARGS(trans, caller_ip, path),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ ),
+
+ TP_printk("%s %pS",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip)
+);
+
+DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+TRACE_EVENT(write_buffer_flush,
+ TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size),
+ TP_ARGS(trans, nr, skipped, fast, size),
+
+ TP_STRUCT__entry(
+ __field(size_t, nr )
+ __field(size_t, skipped )
+ __field(size_t, fast )
+ __field(size_t, size )
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->skipped = skipped;
+ __entry->fast = fast;
+ __entry->size = size;
+ ),
+
+ TP_printk("%zu/%zu skipped %zu fast %zu",
+ __entry->nr, __entry->size, __entry->skipped, __entry->fast)
+);
+
+TRACE_EVENT(write_buffer_flush_slowpath,
+ TP_PROTO(struct btree_trans *trans, size_t nr, size_t size),
+ TP_ARGS(trans, nr, size),
+
+ TP_STRUCT__entry(
+ __field(size_t, nr )
+ __field(size_t, size )
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->size = size;
+ ),
+
+ TP_printk("%zu/%zu", __entry->nr, __entry->size)
+);
+
+#endif /* _TRACE_BCACHEFS_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../fs/bcachefs
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+#include <trace/define_trace.h>
diff --git a/fs/bcachefs/two_state_shared_lock.c b/fs/bcachefs/two_state_shared_lock.c
new file mode 100644
index 000000000000..9764c2e6a910
--- /dev/null
+++ b/fs/bcachefs/two_state_shared_lock.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "two_state_shared_lock.h"
+
+void __bch2_two_state_lock(two_state_lock_t *lock, int s)
+{
+ __wait_event(lock->wait, bch2_two_state_trylock(lock, s));
+}
diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h
new file mode 100644
index 000000000000..905801772002
--- /dev/null
+++ b/fs/bcachefs/two_state_shared_lock.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_TWO_STATE_LOCK_H
+#define _BCACHEFS_TWO_STATE_LOCK_H
+
+#include <linux/atomic.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+
+#include "util.h"
+
+/*
+ * Two-state lock - can be taken for add or block - each state is shared,
+ * like the read side of an rwsem, but the two states conflict with each
+ * other:
+ */
+typedef struct {
+ atomic_long_t v;
+ wait_queue_head_t wait;
+} two_state_lock_t;
+
+static inline void two_state_lock_init(two_state_lock_t *lock)
+{
+ atomic_long_set(&lock->v, 0);
+ init_waitqueue_head(&lock->wait);
+}
+
+static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s)
+{
+ long i = s ? 1 : -1;
+
+ EBUG_ON(atomic_long_read(&lock->v) == 0);
+
+ if (atomic_long_sub_return_release(i, &lock->v) == 0)
+ wake_up_all(&lock->wait);
+}
+
+static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s)
+{
+ long i = s ? 1 : -1;
+ long v = atomic_long_read(&lock->v), old;
+
+ do {
+ old = v;
+
+ if (i > 0 ? v < 0 : v > 0)
+ return false;
+ } while ((v = atomic_long_cmpxchg_acquire(&lock->v,
+ old, old + i)) != old);
+ return true;
+}
+
+void __bch2_two_state_lock(two_state_lock_t *, int);
+
+static inline void bch2_two_state_lock(two_state_lock_t *lock, int s)
+{
+ if (!bch2_two_state_trylock(lock, s))
+ __bch2_two_state_lock(lock, s);
+}
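+
+/*
+ * Typical usage (illustrative sketch): the two states are selected by the
+ * integer @s passed to lock/unlock, so one side always passes 0 and the
+ * other always passes 1:
+ *
+ *	static two_state_lock_t example_lock;
+ *
+ *	// "add" side: shared with other s == 0 holders
+ *	bch2_two_state_lock(&example_lock, 0);
+ *	...
+ *	bch2_two_state_unlock(&example_lock, 0);
+ *
+ *	// "block" side: shared with other s == 1 holders, excludes s == 0
+ *	bch2_two_state_lock(&example_lock, 1);
+ *	...
+ *	bch2_two_state_unlock(&example_lock, 1);
+ *
+ * example_lock is a hypothetical name; initialize it with
+ * two_state_lock_init() before first use.
+ */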
+
+#endif /* _BCACHEFS_TWO_STATE_LOCK_H */
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
new file mode 100644
index 000000000000..84b142fcc3df
--- /dev/null
+++ b/fs/bcachefs/util.c
@@ -0,0 +1,1159 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * random utility code, for bcache but in theory not specific to bcache
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/console.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/log2.h>
+#include <linux/math64.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/sched/clock.h>
+
+#include "eytzinger.h"
+#include "mean_and_variance.h"
+#include "util.h"
+
+static const char si_units[] = "?kMGTPEZY";
+
+/* string_get_size units: */
+static const char *const units_2[] = {
+ "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"
+};
+static const char *const units_10[] = {
+ "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"
+};
+
+static int parse_u64(const char *cp, u64 *res)
+{
+ const char *start = cp;
+ u64 v = 0;
+
+ if (!isdigit(*cp))
+ return -EINVAL;
+
+ do {
+ if (v > U64_MAX / 10)
+ return -ERANGE;
+ v *= 10;
+ if (v > U64_MAX - (*cp - '0'))
+ return -ERANGE;
+ v += *cp - '0';
+ cp++;
+ } while (isdigit(*cp));
+
+ *res = v;
+ return cp - start;
+}
+
+static int bch2_pow(u64 n, u64 p, u64 *res)
+{
+ *res = 1;
+
+ while (p--) {
+ if (*res > div_u64(U64_MAX, n))
+ return -ERANGE;
+ *res *= n;
+ }
+ return 0;
+}
+
+static int parse_unit_suffix(const char *cp, u64 *res)
+{
+ const char *start = cp;
+ u64 base = 1024;
+ unsigned u;
+ int ret;
+
+ if (*cp == ' ')
+ cp++;
+
+ for (u = 1; u < strlen(si_units); u++)
+ if (*cp == si_units[u]) {
+ cp++;
+ goto got_unit;
+ }
+
+ for (u = 0; u < ARRAY_SIZE(units_2); u++)
+ if (!strncmp(cp, units_2[u], strlen(units_2[u]))) {
+ cp += strlen(units_2[u]);
+ goto got_unit;
+ }
+
+ for (u = 0; u < ARRAY_SIZE(units_10); u++)
+ if (!strncmp(cp, units_10[u], strlen(units_10[u]))) {
+ cp += strlen(units_10[u]);
+ base = 1000;
+ goto got_unit;
+ }
+
+ *res = 1;
+ return 0;
+got_unit:
+ ret = bch2_pow(base, u, res);
+ if (ret)
+ return ret;
+
+ return cp - start;
+}
+
+#define parse_or_ret(cp, _f) \
+do { \
+ int _ret = _f; \
+ if (_ret < 0) \
+ return _ret; \
+ cp += _ret; \
+} while (0)
+
+static int __bch2_strtou64_h(const char *cp, u64 *res)
+{
+ const char *start = cp;
+ u64 v = 0, b, f_n = 0, f_d = 1;
+ int ret;
+
+ parse_or_ret(cp, parse_u64(cp, &v));
+
+ if (*cp == '.') {
+ cp++;
+ ret = parse_u64(cp, &f_n);
+ if (ret < 0)
+ return ret;
+ cp += ret;
+
+ ret = bch2_pow(10, ret, &f_d);
+ if (ret)
+ return ret;
+ }
+
+ parse_or_ret(cp, parse_unit_suffix(cp, &b));
+
+ if (v > div_u64(U64_MAX, b))
+ return -ERANGE;
+ v *= b;
+
+ if (f_n > div_u64(U64_MAX, b))
+ return -ERANGE;
+
+ f_n = div_u64(f_n * b, f_d);
+ if (v + f_n < v)
+ return -ERANGE;
+ v += f_n;
+
+ *res = v;
+ return cp - start;
+}
+
+static int __bch2_strtoh(const char *cp, u64 *res,
+ u64 t_max, bool t_signed)
+{
+ bool positive = *cp != '-';
+ u64 v = 0;
+
+ if (*cp == '+' || *cp == '-')
+ cp++;
+
+ parse_or_ret(cp, __bch2_strtou64_h(cp, &v));
+
+ if (*cp == '\n')
+ cp++;
+ if (*cp)
+ return -EINVAL;
+
+ if (positive) {
+ if (v > t_max)
+ return -ERANGE;
+ } else {
+ if (v && !t_signed)
+ return -ERANGE;
+
+ if (v > t_max + 1)
+ return -ERANGE;
+ v = -v;
+ }
+
+ *res = v;
+ return 0;
+}
+
+#define STRTO_H(name, type) \
+int bch2_ ## name ## _h(const char *cp, type *res) \
+{ \
+ u64 v = 0; \
+ int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \
+ ANYSINT_MAX(type) != ((type) ~0ULL)); \
+ *res = v; \
+ return ret; \
+}
+
+STRTO_H(strtoint, int)
+STRTO_H(strtouint, unsigned int)
+STRTO_H(strtoll, long long)
+STRTO_H(strtoull, unsigned long long)
+STRTO_H(strtou64, u64)
+
+u64 bch2_read_flag_list(char *opt, const char * const list[])
+{
+ u64 ret = 0;
+ char *p, *s, *d = kstrdup(opt, GFP_KERNEL);
+
+ if (!d)
+ return -ENOMEM;
+
+ s = strim(d);
+
+ while ((p = strsep(&s, ","))) {
+ int flag = match_string(list, -1, p);
+
+ if (flag < 0) {
+ ret = -1;
+ break;
+ }
+
+ ret |= 1 << flag;
+ }
+
+ kfree(d);
+
+ return ret;
+}
+
+bool bch2_is_zero(const void *_p, size_t n)
+{
+ const char *p = _p;
+ size_t i;
+
+ for (i = 0; i < n; i++)
+ if (p[i])
+ return false;
+ return true;
+}
+
+void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits)
+{
+ while (nr_bits)
+ prt_char(out, '0' + ((v >> --nr_bits) & 1));
+}
+
+void bch2_print_string_as_lines(const char *prefix, const char *lines)
+{
+ const char *p;
+
+ if (!lines) {
+ printk("%s (null)\n", prefix);
+ return;
+ }
+
+ console_lock();
+ while (1) {
+ p = strchrnul(lines, '\n');
+ printk("%s%.*s\n", prefix, (int) (p - lines), lines);
+ if (!*p)
+ break;
+ lines = p + 1;
+ }
+ console_unlock();
+}
+
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task)
+{
+#ifdef CONFIG_STACKTRACE
+ unsigned nr_entries = 0;
+ int ret = 0;
+
+ stack->nr = 0;
+ ret = darray_make_room(stack, 32);
+ if (ret)
+ return ret;
+
+ if (!down_read_trylock(&task->signal->exec_update_lock))
+ return -1;
+
+ do {
+ nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, 0);
+ } while (nr_entries == stack->size &&
+ !(ret = darray_make_room(stack, stack->size * 2)));
+
+ stack->nr = nr_entries;
+ up_read(&task->signal->exec_update_lock);
+
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack)
+{
+ unsigned long *i;
+
+ darray_for_each(*stack, i) {
+ prt_printf(out, "[<0>] %pB", (void *) *i);
+ prt_newline(out);
+ }
+}
+
+int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task)
+{
+ bch_stacktrace stack = { 0 };
+ int ret = bch2_save_backtrace(&stack, task);
+
+ bch2_prt_backtrace(out, &stack);
+ darray_exit(&stack);
+ return ret;
+}
+
+/* time stats: */
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v)
+{
+ unsigned i = 0;
+
+ while (i < ARRAY_SIZE(q->entries)) {
+ struct bch2_quantile_entry *e = q->entries + i;
+
+ if (unlikely(!e->step)) {
+ e->m = v;
+ e->step = max_t(unsigned, v / 2, 1024);
+ } else if (e->m > v) {
+ e->m = e->m >= e->step
+ ? e->m - e->step
+ : 0;
+ } else if (e->m < v) {
+ e->m = e->m + e->step > e->m
+ ? e->m + e->step
+ : U32_MAX;
+ }
+
+ if ((e->m > v ? e->m - v : v - e->m) < e->step)
+ e->step = max_t(unsigned, e->step / 2, 1);
+
+ if (v >= e->m)
+ break;
+
+ i = eytzinger0_child(i, v > e->m);
+ }
+}
+
+static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
+ u64 start, u64 end)
+{
+ u64 duration, freq;
+
+ if (time_after64(end, start)) {
+ duration = end - start;
+ mean_and_variance_update(&stats->duration_stats, duration);
+ mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
+ stats->max_duration = max(stats->max_duration, duration);
+ stats->min_duration = min(stats->min_duration, duration);
+ bch2_quantiles_update(&stats->quantiles, duration);
+ }
+
+ if (time_after64(end, stats->last_event)) {
+ freq = end - stats->last_event;
+ mean_and_variance_update(&stats->freq_stats, freq);
+ mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
+ stats->max_freq = max(stats->max_freq, freq);
+ stats->min_freq = min(stats->min_freq, freq);
+ stats->last_event = end;
+ }
+}
+
+static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
+ struct bch2_time_stat_buffer *b)
+{
+ struct bch2_time_stat_buffer_entry *i;
+ unsigned long flags;
+
+ spin_lock_irqsave(&stats->lock, flags);
+ for (i = b->entries;
+ i < b->entries + ARRAY_SIZE(b->entries);
+ i++)
+ bch2_time_stats_update_one(stats, i->start, i->end);
+ spin_unlock_irqrestore(&stats->lock, flags);
+
+ b->nr = 0;
+}
+
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
+{
+ unsigned long flags;
+
+ WARN_RATELIMIT(!stats->min_duration || !stats->min_freq,
+ "time_stats: min_duration = %llu, min_freq = %llu",
+ stats->min_duration, stats->min_freq);
+
+ if (!stats->buffer) {
+ spin_lock_irqsave(&stats->lock, flags);
+ bch2_time_stats_update_one(stats, start, end);
+
+ if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
+ stats->duration_stats.n > 1024)
+ stats->buffer =
+ alloc_percpu_gfp(struct bch2_time_stat_buffer,
+ GFP_ATOMIC);
+ spin_unlock_irqrestore(&stats->lock, flags);
+ } else {
+ struct bch2_time_stat_buffer *b;
+
+ preempt_disable();
+ b = this_cpu_ptr(stats->buffer);
+
+ BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
+ b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) {
+ .start = start,
+ .end = end
+ };
+
+ if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
+ bch2_time_stats_clear_buffer(stats, b);
+ preempt_enable();
+ }
+}
+#endif
+
+static const struct time_unit {
+ const char *name;
+ u64 nsecs;
+} time_units[] = {
+ { "ns", 1 },
+ { "us", NSEC_PER_USEC },
+ { "ms", NSEC_PER_MSEC },
+ { "s", NSEC_PER_SEC },
+ { "m", (u64) NSEC_PER_SEC * 60},
+ { "h", (u64) NSEC_PER_SEC * 3600},
+ { "eon", U64_MAX },
+};
+
+static const struct time_unit *pick_time_units(u64 ns)
+{
+ const struct time_unit *u;
+
+ for (u = time_units;
+ u + 1 < time_units + ARRAY_SIZE(time_units) &&
+ ns >= u[1].nsecs << 1;
+ u++)
+ ;
+
+ return u;
+}
+
+void bch2_pr_time_units(struct printbuf *out, u64 ns)
+{
+ const struct time_unit *u = pick_time_units(ns);
+
+ prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
+}
+
+static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
+{
+ const struct time_unit *u = pick_time_units(ns);
+
+ prt_printf(out, "%llu ", div64_u64(ns, u->nsecs));
+ prt_tab_rjust(out);
+ prt_printf(out, "%s", u->name);
+}
+
+#ifndef __KERNEL__
+#include <time.h>
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+ time_t t = sec;
+ char buf[64];
+ ctime_r(&t, buf);
+ prt_str(out, buf);
+}
+#else
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+ char buf[64];
+ snprintf(buf, sizeof(buf), "%ptT", &sec);
+ prt_str(out, buf);
+}
+#endif
+
+#define TABSTOP_SIZE 12
+
+static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
+{
+ prt_str(out, name);
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, ns);
+ prt_newline(out);
+}
+
+void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
+{
+ const struct time_unit *u;
+ s64 f_mean = 0, d_mean = 0;
+ u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
+ int i;
+ /*
+ * avoid divide by zero
+ */
+ if (stats->freq_stats.n) {
+ f_mean = mean_and_variance_get_mean(stats->freq_stats);
+ f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
+ d_mean = mean_and_variance_get_mean(stats->duration_stats);
+ d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
+ }
+
+ printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE);
+ prt_printf(out, "count:");
+ prt_tab(out);
+ prt_printf(out, "%llu ",
+ stats->duration_stats.n);
+ printbuf_tabstop_pop(out);
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+
+ printbuf_tabstop_push(out, out->indent + 20);
+ printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
+ printbuf_tabstop_push(out, 0);
+ printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
+
+ prt_tab(out);
+ prt_printf(out, "since mount");
+ prt_tab_rjust(out);
+ prt_tab(out);
+ prt_printf(out, "recent");
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, out->indent + 20);
+ printbuf_tabstop_push(out, TABSTOP_SIZE);
+ printbuf_tabstop_push(out, 2);
+ printbuf_tabstop_push(out, TABSTOP_SIZE);
+
+ prt_printf(out, "duration of events");
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ pr_name_and_units(out, "min:", stats->min_duration);
+ pr_name_and_units(out, "max:", stats->max_duration);
+
+ prt_printf(out, "mean:");
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, d_mean);
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
+ prt_newline(out);
+
+ prt_printf(out, "stddev:");
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, d_stddev);
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
+
+ printbuf_indent_sub(out, 2);
+ prt_newline(out);
+
+ prt_printf(out, "time between events");
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ pr_name_and_units(out, "min:", stats->min_freq);
+ pr_name_and_units(out, "max:", stats->max_freq);
+
+ prt_printf(out, "mean:");
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, f_mean);
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
+ prt_newline(out);
+
+ prt_printf(out, "stddev:");
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, f_stddev);
+ prt_tab(out);
+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
+
+ printbuf_indent_sub(out, 2);
+ prt_newline(out);
+
+ printbuf_tabstops_reset(out);
+
+ i = eytzinger0_first(NR_QUANTILES);
+ u = pick_time_units(stats->quantiles.entries[i].m);
+
+ prt_printf(out, "quantiles (%s):\t", u->name);
+ eytzinger0_for_each(i, NR_QUANTILES) {
+ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+
+ q = max(stats->quantiles.entries[i].m, last_q);
+ prt_printf(out, "%llu ",
+ div_u64(q, u->nsecs));
+ if (is_last)
+ prt_newline(out);
+ last_q = q;
+ }
+}
+
+void bch2_time_stats_exit(struct bch2_time_stats *stats)
+{
+ free_percpu(stats->buffer);
+}
+
+void bch2_time_stats_init(struct bch2_time_stats *stats)
+{
+ memset(stats, 0, sizeof(*stats));
+ stats->duration_stats_weighted.weight = 8;
+ stats->freq_stats_weighted.weight = 8;
+ stats->min_duration = U64_MAX;
+ stats->min_freq = U64_MAX;
+ spin_lock_init(&stats->lock);
+}
+
+/* ratelimit: */
+
+/**
+ * bch2_ratelimit_delay() - return how long to delay until the next time to do
+ * some work
+ * @d: the struct bch_ratelimit to update
+ * Returns: the amount of time to delay by, in jiffies
+ */
+u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
+{
+ u64 now = local_clock();
+
+ return time_after64(d->next, now)
+ ? nsecs_to_jiffies(d->next - now)
+ : 0;
+}
+
+/**
+ * bch2_ratelimit_increment() - increment @d by the amount of work done
+ * @d: the struct bch_ratelimit to update
+ * @done: the amount of work done, in arbitrary units
+ */
+void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
+{
+ u64 now = local_clock();
+
+ d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+
+ if (time_before64(now + NSEC_PER_SEC, d->next))
+ d->next = now + NSEC_PER_SEC;
+
+ if (time_after64(now - NSEC_PER_SEC * 2, d->next))
+ d->next = now - NSEC_PER_SEC * 2;
+}
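+
+/*
+ * Typical use (illustrative sketch, hypothetical worker loop): ask how long
+ * to delay before each unit of work, sleep if told to, then account the
+ * work that was done so the rate limit advances:
+ *
+ *	while (!kthread_should_stop()) {
+ *		u64 delay = bch2_ratelimit_delay(d);
+ *
+ *		if (delay)
+ *			schedule_timeout_interruptible(delay);
+ *
+ *		// ... do @done units of work ...
+ *		bch2_ratelimit_increment(d, done);
+ *	}
+ *
+ * The units of @done are arbitrary; d->rate is then interpreted as that
+ * many units per second.
+ */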
+
+/* pd controller: */
+
+/*
+ * Updates the pd_controller. Attempts to scale input values to units per second.
+ * @target: desired value
+ * @actual: current value
+ *
+ * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing
+ * it makes actual go down.
+ */
+void bch2_pd_controller_update(struct bch_pd_controller *pd,
+ s64 target, s64 actual, int sign)
+{
+ s64 proportional, derivative, change;
+
+ unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ;
+
+ if (seconds_since_update == 0)
+ return;
+
+ pd->last_update = jiffies;
+
+ proportional = actual - target;
+ proportional *= seconds_since_update;
+ proportional = div_s64(proportional, pd->p_term_inverse);
+
+ derivative = actual - pd->last_actual;
+ derivative = div_s64(derivative, seconds_since_update);
+ derivative = ewma_add(pd->smoothed_derivative, derivative,
+ (pd->d_term / seconds_since_update) ?: 1);
+ derivative = derivative * pd->d_term;
+ derivative = div_s64(derivative, pd->p_term_inverse);
+
+ change = proportional + derivative;
+
+ /* Don't increase rate if not keeping up */
+ if (change > 0 &&
+ pd->backpressure &&
+ time_after64(local_clock(),
+ pd->rate.next + NSEC_PER_MSEC))
+ change = 0;
+
+ change *= (sign * -1);
+
+ pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change,
+ 1, UINT_MAX);
+
+ pd->last_actual = actual;
+ pd->last_derivative = derivative;
+ pd->last_proportional = proportional;
+ pd->last_change = change;
+ pd->last_target = target;
+}
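+
+/*
+ * Sign convention, by way of a hypothetical example: to keep an "actual"
+ * amount of dirty data near a "target" threshold with a rate that drains
+ * dirty data, pass sign == -1, because raising the rate makes actual go
+ * down:
+ *
+ *	bch2_pd_controller_update(pd, target_dirty, current_dirty, -1);
+ *
+ * With sign == 1 the controller instead raises the rate while actual is
+ * below target. target_dirty/current_dirty are illustrative names only.
+ */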
+
+void bch2_pd_controller_init(struct bch_pd_controller *pd)
+{
+ pd->rate.rate = 1024;
+ pd->last_update = jiffies;
+ pd->p_term_inverse = 6000;
+ pd->d_term = 30;
+ pd->d_smooth = pd->d_term;
+ pd->backpressure = 1;
+}
+
+void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd)
+{
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 20);
+
+ prt_printf(out, "rate:");
+ prt_tab(out);
+ prt_human_readable_s64(out, pd->rate.rate);
+ prt_newline(out);
+
+ prt_printf(out, "target:");
+ prt_tab(out);
+ prt_human_readable_u64(out, pd->last_target);
+ prt_newline(out);
+
+ prt_printf(out, "actual:");
+ prt_tab(out);
+ prt_human_readable_u64(out, pd->last_actual);
+ prt_newline(out);
+
+ prt_printf(out, "proportional:");
+ prt_tab(out);
+ prt_human_readable_s64(out, pd->last_proportional);
+ prt_newline(out);
+
+ prt_printf(out, "derivative:");
+ prt_tab(out);
+ prt_human_readable_s64(out, pd->last_derivative);
+ prt_newline(out);
+
+ prt_printf(out, "change:");
+ prt_tab(out);
+ prt_human_readable_s64(out, pd->last_change);
+ prt_newline(out);
+
+ prt_printf(out, "next io:");
+ prt_tab(out);
+ prt_printf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC));
+ prt_newline(out);
+}
+
+/* misc: */
+
+void bch2_bio_map(struct bio *bio, void *base, size_t size)
+{
+ while (size) {
+ struct page *page = is_vmalloc_addr(base)
+ ? vmalloc_to_page(base)
+ : virt_to_page(base);
+ unsigned offset = offset_in_page(base);
+ unsigned len = min_t(size_t, PAGE_SIZE - offset, size);
+
+ BUG_ON(!bio_add_page(bio, page, len, offset));
+ size -= len;
+ base += len;
+ }
+}
+
+int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
+{
+ while (size) {
+ struct page *page = alloc_pages(gfp_mask, 0);
+ unsigned len = min_t(size_t, PAGE_SIZE, size);
+
+ if (!page)
+ return -ENOMEM;
+
+ if (unlikely(!bio_add_page(bio, page, len, 0))) {
+ __free_page(page);
+ break;
+ }
+
+ size -= len;
+ }
+
+ return 0;
+}
+
+size_t bch2_rand_range(size_t max)
+{
+ size_t rand;
+
+ if (!max)
+ return 0;
+
+ do {
+ rand = get_random_long();
+ rand &= roundup_pow_of_two(max) - 1;
+ } while (rand >= max);
+
+ return rand;
+}
+
+void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+
+ __bio_for_each_segment(bv, dst, iter, dst_iter) {
+ void *dstp = kmap_local_page(bv.bv_page);
+
+ memcpy(dstp + bv.bv_offset, src, bv.bv_len);
+ kunmap_local(dstp);
+
+ src += bv.bv_len;
+ }
+}
+
+void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+
+ __bio_for_each_segment(bv, src, iter, src_iter) {
+ void *srcp = kmap_local_page(bv.bv_page);
+
+ memcpy(dst, srcp + bv.bv_offset, bv.bv_len);
+ kunmap_local(srcp);
+
+ dst += bv.bv_len;
+ }
+}
+
+static int alignment_ok(const void *base, size_t align)
+{
+ return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
+ ((unsigned long)base & (align - 1)) == 0;
+}
+
+static void u32_swap(void *a, void *b, size_t size)
+{
+ u32 t = *(u32 *)a;
+ *(u32 *)a = *(u32 *)b;
+ *(u32 *)b = t;
+}
+
+static void u64_swap(void *a, void *b, size_t size)
+{
+ u64 t = *(u64 *)a;
+ *(u64 *)a = *(u64 *)b;
+ *(u64 *)b = t;
+}
+
+static void generic_swap(void *a, void *b, size_t size)
+{
+ char t;
+
+ do {
+ t = *(char *)a;
+ *(char *)a++ = *(char *)b;
+ *(char *)b++ = t;
+ } while (--size > 0);
+}
+
+static inline int do_cmp(void *base, size_t n, size_t size,
+ int (*cmp_func)(const void *, const void *, size_t),
+ size_t l, size_t r)
+{
+ return cmp_func(base + inorder_to_eytzinger0(l, n) * size,
+ base + inorder_to_eytzinger0(r, n) * size,
+ size);
+}
+
+static inline void do_swap(void *base, size_t n, size_t size,
+ void (*swap_func)(void *, void *, size_t),
+ size_t l, size_t r)
+{
+ swap_func(base + inorder_to_eytzinger0(l, n) * size,
+ base + inorder_to_eytzinger0(r, n) * size,
+ size);
+}
+
+void eytzinger0_sort(void *base, size_t n, size_t size,
+ int (*cmp_func)(const void *, const void *, size_t),
+ void (*swap_func)(void *, void *, size_t))
+{
+ int i, c, r;
+
+ if (!swap_func) {
+ if (size == 4 && alignment_ok(base, 4))
+ swap_func = u32_swap;
+ else if (size == 8 && alignment_ok(base, 8))
+ swap_func = u64_swap;
+ else
+ swap_func = generic_swap;
+ }
+
+ /* heapify */
+ for (i = n / 2 - 1; i >= 0; --i) {
+ for (r = i; r * 2 + 1 < n; r = c) {
+ c = r * 2 + 1;
+
+ if (c + 1 < n &&
+ do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
+ c++;
+
+ if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
+ break;
+
+ do_swap(base, n, size, swap_func, r, c);
+ }
+ }
+
+ /* sort */
+ for (i = n - 1; i > 0; --i) {
+ do_swap(base, n, size, swap_func, 0, i);
+
+ for (r = 0; r * 2 + 1 < i; r = c) {
+ c = r * 2 + 1;
+
+ if (c + 1 < i &&
+ do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
+ c++;
+
+ if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
+ break;
+
+ do_swap(base, n, size, swap_func, r, c);
+ }
+ }
+}
+
+void sort_cmp_size(void *base, size_t num, size_t size,
+ int (*cmp_func)(const void *, const void *, size_t),
+ void (*swap_func)(void *, void *, size_t size))
+{
+ /* pre-scale counters for performance */
+ int i = (num/2 - 1) * size, n = num * size, c, r;
+
+ if (!swap_func) {
+ if (size == 4 && alignment_ok(base, 4))
+ swap_func = u32_swap;
+ else if (size == 8 && alignment_ok(base, 8))
+ swap_func = u64_swap;
+ else
+ swap_func = generic_swap;
+ }
+
+ /* heapify */
+ for ( ; i >= 0; i -= size) {
+ for (r = i; r * 2 + size < n; r = c) {
+ c = r * 2 + size;
+ if (c < n - size &&
+ cmp_func(base + c, base + c + size, size) < 0)
+ c += size;
+ if (cmp_func(base + r, base + c, size) >= 0)
+ break;
+ swap_func(base + r, base + c, size);
+ }
+ }
+
+ /* sort */
+ for (i = n - size; i > 0; i -= size) {
+ swap_func(base, base + i, size);
+ for (r = 0; r * 2 + size < i; r = c) {
+ c = r * 2 + size;
+ if (c < i - size &&
+ cmp_func(base + c, base + c + size, size) < 0)
+ c += size;
+ if (cmp_func(base + r, base + c, size) >= 0)
+ break;
+ swap_func(base + r, base + c, size);
+ }
+ }
+}
+
+static void mempool_free_vp(void *element, void *pool_data)
+{
+ size_t size = (size_t) pool_data;
+
+ vpfree(element, size);
+}
+
+static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
+{
+ size_t size = (size_t) pool_data;
+
+ return vpmalloc(size, gfp_mask);
+}
+
+int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
+{
+ return size < PAGE_SIZE
+ ? mempool_init_kmalloc_pool(pool, min_nr, size)
+ : mempool_init(pool, min_nr, mempool_alloc_vp,
+ mempool_free_vp, (void *) size);
+}
+
+#if 0
+void eytzinger1_test(void)
+{
+ unsigned inorder, eytz, size;
+
+ pr_info("1 based eytzinger test:");
+
+ for (size = 2;
+ size < 65536;
+ size++) {
+ unsigned extra = eytzinger1_extra(size);
+
+ if (!(size % 4096))
+ pr_info("tree size %u", size);
+
+ BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size));
+ BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size));
+
+ BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0);
+ BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0);
+
+ inorder = 1;
+ eytzinger1_for_each(eytz, size) {
+ BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz);
+ BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder);
+ BUG_ON(eytz != eytzinger1_last(size) &&
+ eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz);
+
+ inorder++;
+ }
+ }
+}
+
+void eytzinger0_test(void)
+{
+ unsigned inorder, eytz, size;
+
+ pr_info("0 based eytzinger test:");
+
+ for (size = 1;
+ size < 65536;
+ size++) {
+ unsigned extra = eytzinger0_extra(size);
+
+ if (!(size % 4096))
+ pr_info("tree size %u", size);
+
+ BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size));
+ BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size));
+
+ BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1);
+ BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1);
+
+ inorder = 0;
+ eytzinger0_for_each(eytz, size) {
+ BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz);
+ BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder);
+ BUG_ON(eytz != eytzinger0_last(size) &&
+ eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz);
+
+ inorder++;
+ }
+ }
+}
+
+static inline int cmp_u16(const void *_l, const void *_r, size_t size)
+{
+ const u16 *l = _l, *r = _r;
+
+ return (*l > *r) - (*l < *r);
+}
+
+static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
+{
+ int i, c1 = -1, c2 = -1;
+ ssize_t r;
+
+ r = eytzinger0_find_le(test_array, nr,
+ sizeof(test_array[0]),
+ cmp_u16, &search);
+ if (r >= 0)
+ c1 = test_array[r];
+
+ for (i = 0; i < nr; i++)
+ if (test_array[i] <= search && test_array[i] > c2)
+ c2 = test_array[i];
+
+ if (c1 != c2) {
+ eytzinger0_for_each(i, nr)
+ pr_info("[%3u] = %12u", i, test_array[i]);
+ pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i",
+ search, r, c1, c2);
+ }
+}
+
+void eytzinger0_find_test(void)
+{
+ unsigned i, nr, allocated = 1 << 12;
+ u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL);
+
+ for (nr = 1; nr < allocated; nr++) {
+ pr_info("testing %u elems", nr);
+
+ get_random_bytes(test_array, nr * sizeof(test_array[0]));
+ eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL);
+
+ /* verify array is sorted correctly: */
+ eytzinger0_for_each(i, nr)
+ BUG_ON(i != eytzinger0_last(nr) &&
+ test_array[i] > test_array[eytzinger0_next(i, nr)]);
+
+ for (i = 0; i < U16_MAX; i += 1 << 12)
+ eytzinger0_find_test_val(test_array, nr, i);
+
+ for (i = 0; i < nr; i++) {
+ eytzinger0_find_test_val(test_array, nr, test_array[i] - 1);
+ eytzinger0_find_test_val(test_array, nr, test_array[i]);
+ eytzinger0_find_test_val(test_array, nr, test_array[i] + 1);
+ }
+ }
+
+ kfree(test_array);
+}
+#endif
+
+/*
+ * Accumulate percpu counters onto one cpu's copy - only valid when concurrent
+ * access to the percpu counters is otherwise excluded
+ */
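+/*
+ * Example (illustrative sketch): with an array of @nr percpu u64s whose
+ * writers are already excluded by the caller, the totals can be read back
+ * and printed as:
+ *
+ *	u64 *totals = bch2_acc_percpu_u64s(counters, nr);
+ *	unsigned i;
+ *
+ *	for (i = 0; i < nr; i++)
+ *		prt_printf(out, "%llu\n", totals[i]);
+ *
+ * counters, nr and out are hypothetical names; note that the other cpus'
+ * copies are zeroed as a side effect, so the totals end up on this cpu's
+ * copy.
+ */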
+u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
+{
+ u64 *ret;
+ int cpu;
+
+ /* access to pcpu vars has to be blocked by other locking */
+ preempt_disable();
+ ret = this_cpu_ptr(p);
+ preempt_enable();
+
+ for_each_possible_cpu(cpu) {
+ u64 *i = per_cpu_ptr(p, cpu);
+
+ if (i != ret) {
+ acc_u64s(ret, i, nr);
+ memset(i, 0, nr * sizeof(u64));
+ }
+ }
+
+ return ret;
+}
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
new file mode 100644
index 000000000000..b701f7fe0784
--- /dev/null
+++ b/fs/bcachefs/util.h
@@ -0,0 +1,834 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_UTIL_H
+#define _BCACHEFS_UTIL_H
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/closure.h>
+#include <linux/errno.h>
+#include <linux/freezer.h>
+#include <linux/kernel.h>
+#include <linux/sched/clock.h>
+#include <linux/llist.h>
+#include <linux/log2.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/ratelimit.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+
+#include "mean_and_variance.h"
+
+#include "darray.h"
+
+struct closure;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define EBUG_ON(cond) BUG_ON(cond)
+#else
+#define EBUG_ON(cond)
+#endif
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define CPU_BIG_ENDIAN 0
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define CPU_BIG_ENDIAN 1
+#endif
+
+/* type hackery */
+
+#define type_is_exact(_val, _type) \
+ __builtin_types_compatible_p(typeof(_val), _type)
+
+#define type_is(_val, _type) \
+ (__builtin_types_compatible_p(typeof(_val), _type) || \
+ __builtin_types_compatible_p(typeof(_val), const _type))
+
+/* Userspace doesn't align allocations as nicely as the kernel allocators: */
+static inline size_t buf_pages(void *p, size_t len)
+{
+ return DIV_ROUND_UP(len +
+ ((unsigned long) p & (PAGE_SIZE - 1)),
+ PAGE_SIZE);
+}
+
+static inline void vpfree(void *p, size_t size)
+{
+ if (is_vmalloc_addr(p))
+ vfree(p);
+ else
+ free_pages((unsigned long) p, get_order(size));
+}
+
+static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
+{
+ return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
+ get_order(size)) ?:
+ __vmalloc(size, gfp_mask);
+}
+
+static inline void kvpfree(void *p, size_t size)
+{
+ if (size < PAGE_SIZE)
+ kfree(p);
+ else
+ vpfree(p, size);
+}
+
+static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
+{
+ return size < PAGE_SIZE
+ ? kmalloc(size, gfp_mask)
+ : vpmalloc(size, gfp_mask);
+}
+
+int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
+
+#define HEAP(type) \
+struct { \
+ size_t size, used; \
+ type *data; \
+}
+
+#define DECLARE_HEAP(type, name) HEAP(type) name
+
+#define init_heap(heap, _size, gfp) \
+({ \
+ (heap)->used = 0; \
+ (heap)->size = (_size); \
+ (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\
+ (gfp)); \
+})
+
+#define free_heap(heap) \
+do { \
+ kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \
+ (heap)->data = NULL; \
+} while (0)
+
+#define heap_set_backpointer(h, i, _fn) \
+do { \
+ void (*fn)(typeof(h), size_t) = _fn; \
+ if (fn) \
+ fn(h, i); \
+} while (0)
+
+#define heap_swap(h, i, j, set_backpointer) \
+do { \
+ swap((h)->data[i], (h)->data[j]); \
+ heap_set_backpointer(h, i, set_backpointer); \
+ heap_set_backpointer(h, j, set_backpointer); \
+} while (0)
+
+#define heap_peek(h) \
+({ \
+ EBUG_ON(!(h)->used); \
+ (h)->data[0]; \
+})
+
+#define heap_full(h) ((h)->used == (h)->size)
+
+#define heap_sift_down(h, i, cmp, set_backpointer) \
+do { \
+ size_t _c, _j = i; \
+ \
+ for (; _j * 2 + 1 < (h)->used; _j = _c) { \
+ _c = _j * 2 + 1; \
+ if (_c + 1 < (h)->used && \
+ cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \
+ _c++; \
+ \
+ if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \
+ break; \
+ heap_swap(h, _c, _j, set_backpointer); \
+ } \
+} while (0)
+
+#define heap_sift_up(h, i, cmp, set_backpointer) \
+do { \
+ while (i) { \
+ size_t p = (i - 1) / 2; \
+ if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \
+ break; \
+ heap_swap(h, i, p, set_backpointer); \
+ i = p; \
+ } \
+} while (0)
+
+#define __heap_add(h, d, cmp, set_backpointer) \
+({ \
+ size_t _i = (h)->used++; \
+ (h)->data[_i] = d; \
+ heap_set_backpointer(h, _i, set_backpointer); \
+ \
+ heap_sift_up(h, _i, cmp, set_backpointer); \
+ _i; \
+})
+
+#define heap_add(h, d, cmp, set_backpointer) \
+({ \
+ bool _r = !heap_full(h); \
+ if (_r) \
+ __heap_add(h, d, cmp, set_backpointer); \
+ _r; \
+})
+
+#define heap_add_or_replace(h, new, cmp, set_backpointer) \
+do { \
+ if (!heap_add(h, new, cmp, set_backpointer) && \
+ cmp(h, new, heap_peek(h)) >= 0) { \
+ (h)->data[0] = new; \
+ heap_set_backpointer(h, 0, set_backpointer); \
+ heap_sift_down(h, 0, cmp, set_backpointer); \
+ } \
+} while (0)
+
+#define heap_del(h, i, cmp, set_backpointer) \
+do { \
+ size_t _i = (i); \
+ \
+ BUG_ON(_i >= (h)->used); \
+ (h)->used--; \
+ if ((_i) < (h)->used) { \
+ heap_swap(h, _i, (h)->used, set_backpointer); \
+ heap_sift_up(h, _i, cmp, set_backpointer); \
+ heap_sift_down(h, _i, cmp, set_backpointer); \
+ } \
+} while (0)
+
+#define heap_pop(h, d, cmp, set_backpointer) \
+({ \
+ bool _r = (h)->used; \
+ if (_r) { \
+ (d) = (h)->data[0]; \
+ heap_del(h, 0, cmp, set_backpointer); \
+ } \
+ _r; \
+})
+
+#define heap_resort(heap, cmp, set_backpointer) \
+do { \
+ ssize_t _i; \
+ for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \
+ heap_sift_down(heap, _i, cmp, set_backpointer); \
+} while (0)
+
+#define ANYSINT_MAX(t) \
+ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
+
+#include "printbuf.h"
+
+#define prt_vprintf(_out, ...) bch2_prt_vprintf(_out, __VA_ARGS__)
+#define prt_printf(_out, ...) bch2_prt_printf(_out, __VA_ARGS__)
+#define printbuf_str(_buf) bch2_printbuf_str(_buf)
+#define printbuf_exit(_buf) bch2_printbuf_exit(_buf)
+
+#define printbuf_tabstops_reset(_buf) bch2_printbuf_tabstops_reset(_buf)
+#define printbuf_tabstop_pop(_buf) bch2_printbuf_tabstop_pop(_buf)
+#define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n)
+
+#define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n)
+#define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n)
+
+#define prt_newline(_out) bch2_prt_newline(_out)
+#define prt_tab(_out) bch2_prt_tab(_out)
+#define prt_tab_rjust(_out) bch2_prt_tab_rjust(_out)
+
+#define prt_bytes_indented(...) bch2_prt_bytes_indented(__VA_ARGS__)
+#define prt_u64(_out, _v) prt_printf(_out, "%llu", (u64) (_v))
+#define prt_human_readable_u64(...) bch2_prt_human_readable_u64(__VA_ARGS__)
+#define prt_human_readable_s64(...) bch2_prt_human_readable_s64(__VA_ARGS__)
+#define prt_units_u64(...) bch2_prt_units_u64(__VA_ARGS__)
+#define prt_units_s64(...) bch2_prt_units_s64(__VA_ARGS__)
+#define prt_string_option(...) bch2_prt_string_option(__VA_ARGS__)
+#define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__)
+#define prt_bitflags_vector(...) bch2_prt_bitflags_vector(__VA_ARGS__)
+
+void bch2_pr_time_units(struct printbuf *, u64);
+void bch2_prt_datetime(struct printbuf *, time64_t);
+
+#ifdef __KERNEL__
+static inline void uuid_unparse_lower(u8 *uuid, char *out)
+{
+ sprintf(out, "%pUb", uuid);
+}
+#else
+#include <uuid/uuid.h>
+#endif
+
+static inline void pr_uuid(struct printbuf *out, u8 *uuid)
+{
+ char uuid_str[40];
+
+ uuid_unparse_lower(uuid, uuid_str);
+ prt_printf(out, "%s", uuid_str);
+}
+
+int bch2_strtoint_h(const char *, int *);
+int bch2_strtouint_h(const char *, unsigned int *);
+int bch2_strtoll_h(const char *, long long *);
+int bch2_strtoull_h(const char *, unsigned long long *);
+int bch2_strtou64_h(const char *, u64 *);
+
+static inline int bch2_strtol_h(const char *cp, long *res)
+{
+#if BITS_PER_LONG == 32
+ return bch2_strtoint_h(cp, (int *) res);
+#else
+ return bch2_strtoll_h(cp, (long long *) res);
+#endif
+}
+
+static inline int bch2_strtoul_h(const char *cp, long *res)
+{
+#if BITS_PER_LONG == 32
+ return bch2_strtouint_h(cp, (unsigned int *) res);
+#else
+ return bch2_strtoull_h(cp, (unsigned long long *) res);
+#endif
+}
+
+#define strtoi_h(cp, res) \
+ ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\
+ : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\
+ : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\
+ : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\
+ : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\
+ : type_is(*res, unsigned long long) ? bch2_strtoull_h(cp, (void *) res)\
+ : -EINVAL)
+
+#define strtoul_safe(cp, var) \
+({ \
+ unsigned long _v; \
+ int _r = kstrtoul(cp, 10, &_v); \
+ if (!_r) \
+ var = _v; \
+ _r; \
+})
+
+#define strtoul_safe_clamp(cp, var, min, max) \
+({ \
+ unsigned long _v; \
+ int _r = kstrtoul(cp, 10, &_v); \
+ if (!_r) \
+ var = clamp_t(typeof(var), _v, min, max); \
+ _r; \
+})
+
+#define strtoul_safe_restrict(cp, var, min, max) \
+({ \
+ unsigned long _v; \
+ int _r = kstrtoul(cp, 10, &_v); \
+ if (!_r && _v >= min && _v <= max) \
+ var = _v; \
+ else \
+ _r = -EINVAL; \
+ _r; \
+})
+
+#define snprint(out, var) \
+ prt_printf(out, \
+ type_is(var, int) ? "%i\n" \
+ : type_is(var, unsigned) ? "%u\n" \
+ : type_is(var, long) ? "%li\n" \
+ : type_is(var, unsigned long) ? "%lu\n" \
+ : type_is(var, s64) ? "%lli\n" \
+ : type_is(var, u64) ? "%llu\n" \
+ : type_is(var, char *) ? "%s\n" \
+ : "%i\n", var)
+
+bool bch2_is_zero(const void *, size_t);
+
+u64 bch2_read_flag_list(char *, const char * const[]);
+
+void bch2_prt_u64_binary(struct printbuf *, u64, unsigned);
+
+void bch2_print_string_as_lines(const char *prefix, const char *lines);
+
+typedef DARRAY(unsigned long) bch_stacktrace;
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *);
+void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *);
+int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *);
+
+#define NR_QUANTILES 15
+#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
+#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
+#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
+
+struct bch2_quantiles {
+ struct bch2_quantile_entry {
+ u64 m;
+ u64 step;
+ } entries[NR_QUANTILES];
+};
+
+struct bch2_time_stat_buffer {
+ unsigned nr;
+ struct bch2_time_stat_buffer_entry {
+ u64 start;
+ u64 end;
+ } entries[32];
+};
+
+struct bch2_time_stats {
+ spinlock_t lock;
+ /* all fields are in nanoseconds */
+ u64 max_duration;
+ u64 min_duration;
+ u64 max_freq;
+ u64 min_freq;
+ u64 last_event;
+ struct bch2_quantiles quantiles;
+
+ struct mean_and_variance duration_stats;
+ struct mean_and_variance_weighted duration_stats_weighted;
+ struct mean_and_variance freq_stats;
+ struct mean_and_variance_weighted freq_stats_weighted;
+ struct bch2_time_stat_buffer __percpu *buffer;
+};
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
+#else
+static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
+#endif
+
+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
+{
+ __bch2_time_stats_update(stats, start, local_clock());
+}
+
+void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
+
+void bch2_time_stats_exit(struct bch2_time_stats *);
+void bch2_time_stats_init(struct bch2_time_stats *);
+
+#define ewma_add(ewma, val, weight) \
+({ \
+ typeof(ewma) _ewma = (ewma); \
+ typeof(weight) _weight = (weight); \
+ \
+ (((_ewma << _weight) - _ewma) + (val)) >> _weight; \
+})
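+
ewma_add() keeps an exponentially weighted moving average where the weight is a power-of-two shift: new = (old * (2^weight - 1) + val) >> weight. A minimal user-space sketch of the same arithmetic (the helper name and values are illustrative):

#include <stdio.h>
#include <stdint.h>

/* user-space copy of the ewma_add() arithmetic; weight is a shift amount */
static uint64_t ewma_add_sketch(uint64_t ewma, uint64_t val, unsigned weight)
{
	return (((ewma << weight) - ewma) + val) >> weight;
}

int main(void)
{
	uint64_t avg = 100;

	/* weight 3: each new sample contributes 1/8th of the average */
	avg = ewma_add_sketch(avg, 200, 3);	/* (100 * 7 + 200) / 8 = 112 */
	printf("%llu\n", (unsigned long long) avg);
	return 0;
}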
+
+struct bch_ratelimit {
+ /* Next time we want to do some work, in nanoseconds */
+ u64 next;
+
+ /*
+ * Rate at which we want to do work, in units per nanosecond.
+ * The units here correspond to the units passed to
+ * bch2_ratelimit_increment().
+ */
+ unsigned rate;
+};
+
+static inline void bch2_ratelimit_reset(struct bch_ratelimit *d)
+{
+ d->next = local_clock();
+}
+
+u64 bch2_ratelimit_delay(struct bch_ratelimit *);
+void bch2_ratelimit_increment(struct bch_ratelimit *, u64);
+
+struct bch_pd_controller {
+ struct bch_ratelimit rate;
+ unsigned long last_update;
+
+ s64 last_actual;
+ s64 smoothed_derivative;
+
+ unsigned p_term_inverse;
+ unsigned d_smooth;
+ unsigned d_term;
+
+ /* for exporting to sysfs (no effect on behavior) */
+ s64 last_derivative;
+ s64 last_proportional;
+ s64 last_change;
+ s64 last_target;
+
+ /*
+ * If true, the rate will not increase if bch2_ratelimit_delay()
+ * is not being called often enough.
+ */
+ bool backpressure;
+};
+
+void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int);
+void bch2_pd_controller_init(struct bch_pd_controller *);
+void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *);
+
+#define sysfs_pd_controller_attribute(name) \
+ rw_attribute(name##_rate); \
+ rw_attribute(name##_rate_bytes); \
+ rw_attribute(name##_rate_d_term); \
+ rw_attribute(name##_rate_p_term_inverse); \
+ read_attribute(name##_rate_debug)
+
+#define sysfs_pd_controller_files(name) \
+ &sysfs_##name##_rate, \
+ &sysfs_##name##_rate_bytes, \
+ &sysfs_##name##_rate_d_term, \
+ &sysfs_##name##_rate_p_term_inverse, \
+ &sysfs_##name##_rate_debug
+
+#define sysfs_pd_controller_show(name, var) \
+do { \
+ sysfs_hprint(name##_rate, (var)->rate.rate); \
+ sysfs_print(name##_rate_bytes, (var)->rate.rate); \
+ sysfs_print(name##_rate_d_term, (var)->d_term); \
+ sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \
+ \
+ if (attr == &sysfs_##name##_rate_debug) \
+ bch2_pd_controller_debug_to_text(out, var); \
+} while (0)
+
+#define sysfs_pd_controller_store(name, var) \
+do { \
+ sysfs_strtoul_clamp(name##_rate, \
+ (var)->rate.rate, 1, UINT_MAX); \
+ sysfs_strtoul_clamp(name##_rate_bytes, \
+ (var)->rate.rate, 1, UINT_MAX); \
+ sysfs_strtoul(name##_rate_d_term, (var)->d_term); \
+ sysfs_strtoul_clamp(name##_rate_p_term_inverse, \
+ (var)->p_term_inverse, 1, INT_MAX); \
+} while (0)
+
+#define container_of_or_null(ptr, type, member) \
+({ \
+ typeof(ptr) _ptr = ptr; \
+ _ptr ? container_of(_ptr, type, member) : NULL; \
+})
+
+/* Does linear interpolation between powers of two */
+static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
+{
+ unsigned fract = x & ~(~0 << fract_bits);
+
+ x >>= fract_bits;
+ x = 1 << x;
+ x += (x * fract) >> fract_bits;
+
+ return x;
+}
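+
The comment above describes linear interpolation between powers of two: the high bits of x select the power, and the low fract_bits bits interpolate toward the next one. A hedged user-space sketch of the same computation (the input value is illustrative):

#include <stdio.h>

/* user-space copy of the interpolation done by fract_exp_two() */
static unsigned fract_exp_two_sketch(unsigned x, unsigned fract_bits)
{
	unsigned fract = x & ~(~0U << fract_bits);

	x >>= fract_bits;
	x = 1U << x;
	x += (x * fract) >> fract_bits;

	return x;
}

int main(void)
{
	/* exponent 3, fraction 128/256: halfway between 2^3 and 2^4 */
	printf("%u\n", fract_exp_two_sketch((3 << 8) | 128, 8));	/* prints 12 */
	return 0;
}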
+
+void bch2_bio_map(struct bio *bio, void *base, size_t);
+int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
+
+static inline sector_t bdev_sectors(struct block_device *bdev)
+{
+ return bdev->bd_inode->i_size >> 9;
+}
+
+#define closure_bio_submit(bio, cl) \
+do { \
+ closure_get(cl); \
+ submit_bio(bio); \
+} while (0)
+
+#define kthread_wait(cond) \
+({ \
+ int _ret = 0; \
+ \
+ while (1) { \
+ set_current_state(TASK_INTERRUPTIBLE); \
+ if (kthread_should_stop()) { \
+ _ret = -1; \
+ break; \
+ } \
+ \
+ if (cond) \
+ break; \
+ \
+ schedule(); \
+ } \
+ set_current_state(TASK_RUNNING); \
+ _ret; \
+})
+
+#define kthread_wait_freezable(cond) \
+({ \
+ int _ret = 0; \
+ while (1) { \
+ set_current_state(TASK_INTERRUPTIBLE); \
+ if (kthread_should_stop()) { \
+ _ret = -1; \
+ break; \
+ } \
+ \
+ if (cond) \
+ break; \
+ \
+ schedule(); \
+ try_to_freeze(); \
+ } \
+ set_current_state(TASK_RUNNING); \
+ _ret; \
+})
+
+size_t bch2_rand_range(size_t);
+
+void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
+void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
+
+static inline void memcpy_u64s_small(void *dst, const void *src,
+ unsigned u64s)
+{
+ u64 *d = dst;
+ const u64 *s = src;
+
+ while (u64s--)
+ *d++ = *s++;
+}
+
+static inline void __memcpy_u64s(void *dst, const void *src,
+ unsigned u64s)
+{
+#ifdef CONFIG_X86_64
+ long d0, d1, d2;
+
+ asm volatile("rep ; movsq"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (u64s), "1" (dst), "2" (src)
+ : "memory");
+#else
+ u64 *d = dst;
+ const u64 *s = src;
+
+ while (u64s--)
+ *d++ = *s++;
+#endif
+}
+
+static inline void memcpy_u64s(void *dst, const void *src,
+ unsigned u64s)
+{
+ EBUG_ON(!(dst >= src + u64s * sizeof(u64) ||
+ dst + u64s * sizeof(u64) <= src));
+
+ __memcpy_u64s(dst, src, u64s);
+}
+
+static inline void __memmove_u64s_down(void *dst, const void *src,
+ unsigned u64s)
+{
+ __memcpy_u64s(dst, src, u64s);
+}
+
+static inline void memmove_u64s_down(void *dst, const void *src,
+ unsigned u64s)
+{
+ EBUG_ON(dst > src);
+
+ __memmove_u64s_down(dst, src, u64s);
+}
+
+static inline void __memmove_u64s_down_small(void *dst, const void *src,
+ unsigned u64s)
+{
+ memcpy_u64s_small(dst, src, u64s);
+}
+
+static inline void memmove_u64s_down_small(void *dst, const void *src,
+ unsigned u64s)
+{
+ EBUG_ON(dst > src);
+
+ __memmove_u64s_down_small(dst, src, u64s);
+}
+
+static inline void __memmove_u64s_up_small(void *_dst, const void *_src,
+ unsigned u64s)
+{
+ u64 *dst = (u64 *) _dst + u64s;
+ u64 *src = (u64 *) _src + u64s;
+
+ while (u64s--)
+ *--dst = *--src;
+}
+
+static inline void memmove_u64s_up_small(void *dst, const void *src,
+ unsigned u64s)
+{
+ EBUG_ON(dst < src);
+
+ __memmove_u64s_up_small(dst, src, u64s);
+}
+
+static inline void __memmove_u64s_up(void *_dst, const void *_src,
+ unsigned u64s)
+{
+ u64 *dst = (u64 *) _dst + u64s - 1;
+ u64 *src = (u64 *) _src + u64s - 1;
+
+#ifdef CONFIG_X86_64
+ long d0, d1, d2;
+
+ asm volatile("std ;\n"
+ "rep ; movsq\n"
+ "cld ;\n"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (u64s), "1" (dst), "2" (src)
+ : "memory");
+#else
+ while (u64s--)
+ *dst-- = *src--;
+#endif
+}
+
+static inline void memmove_u64s_up(void *dst, const void *src,
+ unsigned u64s)
+{
+ EBUG_ON(dst < src);
+
+ __memmove_u64s_up(dst, src, u64s);
+}
+
+static inline void memmove_u64s(void *dst, const void *src,
+ unsigned u64s)
+{
+ if (dst < src)
+ __memmove_u64s_down(dst, src, u64s);
+ else
+ __memmove_u64s_up(dst, src, u64s);
+}
+
+/* Set the last few bytes up to a u64 boundary given an offset into a buffer. */
+static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
+{
+ unsigned rem = round_up(bytes, sizeof(u64)) - bytes;
+
+ memset(s + bytes, c, rem);
+}
+
+void sort_cmp_size(void *base, size_t num, size_t size,
+ int (*cmp_func)(const void *, const void *, size_t),
+ void (*swap_func)(void *, void *, size_t));
+
+/* just the memmove, doesn't update @_nr */
+#define __array_insert_item(_array, _nr, _pos) \
+ memmove(&(_array)[(_pos) + 1], \
+ &(_array)[(_pos)], \
+ sizeof((_array)[0]) * ((_nr) - (_pos)))
+
+#define array_insert_item(_array, _nr, _pos, _new_item) \
+do { \
+ __array_insert_item(_array, _nr, _pos); \
+ (_nr)++; \
+ (_array)[(_pos)] = (_new_item); \
+} while (0)
+
+#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \
+do { \
+ (_nr) -= (_nr_to_remove); \
+ memmove(&(_array)[(_pos)], \
+ &(_array)[(_pos) + (_nr_to_remove)], \
+ sizeof((_array)[0]) * ((_nr) - (_pos))); \
+} while (0)
+
+#define array_remove_item(_array, _nr, _pos) \
+ array_remove_items(_array, _nr, _pos, 1)
+
+static inline void __move_gap(void *array, size_t element_size,
+ size_t nr, size_t size,
+ size_t old_gap, size_t new_gap)
+{
+ size_t gap_end = old_gap + size - nr;
+
+ if (new_gap < old_gap) {
+ size_t move = old_gap - new_gap;
+
+ memmove(array + element_size * (gap_end - move),
+ array + element_size * (old_gap - move),
+ element_size * move);
+ } else if (new_gap > old_gap) {
+ size_t move = new_gap - old_gap;
+
+ memmove(array + element_size * old_gap,
+ array + element_size * gap_end,
+ element_size * move);
+ }
+}
+
+/* Move the gap in a gap buffer: */
+#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \
+ __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap)
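+
A gap buffer keeps nr live elements in an array of _size slots, with the unused slots forming a contiguous gap; __move_gap() relocates that gap with a single memmove of the elements that cross it. A minimal user-space sketch (element type and positions are illustrative):

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* 6 live chars in an 8-slot buffer, gap (the dots) currently at index 6 */
	char buf[8] = { 'a', 'b', 'c', 'd', 'e', 'f', '.', '.' };
	size_t nr = 6, size = 8, old_gap = 6, new_gap = 2;

	/* same memmove __move_gap() does when the gap moves left */
	size_t gap_end = old_gap + size - nr;	/* 8 */
	size_t move = old_gap - new_gap;	/* 4 */

	memmove(buf + gap_end - move, buf + old_gap - move, move);

	/* live elements c..f now sit after the gap; buf[2..3] is the stale gap */
	printf("%.*s|gap|%.*s\n", (int) new_gap, buf,
	       (int) (nr - new_gap), buf + new_gap + (size - nr));
	return 0;
}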
+
+#define bubble_sort(_base, _nr, _cmp) \
+do { \
+ ssize_t _i, _last; \
+ bool _swapped = true; \
+ \
+ for (_last = (ssize_t) (_nr) - 1; _last > 0 && _swapped; --_last) {\
+ _swapped = false; \
+ for (_i = 0; _i < _last; _i++) \
+ if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \
+ swap((_base)[_i], (_base)[_i + 1]); \
+ _swapped = true; \
+ } \
+ } \
+} while (0)
+
+static inline u64 percpu_u64_get(u64 __percpu *src)
+{
+ u64 ret = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ ret += *per_cpu_ptr(src, cpu);
+ return ret;
+}
+
+static inline void percpu_u64_set(u64 __percpu *dst, u64 src)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ *per_cpu_ptr(dst, cpu) = 0;
+ this_cpu_write(*dst, src);
+}
+
+static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr)
+{
+ unsigned i;
+
+ for (i = 0; i < nr; i++)
+ acc[i] += src[i];
+}
+
+static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src,
+ unsigned nr)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ acc_u64s(acc, per_cpu_ptr(src, cpu), nr);
+}
+
+static inline void percpu_memset(void __percpu *p, int c, size_t bytes)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ memset(per_cpu_ptr(p, cpu), c, bytes);
+}
+
+u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
+
+#define cmp_int(l, r) ((l > r) - (l < r))
+
+static inline int u8_cmp(u8 l, u8 r)
+{
+ return cmp_int(l, r);
+}
+
+static inline int cmp_le32(__le32 l, __le32 r)
+{
+ return cmp_int(le32_to_cpu(l), le32_to_cpu(r));
+}
+
+#include <linux/uuid.h>
+
+#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c
new file mode 100644
index 000000000000..cb4f33ed9ab3
--- /dev/null
+++ b/fs/bcachefs/varint.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bitops.h>
+#include <linux/math.h>
+#include <linux/string.h>
+#include <asm/unaligned.h>
+
+#ifdef CONFIG_VALGRIND
+#include <valgrind/memcheck.h>
+#endif
+
+#include "varint.h"
+
+/**
+ * bch2_varint_encode - encode a variable length integer
+ * @out: destination to encode to
+ * @v: unsigned integer to encode
+ * Returns: size in bytes of the encoded integer - at most 9 bytes
+ */
+int bch2_varint_encode(u8 *out, u64 v)
+{
+ unsigned bits = fls64(v|1);
+ unsigned bytes = DIV_ROUND_UP(bits, 7);
+ __le64 v_le;
+
+ if (likely(bytes < 9)) {
+ v <<= bytes;
+ v |= ~(~0 << (bytes - 1));
+ v_le = cpu_to_le64(v);
+ memcpy(out, &v_le, bytes);
+ } else {
+ *out++ = 255;
+ bytes = 9;
+ put_unaligned_le64(v, out);
+ }
+
+ return bytes;
+}
+
+/**
+ * bch2_varint_decode - decode a variable length integer
+ * @in: varint to decode
+ * @end: end of buffer to decode from
+ * @out: on success, decoded integer
+ * Returns: size in bytes of the decoded integer - or -1 on failure (would
+ * have read past the end of the buffer)
+ */
+int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
+{
+ unsigned bytes = likely(in < end)
+ ? ffz(*in & 255) + 1
+ : 1;
+ u64 v;
+
+ if (unlikely(in + bytes > end))
+ return -1;
+
+ if (likely(bytes < 9)) {
+ __le64 v_le = 0;
+
+ memcpy(&v_le, in, bytes);
+ v = le64_to_cpu(v_le);
+ v >>= bytes;
+ } else {
+ v = get_unaligned_le64(++in);
+ }
+
+ *out = v;
+ return bytes;
+}
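+
The encoding stores the length in the low bits of the first byte: an n-byte value has n - 1 trailing one-bits followed by a zero, with the payload shifted up past them; values wider than 56 bits fall back to a 0xff prefix followed by the raw 8-byte little-endian value. For example, 300 needs 9 significant bits, hence 2 bytes: (300 << 2) | 0b01 = 0x4b1, stored as b1 04, and the decoder sees one trailing one-bit in 0xb1, reads 2 bytes and shifts right by 2. A user-space sketch of the short form only (assumes a little-endian host and a value that fits in 56 bits):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* sketch of the short-form encoding used by bch2_varint_encode() */
static unsigned varint_encode_sketch(uint8_t *out, uint64_t v)
{
	unsigned bits = 64 - __builtin_clzll(v | 1);	/* stand-in for fls64() */
	unsigned bytes = (bits + 6) / 7;		/* DIV_ROUND_UP(bits, 7) */
	uint64_t le;

	v = (v << bytes) | ((1ULL << (bytes - 1)) - 1);	/* bytes - 1 trailing ones */
	le = v;						/* assumes little-endian host */
	memcpy(out, &le, bytes);
	return bytes;
}

int main(void)
{
	uint8_t buf[8];
	unsigned n = varint_encode_sketch(buf, 300);

	printf("%u bytes: %02x %02x\n", n, buf[0], buf[1]);	/* 2 bytes: b1 04 */
	return 0;
}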
+
+/**
+ * bch2_varint_encode_fast - fast version of bch2_varint_encode
+ * @out: destination to encode to
+ * @v: unsigned integer to encode
+ * Returns: size in bytes of the encoded integer - at most 9 bytes
+ *
+ * This version assumes it's always safe to write 8 bytes to @out, even if the
+ * encoded integer would be smaller.
+ */
+int bch2_varint_encode_fast(u8 *out, u64 v)
+{
+ unsigned bits = fls64(v|1);
+ unsigned bytes = DIV_ROUND_UP(bits, 7);
+
+ if (likely(bytes < 9)) {
+ v <<= bytes;
+ v |= ~(~0 << (bytes - 1));
+ } else {
+ *out++ = 255;
+ bytes = 9;
+ }
+
+ put_unaligned_le64(v, out);
+ return bytes;
+}
+
+/**
+ * bch2_varint_decode_fast - fast version of bch2_varint_decode
+ * @in: varint to decode
+ * @end: end of buffer to decode from
+ * @out: on success, decoded integer
+ * Returns: size in bytes of the decoded integer - or -1 on failure (would
+ * have read past the end of the buffer)
+ *
+ * This version assumes that it is safe to read at most 8 bytes past the end of
+ * @end (we still return an error if the varint extends past @end).
+ */
+int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out)
+{
+#ifdef CONFIG_VALGRIND
+ VALGRIND_MAKE_MEM_DEFINED(in, 8);
+#endif
+ u64 v = get_unaligned_le64(in);
+ unsigned bytes = ffz(*in) + 1;
+
+ if (unlikely(in + bytes > end))
+ return -1;
+
+ if (likely(bytes < 9)) {
+ v >>= bytes;
+ v &= ~(~0ULL << (7 * bytes));
+ } else {
+ v = get_unaligned_le64(++in);
+ }
+
+ *out = v;
+ return bytes;
+}
diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h
new file mode 100644
index 000000000000..92a182fb3d7a
--- /dev/null
+++ b/fs/bcachefs/varint.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_VARINT_H
+#define _BCACHEFS_VARINT_H
+
+int bch2_varint_encode(u8 *, u64);
+int bch2_varint_decode(const u8 *, const u8 *, u64 *);
+
+int bch2_varint_encode_fast(u8 *, u64);
+int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *);
+
+#endif /* _BCACHEFS_VARINT_H */
diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h
new file mode 100644
index 000000000000..a6561b4b36a6
--- /dev/null
+++ b/fs/bcachefs/vstructs.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _VSTRUCTS_H
+#define _VSTRUCTS_H
+
+#include "util.h"
+
+/*
+ * NOTE: we can't differentiate between __le64 and u64 with type_is - this
+ * assumes u64 is little endian:
+ */
+#define __vstruct_u64s(_s) \
+({ \
+ ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \
+ : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \
+ : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s) \
+ : ((__force u8) ((_s)->u64s))); \
+})
+
+#define __vstruct_bytes(_type, _u64s) \
+({ \
+ BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \
+ \
+ (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \
+})
+
+#define vstruct_bytes(_s) \
+ __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s))
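+
The vstruct helpers treat an on-disk structure as a fixed header whose u64s field gives the length of a trailing _data[] payload in 64-bit words, so vstruct_bytes() is just offsetof(_data) plus u64s * 8. A hedged user-space sketch with an illustrative header layout (not a real bcachefs structure):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* illustrative on-disk header: a u64s count followed by variable-length data */
struct vstruct_example {
	uint16_t	u64s;		/* length of _data, in u64s */
	uint16_t	flags;
	uint32_t	pad;
	uint64_t	_data[];
};

int main(void)
{
	struct vstruct_example hdr = { .u64s = 3 };

	/* __vstruct_bytes(): header size plus u64s * 8 bytes of payload */
	size_t bytes = offsetof(struct vstruct_example, _data) +
		       hdr.u64s * sizeof(uint64_t);

	printf("%zu bytes\n", bytes);	/* 8 + 24 = 32 */
	return 0;
}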
+
+#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \
+ (round_up(__vstruct_bytes(_type, _u64s), \
+ 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits)))
+
+#define vstruct_blocks(_s, _sector_block_bits) \
+ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s))
+
+#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \
+ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \
+ __vstruct_u64s(_s) + (_u64s))
+
+#define vstruct_sectors(_s, _sector_block_bits) \
+ (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9)
+
+#define vstruct_next(_s) \
+ ((typeof(_s)) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
+#define vstruct_last(_s) \
+ ((typeof(&(_s)->start[0])) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
+#define vstruct_end(_s) \
+ ((void *) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
+
+#define vstruct_for_each(_s, _i) \
+ for (_i = (_s)->start; \
+ _i < vstruct_last(_s); \
+ _i = vstruct_next(_i))
+
+#define vstruct_for_each_safe(_s, _i, _t) \
+ for (_i = (_s)->start; \
+ _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \
+ _i = _t)
+
+#define vstruct_idx(_s, _idx) \
+ ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx)))
+
+#endif /* _VSTRUCTS_H */
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
new file mode 100644
index 000000000000..5a1858fb9879
--- /dev/null
+++ b/fs/bcachefs/xattr.c
@@ -0,0 +1,653 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "acl.h"
+#include "bkey_methods.h"
+#include "btree_update.h"
+#include "extents.h"
+#include "fs.h"
+#include "rebalance.h"
+#include "str_hash.h"
+#include "xattr.h"
+
+#include <linux/dcache.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned);
+
+static u64 bch2_xattr_hash(const struct bch_hash_info *info,
+ const struct xattr_search_key *key)
+{
+ struct bch_str_hash_ctx ctx;
+
+ bch2_str_hash_init(&ctx, info);
+ bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type));
+ bch2_str_hash_update(&ctx, info, key->name.name, key->name.len);
+
+ return bch2_str_hash_end(&ctx, info);
+}
+
+static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key)
+{
+ return bch2_xattr_hash(info, key);
+}
+
+static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
+{
+ struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k);
+
+ return bch2_xattr_hash(info,
+ &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len));
+}
+
+static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
+{
+ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
+ const struct xattr_search_key *r = _r;
+
+ return l.v->x_type != r->type ||
+ l.v->x_name_len != r->name.len ||
+ memcmp(l.v->x_name, r->name.name, r->name.len);
+}
+
+static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
+{
+ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
+ struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r);
+
+ return l.v->x_type != r.v->x_type ||
+ l.v->x_name_len != r.v->x_name_len ||
+ memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len);
+}
+
+const struct bch_hash_desc bch2_xattr_hash_desc = {
+ .btree_id = BTREE_ID_xattrs,
+ .key_type = KEY_TYPE_xattr,
+ .hash_key = xattr_hash_key,
+ .hash_bkey = xattr_hash_bkey,
+ .cmp_key = xattr_cmp_key,
+ .cmp_bkey = xattr_cmp_bkey,
+};
+
+int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
+{
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
+ unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len,
+ le16_to_cpu(xattr.v->x_val_len));
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, c, err,
+ xattr_val_size_too_small,
+ "value too small (%zu < %u)",
+ bkey_val_u64s(k.k), val_u64s);
+
+ /* XXX why +4 ? */
+ val_u64s = xattr_val_u64s(xattr.v->x_name_len,
+ le16_to_cpu(xattr.v->x_val_len) + 4);
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, c, err,
+ xattr_val_size_too_big,
+ "value too big (%zu > %u)",
+ bkey_val_u64s(k.k), val_u64s);
+
+ bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), c, err,
+ xattr_invalid_type,
+ "invalid type (%u)", xattr.v->x_type);
+
+ bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), c, err,
+ xattr_name_invalid_chars,
+ "xattr name has invalid characters");
+fsck_err:
+ return ret;
+}
+
+void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ const struct xattr_handler *handler;
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
+
+ handler = bch2_xattr_type_to_handler(xattr.v->x_type);
+ if (handler && handler->prefix)
+ prt_printf(out, "%s", handler->prefix);
+ else if (handler)
+ prt_printf(out, "(type %u)", xattr.v->x_type);
+ else
+ prt_printf(out, "(unknown type %u)", xattr.v->x_type);
+
+ prt_printf(out, "%.*s:%.*s",
+ xattr.v->x_name_len,
+ xattr.v->x_name,
+ le16_to_cpu(xattr.v->x_val_len),
+ (char *) xattr_val(xattr.v));
+
+ if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
+ xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) {
+ prt_char(out, ' ');
+ bch2_acl_to_text(out, xattr_val(xattr.v),
+ le16_to_cpu(xattr.v->x_val_len));
+ }
+}
+
+static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode,
+ const char *name, void *buffer, size_t size, int type)
+{
+ struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode);
+ struct xattr_search_key search = X_SEARCH(type, name, strlen(name));
+ struct btree_iter iter;
+ struct bkey_s_c_xattr xattr;
+ struct bkey_s_c k;
+ int ret;
+
+ ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
+ inode_inum(inode), &search, 0);
+ if (ret)
+ goto err1;
+
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err2;
+
+ xattr = bkey_s_c_to_xattr(k);
+ ret = le16_to_cpu(xattr.v->x_val_len);
+ if (buffer) {
+ if (ret > size)
+ ret = -ERANGE;
+ else
+ memcpy(buffer, xattr_val(xattr.v), ret);
+ }
+err2:
+ bch2_trans_iter_exit(trans, &iter);
+err1:
+ return ret < 0 && bch2_err_matches(ret, ENOENT) ? -ENODATA : ret;
+}
+
+int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
+ struct bch_inode_unpacked *inode_u,
+ const struct bch_hash_info *hash_info,
+ const char *name, const void *value, size_t size,
+ int type, int flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter inode_iter = { NULL };
+ int ret;
+
+ ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?:
+ bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
+ if (ret)
+ return ret;
+
+ inode_u->bi_ctime = bch2_current_time(c);
+
+ ret = bch2_inode_write(trans, &inode_iter, inode_u);
+ bch2_trans_iter_exit(trans, &inode_iter);
+
+ if (ret)
+ return ret;
+
+ if (value) {
+ struct bkey_i_xattr *xattr;
+ unsigned namelen = strlen(name);
+ unsigned u64s = BKEY_U64s +
+ xattr_val_u64s(namelen, size);
+
+ if (u64s > U8_MAX)
+ return -ERANGE;
+
+ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+ if (IS_ERR(xattr))
+ return PTR_ERR(xattr);
+
+ bkey_xattr_init(&xattr->k_i);
+ xattr->k.u64s = u64s;
+ xattr->v.x_type = type;
+ xattr->v.x_name_len = namelen;
+ xattr->v.x_val_len = cpu_to_le16(size);
+ memcpy(xattr->v.x_name, name, namelen);
+ memcpy(xattr_val(&xattr->v), value, size);
+
+ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
+ inum, &xattr->k_i,
+ (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
+ (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0));
+ } else {
+ struct xattr_search_key search =
+ X_SEARCH(type, name, strlen(name));
+
+ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc,
+ hash_info, inum, &search);
+ }
+
+ if (bch2_err_matches(ret, ENOENT))
+ ret = flags & XATTR_REPLACE ? -ENODATA : 0;
+
+ return ret;
+}
+
+struct xattr_buf {
+ char *buf;
+ size_t len;
+ size_t used;
+};
+
+static int __bch2_xattr_emit(const char *prefix,
+ const char *name, size_t name_len,
+ struct xattr_buf *buf)
+{
+ const size_t prefix_len = strlen(prefix);
+ const size_t total_len = prefix_len + name_len + 1;
+
+ if (buf->buf) {
+ if (buf->used + total_len > buf->len)
+ return -ERANGE;
+
+ memcpy(buf->buf + buf->used, prefix, prefix_len);
+ memcpy(buf->buf + buf->used + prefix_len,
+ name, name_len);
+ buf->buf[buf->used + prefix_len + name_len] = '\0';
+ }
+
+ buf->used += total_len;
+ return 0;
+}
+
+static int bch2_xattr_emit(struct dentry *dentry,
+ const struct bch_xattr *xattr,
+ struct xattr_buf *buf)
+{
+ const struct xattr_handler *handler =
+ bch2_xattr_type_to_handler(xattr->x_type);
+
+ return handler && (!handler->list || handler->list(dentry))
+ ? __bch2_xattr_emit(handler->prefix ?: handler->name,
+ xattr->x_name, xattr->x_name_len, buf)
+ : 0;
+}
+
+static int bch2_xattr_list_bcachefs(struct bch_fs *c,
+ struct bch_inode_unpacked *inode,
+ struct xattr_buf *buf,
+ bool all)
+{
+ const char *prefix = all ? "bcachefs_effective." : "bcachefs.";
+ unsigned id;
+ int ret = 0;
+ u64 v;
+
+ for (id = 0; id < Inode_opt_nr; id++) {
+ v = bch2_inode_opt_get(inode, id);
+ if (!v)
+ continue;
+
+ if (!all &&
+ !(inode->bi_fields_set & (1 << id)))
+ continue;
+
+ ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id],
+ strlen(bch2_inode_opts[id]), buf);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+ struct bch_fs *c = dentry->d_sb->s_fs_info;
+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
+ u64 offset = 0, inum = inode->ei_inode.bi_inum;
+ u32 snapshot;
+ int ret;
+retry:
+ bch2_trans_begin(trans);
+ iter = (struct btree_iter) { NULL };
+
+ ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs,
+ SPOS(inum, offset, snapshot),
+ POS(inum, U64_MAX), 0, k, ret) {
+ if (k.k->type != KEY_TYPE_xattr)
+ continue;
+
+ ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
+ if (ret)
+ break;
+ }
+
+ offset = iter.pos.offset;
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
+
+ bch2_trans_put(trans);
+
+ if (ret)
+ goto out;
+
+ ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false);
+ if (ret)
+ goto out;
+
+ ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
+ if (ret)
+ goto out;
+
+ return buf.used;
+out:
+ return bch2_err_class(ret);
+}
+
+static int bch2_xattr_get_handler(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, void *buffer, size_t size)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ int ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
+
+ return bch2_err_class(ret);
+}
+
+static int bch2_xattr_set_handler(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
+ struct bch_inode_unpacked inode_u;
+ int ret;
+
+ ret = bch2_trans_run(c,
+ commit_do(trans, NULL, NULL, 0,
+ bch2_xattr_set(trans, inode_inum(inode), &inode_u,
+ &hash, name, value, size,
+ handler->flags, flags)) ?:
+ (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0));
+
+ return bch2_err_class(ret);
+}
+
+static const struct xattr_handler bch_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = bch2_xattr_get_handler,
+ .set = bch2_xattr_set_handler,
+ .flags = KEY_TYPE_XATTR_INDEX_USER,
+};
+
+static bool bch2_xattr_trusted_list(struct dentry *dentry)
+{
+ return capable(CAP_SYS_ADMIN);
+}
+
+static const struct xattr_handler bch_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .list = bch2_xattr_trusted_list,
+ .get = bch2_xattr_get_handler,
+ .set = bch2_xattr_set_handler,
+ .flags = KEY_TYPE_XATTR_INDEX_TRUSTED,
+};
+
+static const struct xattr_handler bch_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = bch2_xattr_get_handler,
+ .set = bch2_xattr_set_handler,
+ .flags = KEY_TYPE_XATTR_INDEX_SECURITY,
+};
+
+#ifndef NO_BCACHEFS_FS
+
+static int opt_to_inode_opt(int id)
+{
+ switch (id) {
+#define x(name, ...) \
+ case Opt_##name: return Inode_opt_##name;
+ BCH_INODE_OPTS()
+#undef x
+ default:
+ return -1;
+ }
+}
+
+static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, void *buffer, size_t size,
+ bool all)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_opts opts =
+ bch2_inode_opts_to_opts(&inode->ei_inode);
+ const struct bch_option *opt;
+ int id, inode_opt_id;
+ struct printbuf out = PRINTBUF;
+ int ret;
+ u64 v;
+
+ id = bch2_opt_lookup(name);
+ if (id < 0 || !bch2_opt_is_inode_opt(id))
+ return -EINVAL;
+
+ inode_opt_id = opt_to_inode_opt(id);
+ if (inode_opt_id < 0)
+ return -EINVAL;
+
+ opt = bch2_opt_table + id;
+
+ if (!bch2_opt_defined_by_id(&opts, id))
+ return -ENODATA;
+
+ if (!all &&
+ !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id)))
+ return -ENODATA;
+
+ v = bch2_opt_get_by_id(&opts, id);
+ bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0);
+
+ ret = out.pos;
+
+ if (out.allocation_failure) {
+ ret = -ENOMEM;
+ } else if (buffer) {
+ if (out.pos > size)
+ ret = -ERANGE;
+ else
+ memcpy(buffer, out.buf, out.pos);
+ }
+
+ printbuf_exit(&out);
+ return ret;
+}
+
+static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, void *buffer, size_t size)
+{
+ return __bch2_xattr_bcachefs_get(handler, dentry, vinode,
+ name, buffer, size, false);
+}
+
+struct inode_opt_set {
+ int id;
+ u64 v;
+ bool defined;
+};
+
+static int inode_opt_set_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct inode_opt_set *s = p;
+
+ if (s->defined)
+ bi->bi_fields_set |= 1U << s->id;
+ else
+ bi->bi_fields_set &= ~(1U << s->id);
+
+ bch2_inode_opt_set(bi, s->id, s->v);
+
+ return 0;
+}
+
+static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ const struct bch_option *opt;
+ char *buf;
+ struct inode_opt_set s;
+ int opt_id, inode_opt_id, ret;
+
+ opt_id = bch2_opt_lookup(name);
+ if (opt_id < 0)
+ return -EINVAL;
+
+ opt = bch2_opt_table + opt_id;
+
+ inode_opt_id = opt_to_inode_opt(opt_id);
+ if (inode_opt_id < 0)
+ return -EINVAL;
+
+ s.id = inode_opt_id;
+
+ if (value) {
+ u64 v = 0;
+
+ buf = kmalloc(size + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ memcpy(buf, value, size);
+ buf[size] = '\0';
+
+ ret = bch2_opt_parse(c, opt, buf, &v, NULL);
+ kfree(buf);
+
+ if (ret < 0)
+ return ret;
+
+ ret = bch2_opt_check_may_set(c, opt_id, v);
+ if (ret < 0)
+ return ret;
+
+ s.v = v + 1;
+ s.defined = true;
+ } else {
+ /*
+ * Check if this option was set on the parent - if so, switch
+ * back to inheriting from the parent:
+ *
+ * rename() also has to deal with keeping inherited options up
+ * to date - see bch2_reinherit_attrs()
+ */
+ spin_lock(&dentry->d_lock);
+ if (!IS_ROOT(dentry)) {
+ struct bch_inode_info *dir =
+ to_bch_ei(d_inode(dentry->d_parent));
+
+ s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id);
+ } else {
+ s.v = 0;
+ }
+ spin_unlock(&dentry->d_lock);
+
+ s.defined = false;
+ }
+
+ mutex_lock(&inode->ei_update_lock);
+ if (inode_opt_id == Inode_opt_project) {
+ /*
+ * inode fields accessible via the xattr interface are stored
+ * with a +1 bias, so that 0 means unset:
+ */
+ ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0);
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
+err:
+ mutex_unlock(&inode->ei_update_lock);
+
+ if (value &&
+ (opt_id == Opt_background_compression ||
+ opt_id == Opt_background_target))
+ bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
+
+ return bch2_err_class(ret);
+}
+
+static const struct xattr_handler bch_xattr_bcachefs_handler = {
+ .prefix = "bcachefs.",
+ .get = bch2_xattr_bcachefs_get,
+ .set = bch2_xattr_bcachefs_set,
+};
+
+static int bch2_xattr_bcachefs_get_effective(
+ const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, void *buffer, size_t size)
+{
+ return __bch2_xattr_bcachefs_get(handler, dentry, vinode,
+ name, buffer, size, true);
+}
+
+static const struct xattr_handler bch_xattr_bcachefs_effective_handler = {
+ .prefix = "bcachefs_effective.",
+ .get = bch2_xattr_bcachefs_get_effective,
+ .set = bch2_xattr_bcachefs_set,
+};
+
+#endif /* NO_BCACHEFS_FS */
+
+const struct xattr_handler *bch2_xattr_handlers[] = {
+ &bch_xattr_user_handler,
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+ &nop_posix_acl_access,
+ &nop_posix_acl_default,
+#endif
+ &bch_xattr_trusted_handler,
+ &bch_xattr_security_handler,
+#ifndef NO_BCACHEFS_FS
+ &bch_xattr_bcachefs_handler,
+ &bch_xattr_bcachefs_effective_handler,
+#endif
+ NULL
+};
+
+static const struct xattr_handler *bch_xattr_handler_map[] = {
+ [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler,
+ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] =
+ &nop_posix_acl_access,
+ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] =
+ &nop_posix_acl_default,
+ [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler,
+ [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
+};
+
+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type)
+{
+ return type < ARRAY_SIZE(bch_xattr_handler_map)
+ ? bch_xattr_handler_map[type]
+ : NULL;
+}
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
new file mode 100644
index 000000000000..1337f31a5c49
--- /dev/null
+++ b/fs/bcachefs/xattr.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_XATTR_H
+#define _BCACHEFS_XATTR_H
+
+#include "str_hash.h"
+
+extern const struct bch_hash_desc bch2_xattr_hash_desc;
+
+int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_xattr ((struct bkey_ops) { \
+ .key_invalid = bch2_xattr_invalid, \
+ .val_to_text = bch2_xattr_to_text, \
+ .min_val_size = 8, \
+})
+
+static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
+{
+ return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) +
+ name_len + val_len, sizeof(u64));
+}
+
+#define xattr_val(_xattr) \
+ ((void *) (_xattr)->x_name + (_xattr)->x_name_len)
+
+struct xattr_search_key {
+ u8 type;
+ struct qstr name;
+};
+
+#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \
+ { .type = _type, .name = QSTR_INIT(_name, _len) })
+
+struct dentry;
+struct xattr_handler;
+struct bch_hash_info;
+struct bch_inode_info;
+
+/* Exported for cmd_migrate.c in tools: */
+int bch2_xattr_set(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *, const struct bch_hash_info *,
+ const char *, const void *, size_t, int, int);
+
+ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
+
+extern const struct xattr_handler *bch2_xattr_handlers[];
+
+#endif /* _BCACHEFS_XATTR_H */
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 9a16a51fbb88..a93d76df8ed8 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -96,6 +96,7 @@ static const struct address_space_operations befs_symlink_aops = {
};
static const struct export_operations befs_export_operations = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = befs_fh_to_dentry,
.fh_to_parent = befs_fh_to_parent,
.get_parent = befs_get_parent,
@@ -360,11 +361,11 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
* for indexing purposes. (PFD, page 54)
*/
- inode->i_mtime.tv_sec =
- fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16;
- inode->i_mtime.tv_nsec = 0; /* lower 16 bits are not a time */
- inode_set_ctime_to_ts(inode, inode->i_mtime);
- inode->i_atime = inode->i_mtime;
+ inode_set_mtime(inode,
+ fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16,
+ 0);/* lower 16 bits are not a time */
+ inode_set_ctime_to_ts(inode, inode_get_mtime(inode));
+ inode_set_atime_to_ts(inode, inode_get_mtime(inode));
befs_ino->i_inode_num = fsrun_to_cpu(sb, raw_inode->inode_num);
befs_ino->i_parent = fsrun_to_cpu(sb, raw_inode->parent);
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 12b8af04dcb3..fbc4ae80a4b2 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -97,7 +97,7 @@ static int bfs_create(struct mnt_idmap *idmap, struct inode *dir,
set_bit(ino, info->si_imap);
info->si_freei--;
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_blocks = 0;
inode->i_op = &bfs_file_inops;
inode->i_fop = &bfs_file_operations;
@@ -187,7 +187,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
}
de->ino = 0;
mark_buffer_dirty_inode(bh, dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
inode_dec_link_count(inode);
@@ -240,7 +240,7 @@ static int bfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
goto end_rename;
}
old_de->ino = 0;
- old_dir->i_mtime = inode_set_ctime_current(old_dir);
+ inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
mark_inode_dirty(old_dir);
if (new_inode) {
inode_set_ctime_current(new_inode);
@@ -294,7 +294,8 @@ static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino)
dir->i_size += BFS_DIRENT_SIZE;
inode_set_ctime_current(dir);
}
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_current(dir));
mark_inode_dirty(dir);
de->ino = cpu_to_le16((u16)ino);
for (i = 0; i < BFS_NAMELEN; i++)
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index e6a76ae9eb44..355957dbce39 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -80,11 +80,9 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
set_nlink(inode, le32_to_cpu(di->i_nlink));
inode->i_size = BFS_FILESIZE(di);
inode->i_blocks = BFS_FILEBLOCKS(di);
- inode->i_atime.tv_sec = le32_to_cpu(di->i_atime);
- inode->i_mtime.tv_sec = le32_to_cpu(di->i_mtime);
+ inode_set_atime(inode, le32_to_cpu(di->i_atime), 0);
+ inode_set_mtime(inode, le32_to_cpu(di->i_mtime), 0);
inode_set_ctime(inode, le32_to_cpu(di->i_ctime), 0);
- inode->i_atime.tv_nsec = 0;
- inode->i_mtime.tv_nsec = 0;
brelse(bh);
unlock_new_inode(inode);
@@ -140,9 +138,9 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
di->i_uid = cpu_to_le32(i_uid_read(inode));
di->i_gid = cpu_to_le32(i_gid_read(inode));
di->i_nlink = cpu_to_le32(inode->i_nlink);
- di->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
- di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
- di->i_ctime = cpu_to_le32(inode_get_ctime(inode).tv_sec);
+ di->i_atime = cpu_to_le32(inode_get_atime_sec(inode));
+ di->i_mtime = cpu_to_le32(inode_get_mtime_sec(inode));
+ di->i_ctime = cpu_to_le32(inode_get_ctime_sec(inode));
i_sblock = BFS_I(inode)->i_sblock;
di->i_sblock = cpu_to_le32(i_sblock);
di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7b3d2d491407..5397b552fbeb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -110,38 +110,19 @@ static struct linux_binfmt elf_format = {
#define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE))
-static int set_brk(unsigned long start, unsigned long end, int prot)
-{
- start = ELF_PAGEALIGN(start);
- end = ELF_PAGEALIGN(end);
- if (end > start) {
- /*
- * Map the last of the bss segment.
- * If the header is requesting these pages to be
- * executable, honour that (ppc32 needs this).
- */
- int error = vm_brk_flags(start, end - start,
- prot & PROT_EXEC ? VM_EXEC : 0);
- if (error)
- return error;
- }
- current->mm->start_brk = current->mm->brk = end;
- return 0;
-}
-
-/* We need to explicitly zero any fractional pages
- after the data section (i.e. bss). This would
- contain the junk from the file that should not
- be in memory
+/*
+ * We need to explicitly zero any trailing portion of the page that follows
+ * p_filesz when it ends before the page ends (e.g. bss), otherwise this
+ * memory will contain the junk from the file that should not be present.
*/
-static int padzero(unsigned long elf_bss)
+static int padzero(unsigned long address)
{
unsigned long nbyte;
- nbyte = ELF_PAGEOFFSET(elf_bss);
+ nbyte = ELF_PAGEOFFSET(address);
if (nbyte) {
nbyte = ELF_MIN_ALIGN - nbyte;
- if (clear_user((void __user *) elf_bss, nbyte))
+ if (clear_user((void __user *)address, nbyte))
return -EFAULT;
}
return 0;
@@ -367,6 +348,11 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
return 0;
}
+/*
+ * Map "eppnt->p_filesz" bytes from "filep" offset "eppnt->p_offset"
+ * into memory at "addr". (Note that p_filesz is rounded up to the
+ * next page, so any extra bytes from the file must be wiped.)
+ */
static unsigned long elf_map(struct file *filep, unsigned long addr,
const struct elf_phdr *eppnt, int prot, int type,
unsigned long total_size)
@@ -406,6 +392,60 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
return(map_addr);
}
+/*
+ * Map "eppnt->p_filesz" bytes from "filep" offset "eppnt->p_offset"
+ * into memory at "addr". Memory from "p_filesz" through "p_memsz"
+ * rounded up to the next page is zeroed.
+ */
+static unsigned long elf_load(struct file *filep, unsigned long addr,
+ const struct elf_phdr *eppnt, int prot, int type,
+ unsigned long total_size)
+{
+ unsigned long zero_start, zero_end;
+ unsigned long map_addr;
+
+ if (eppnt->p_filesz) {
+ map_addr = elf_map(filep, addr, eppnt, prot, type, total_size);
+ if (BAD_ADDR(map_addr))
+ return map_addr;
+ if (eppnt->p_memsz > eppnt->p_filesz) {
+ zero_start = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) +
+ eppnt->p_filesz;
+ zero_end = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) +
+ eppnt->p_memsz;
+
+ /*
+ * Zero the end of the last mapped page but ignore
+ * any errors if the segment isn't writable.
+ */
+ if (padzero(zero_start) && (prot & PROT_WRITE))
+ return -EFAULT;
+ }
+ } else {
+ map_addr = zero_start = ELF_PAGESTART(addr);
+ zero_end = zero_start + ELF_PAGEOFFSET(eppnt->p_vaddr) +
+ eppnt->p_memsz;
+ }
+ if (eppnt->p_memsz > eppnt->p_filesz) {
+ /*
+ * Map the last of the segment.
+ * If the header is requesting these pages to be
+ * executable, honour that (ppc32 needs this).
+ */
+ int error;
+
+ zero_start = ELF_PAGEALIGN(zero_start);
+ zero_end = ELF_PAGEALIGN(zero_end);
+
+ error = vm_brk_flags(zero_start, zero_end - zero_start,
+ prot & PROT_EXEC ? VM_EXEC : 0);
+ if (error)
+ map_addr = error;
+ }
+ return map_addr;
+}
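+
elf_load() maps the first p_filesz bytes of the segment from the file, clears the tail of the last file-backed page with padzero(), and maps the remaining pages up to p_memsz anonymously via vm_brk_flags(). A user-space sketch of just the page arithmetic, assuming an illustrative segment with a page-aligned p_vaddr (so ELF_PAGEOFFSET(p_vaddr) is 0):

#include <stdio.h>

#define PAGE_SIZE	0x1000UL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	/* illustrative segment: 0x2345 bytes of file data, 0x6000 bytes in memory */
	unsigned long map_addr = 0x400000, p_filesz = 0x2345, p_memsz = 0x6000;
	unsigned long zero_start = map_addr + p_filesz;	/* 0x402345 */
	unsigned long zero_end   = map_addr + p_memsz;	/* 0x406000 */

	/* padzero() clears the rest of the last file-backed page */
	printf("padzero clears %lu bytes\n", PAGE_ALIGN(zero_start) - zero_start);

	/* vm_brk_flags() then maps the remaining anonymous, zeroed pages */
	printf("anon bss: %#lx-%#lx\n", PAGE_ALIGN(zero_start), PAGE_ALIGN(zero_end));
	return 0;
}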
+
+
static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr)
{
elf_addr_t min_addr = -1;
@@ -596,8 +636,6 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
struct elf_phdr *eppnt;
unsigned long load_addr = 0;
int load_addr_set = 0;
- unsigned long last_bss = 0, elf_bss = 0;
- int bss_prot = 0;
unsigned long error = ~0UL;
unsigned long total_size;
int i;
@@ -634,7 +672,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
else if (no_base && interp_elf_ex->e_type == ET_DYN)
load_addr = -vaddr;
- map_addr = elf_map(interpreter, load_addr + vaddr,
+ map_addr = elf_load(interpreter, load_addr + vaddr,
eppnt, elf_prot, elf_type, total_size);
total_size = 0;
error = map_addr;
@@ -660,51 +698,9 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
error = -ENOMEM;
goto out;
}
-
- /*
- * Find the end of the file mapping for this phdr, and
- * keep track of the largest address we see for this.
- */
- k = load_addr + eppnt->p_vaddr + eppnt->p_filesz;
- if (k > elf_bss)
- elf_bss = k;
-
- /*
- * Do the same thing for the memory mapping - between
- * elf_bss and last_bss is the bss section.
- */
- k = load_addr + eppnt->p_vaddr + eppnt->p_memsz;
- if (k > last_bss) {
- last_bss = k;
- bss_prot = elf_prot;
- }
}
}
- /*
- * Now fill out the bss section: first pad the last page from
- * the file up to the page boundary, and zero it from elf_bss
- * up to the end of the page.
- */
- if (padzero(elf_bss)) {
- error = -EFAULT;
- goto out;
- }
- /*
- * Next, align both the file and mem bss up to the page size,
- * since this is where elf_bss was just zeroed up to, and where
- * last_bss will end after the vm_brk_flags() below.
- */
- elf_bss = ELF_PAGEALIGN(elf_bss);
- last_bss = ELF_PAGEALIGN(last_bss);
- /* Finally, if there is still more bss to allocate, do it. */
- if (last_bss > elf_bss) {
- error = vm_brk_flags(elf_bss, last_bss - elf_bss,
- bss_prot & PROT_EXEC ? VM_EXEC : 0);
- if (error)
- goto out;
- }
-
error = load_addr;
out:
return error;
@@ -828,8 +824,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
unsigned long error;
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
struct elf_phdr *elf_property_phdata = NULL;
- unsigned long elf_bss, elf_brk;
- int bss_prot = 0;
+ unsigned long elf_brk;
int retval, i;
unsigned long elf_entry;
unsigned long e_entry;
@@ -1020,7 +1015,6 @@ out_free_interp:
if (retval < 0)
goto out_free_dentry;
- elf_bss = 0;
elf_brk = 0;
start_code = ~0UL;
@@ -1040,33 +1034,6 @@ out_free_interp:
if (elf_ppnt->p_type != PT_LOAD)
continue;
- if (unlikely (elf_brk > elf_bss)) {
- unsigned long nbyte;
-
- /* There was a PT_LOAD segment with p_memsz > p_filesz
- before this one. Map anonymous pages, if needed,
- and clear the area. */
- retval = set_brk(elf_bss + load_bias,
- elf_brk + load_bias,
- bss_prot);
- if (retval)
- goto out_free_dentry;
- nbyte = ELF_PAGEOFFSET(elf_bss);
- if (nbyte) {
- nbyte = ELF_MIN_ALIGN - nbyte;
- if (nbyte > elf_brk - elf_bss)
- nbyte = elf_brk - elf_bss;
- if (clear_user((void __user *)elf_bss +
- load_bias, nbyte)) {
- /*
- * This bss-zeroing can fail if the ELF
- * file specifies odd protections. So
- * we don't check the return value
- */
- }
- }
- }
-
elf_prot = make_prot(elf_ppnt->p_flags, &arch_state,
!!interpreter, false);
@@ -1162,7 +1129,7 @@ out_free_interp:
}
}
- error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
+ error = elf_load(bprm->file, load_bias + vaddr, elf_ppnt,
elf_prot, elf_flags, total_size);
if (BAD_ADDR(error)) {
retval = IS_ERR_VALUE(error) ?
@@ -1210,40 +1177,24 @@ out_free_interp:
k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz;
- if (k > elf_bss)
- elf_bss = k;
if ((elf_ppnt->p_flags & PF_X) && end_code < k)
end_code = k;
if (end_data < k)
end_data = k;
k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
- if (k > elf_brk) {
- bss_prot = elf_prot;
+ if (k > elf_brk)
elf_brk = k;
- }
}
e_entry = elf_ex->e_entry + load_bias;
phdr_addr += load_bias;
- elf_bss += load_bias;
elf_brk += load_bias;
start_code += load_bias;
end_code += load_bias;
start_data += load_bias;
end_data += load_bias;
- /* Calling set_brk effectively mmaps the pages that we need
- * for the bss and break sections. We must do this before
- * mapping in the interpreter, to make sure it doesn't wind
- * up getting placed where the bss needs to go.
- */
- retval = set_brk(elf_bss, elf_brk, bss_prot);
- if (retval)
- goto out_free_dentry;
- if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
- retval = -EFAULT; /* Nobody gets to see this, but.. */
- goto out_free_dentry;
- }
+ current->mm->start_brk = current->mm->brk = ELF_PAGEALIGN(elf_brk);
if (interpreter) {
elf_entry = load_elf_interp(interp_elf_ex,
@@ -1369,7 +1320,6 @@ static int load_elf_library(struct file *file)
{
struct elf_phdr *elf_phdata;
struct elf_phdr *eppnt;
- unsigned long elf_bss, bss, len;
int retval, error, i, j;
struct elfhdr elf_ex;
@@ -1414,30 +1364,15 @@ static int load_elf_library(struct file *file)
eppnt++;
/* Now use mmap to map the library into memory. */
- error = vm_mmap(file,
- ELF_PAGESTART(eppnt->p_vaddr),
- (eppnt->p_filesz +
- ELF_PAGEOFFSET(eppnt->p_vaddr)),
+ error = elf_load(file, ELF_PAGESTART(eppnt->p_vaddr),
+ eppnt,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_FIXED_NOREPLACE | MAP_PRIVATE,
- (eppnt->p_offset -
- ELF_PAGEOFFSET(eppnt->p_vaddr)));
+ 0);
+
if (error != ELF_PAGESTART(eppnt->p_vaddr))
goto out_free_ph;
- elf_bss = eppnt->p_vaddr + eppnt->p_filesz;
- if (padzero(elf_bss)) {
- error = -EFAULT;
- goto out_free_ph;
- }
-
- len = ELF_PAGEALIGN(eppnt->p_filesz + eppnt->p_vaddr);
- bss = ELF_PAGEALIGN(eppnt->p_memsz + eppnt->p_vaddr);
- if (bss > len) {
- error = vm_brk(len, bss - len);
- if (error)
- goto out_free_ph;
- }
error = 0;
out_free_ph:
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 206812ce544a..fefc642541cb 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -899,10 +899,12 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,
kdebug("- DYNAMIC[]: %lx", params->dynamic_addr);
seg = loadmap->segs;
for (loop = 0; loop < loadmap->nsegs; loop++, seg++)
- kdebug("- LOAD[%d] : %08x-%08x [va=%x ms=%x]",
+ kdebug("- LOAD[%d] : %08llx-%08llx [va=%llx ms=%llx]",
loop,
- seg->addr, seg->addr + seg->p_memsz - 1,
- seg->p_vaddr, seg->p_memsz);
+ (unsigned long long) seg->addr,
+ (unsigned long long) seg->addr + seg->p_memsz - 1,
+ (unsigned long long) seg->p_vaddr,
+ (unsigned long long) seg->p_memsz);
return 0;
@@ -1081,9 +1083,10 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
maddr = vm_mmap(file, maddr, phdr->p_memsz + disp, prot, flags,
phdr->p_offset - disp);
- kdebug("mmap[%d] <file> sz=%lx pr=%x fl=%x of=%lx --> %08lx",
- loop, phdr->p_memsz + disp, prot, flags,
- phdr->p_offset - disp, maddr);
+ kdebug("mmap[%d] <file> sz=%llx pr=%x fl=%x of=%llx --> %08lx",
+ loop, (unsigned long long) phdr->p_memsz + disp,
+ prot, flags, (unsigned long long) phdr->p_offset - disp,
+ maddr);
if (IS_ERR_VALUE(maddr))
return (int) maddr;
@@ -1145,8 +1148,9 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
#else
if (excess > 0) {
- kdebug("clear[%d] ad=%lx sz=%lx",
- loop, maddr + phdr->p_filesz, excess);
+ kdebug("clear[%d] ad=%llx sz=%lx", loop,
+ (unsigned long long) maddr + phdr->p_filesz,
+ excess);
if (clear_user((void *) maddr + phdr->p_filesz, excess))
return -EFAULT;
}
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index e0108d17b085..68fa225f89e5 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -40,9 +40,6 @@ enum {
VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
};
-static LIST_HEAD(entries);
-static int enabled = 1;
-
enum {Enabled, Magic};
#define MISC_FMT_PRESERVE_ARGV0 (1UL << 31)
#define MISC_FMT_OPEN_BINARY (1UL << 30)
@@ -60,12 +57,10 @@ typedef struct {
char *name;
struct dentry *dentry;
struct file *interp_file;
+ refcount_t users; /* sync removal with load_misc_binary() */
} Node;
-static DEFINE_RWLOCK(entries_lock);
static struct file_system_type bm_fs_type;
-static struct vfsmount *bm_mnt;
-static int entry_count;
/*
* Max length of the register string. Determined by:
@@ -82,19 +77,24 @@ static int entry_count;
*/
#define MAX_REGISTER_LENGTH 1920
-/*
- * Check if we support the binfmt
- * if we do, return the node, else NULL
- * locking is done in load_misc_binary
+/**
+ * search_binfmt_handler - search for a binary handler for @bprm
+ * @misc: handle to binfmt_misc instance
+ * @bprm: binary for which we are looking for a handler
+ *
+ * Search for a binary type handler for @bprm in the list of registered binary
+ * type handlers.
+ *
+ * Return: binary type list entry on success, NULL on failure
*/
-static Node *check_file(struct linux_binprm *bprm)
+static Node *search_binfmt_handler(struct binfmt_misc *misc,
+ struct linux_binprm *bprm)
{
char *p = strrchr(bprm->interp, '.');
- struct list_head *l;
+ Node *e;
/* Walk all the registered handlers. */
- list_for_each(l, &entries) {
- Node *e = list_entry(l, Node, list);
+ list_for_each_entry(e, &misc->entries, list) {
char *s;
int j;
@@ -123,9 +123,79 @@ static Node *check_file(struct linux_binprm *bprm)
if (j == e->size)
return e;
}
+
return NULL;
}
+/**
+ * get_binfmt_handler - try to find a binary type handler
+ * @misc: handle to binfmt_misc instance
+ * @bprm: binary for which we are looking for a handler
+ *
+ * Try to find a binfmt handler for the binary type. If one is found take a
+ * reference to protect against removal via bm_{entry,status}_write().
+ *
+ * Return: binary type list entry on success, NULL on failure
+ */
+static Node *get_binfmt_handler(struct binfmt_misc *misc,
+ struct linux_binprm *bprm)
+{
+ Node *e;
+
+ read_lock(&misc->entries_lock);
+ e = search_binfmt_handler(misc, bprm);
+ if (e)
+ refcount_inc(&e->users);
+ read_unlock(&misc->entries_lock);
+ return e;
+}
+
+/**
+ * put_binfmt_handler - put binary handler node
+ * @e: node to put
+ *
+ * Free the node, syncing with load_misc_binary(), and defer the final free
+ * to load_misc_binary() in case it is still using the binary type handler
+ * we were requested to remove.
+ */
+static void put_binfmt_handler(Node *e)
+{
+ if (refcount_dec_and_test(&e->users)) {
+ if (e->flags & MISC_FMT_OPEN_FILE)
+ filp_close(e->interp_file, NULL);
+ kfree(e);
+ }
+}
+
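
The two helpers above implement a classic "pin under a read lock, free on the last put" lifetime scheme. The following is a minimal user-space sketch of that pattern, not the kernel code itself; all names (handler, handlers, handlers_lock, get_handler, put_handler, remove_handler) are illustrative, with pthread rwlocks and C11 atomics standing in for the kernel's rwlock_t and refcount_t.

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

struct handler {
	struct handler *next;
	atomic_int users;		/* analogous to Node::users */
	char name[32];
};

static struct handler *handlers;	/* singly linked handler list */
static pthread_rwlock_t handlers_lock = PTHREAD_RWLOCK_INITIALIZER;

static void put_handler(struct handler *h)
{
	/* Whoever drops the last reference frees the object. */
	if (atomic_fetch_sub(&h->users, 1) == 1)
		free(h);
}

static struct handler *get_handler(const char *name)
{
	struct handler *h;

	pthread_rwlock_rdlock(&handlers_lock);
	for (h = handlers; h; h = h->next) {
		if (strcmp(h->name, name) == 0) {
			atomic_fetch_add(&h->users, 1);	/* pin against removal */
			break;
		}
	}
	pthread_rwlock_unlock(&handlers_lock);
	return h;
}

static void remove_handler(struct handler *h)
{
	struct handler **p;

	pthread_rwlock_wrlock(&handlers_lock);
	for (p = &handlers; *p; p = &(*p)->next) {
		if (*p == h) {
			*p = h->next;
			break;
		}
	}
	pthread_rwlock_unlock(&handlers_lock);
	put_handler(h);	/* drop the list's reference; a racing lookup frees last */
}

As in the patch, a lookup that raced with removal keeps the object alive until its own put, which is what lets load_misc_binary() keep using a handler that was concurrently deleted.
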
+/**
+ * load_binfmt_misc - load the binfmt_misc of the caller's user namespace
+ *
+ * To be called in load_misc_binary() to load the relevant struct binfmt_misc.
+ * If a user namespace doesn't have its own binfmt_misc mount it can make use
+ * of its ancestor's binfmt_misc handlers. This mimics the behavior of
+ * pre-namespaced binfmt_misc where all registered binfmt_misc handlers were
+ * available to all users and user namespaces on the system.
+ *
+ * Return: the binfmt_misc instance of the caller's user namespace
+ */
+static struct binfmt_misc *load_binfmt_misc(void)
+{
+ const struct user_namespace *user_ns;
+ struct binfmt_misc *misc;
+
+ user_ns = current_user_ns();
+ while (user_ns) {
+ /* Pairs with smp_store_release() in bm_fill_super(). */
+ misc = smp_load_acquire(&user_ns->binfmt_misc);
+ if (misc)
+ return misc;
+
+ user_ns = user_ns->parent;
+ }
+
+ return &init_binfmt_misc;
+}
+
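
The ancestor walk above relies on a release/acquire pairing: the publisher stores the fully initialized binfmt_misc instance with smp_store_release() (see bm_fill_super() further down), and load_binfmt_misc() reads it with smp_load_acquire(). A compact user-space analogue using C11 atomics, with a simplified namespace type and illustrative names (ns, instance, publish_instance, lookup_instance), might look like this:

#include <stdatomic.h>
#include <stddef.h>

struct ns {
	struct ns *parent;
	_Atomic(void *) instance;	/* lazily published per-namespace state */
};

/* Publisher side: initialize the object fully, then release-store it. */
static void publish_instance(struct ns *ns, void *state)
{
	atomic_store_explicit(&ns->instance, state, memory_order_release);
}

/* Reader side: acquire-load, falling back to the nearest ancestor. */
static void *lookup_instance(struct ns *ns, void *global_fallback)
{
	for (; ns; ns = ns->parent) {
		void *state = atomic_load_explicit(&ns->instance,
						   memory_order_acquire);
		if (state)
			return state;
	}
	return global_fallback;
}

The acquire load guarantees that a reader which observes the pointer also observes the initialization done before the release store, which is why no further barriers are needed on the lookup path.
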
/*
* the loader itself
*/
@@ -133,18 +203,14 @@ static int load_misc_binary(struct linux_binprm *bprm)
{
Node *fmt;
struct file *interp_file = NULL;
- int retval;
+ int retval = -ENOEXEC;
+ struct binfmt_misc *misc;
- retval = -ENOEXEC;
- if (!enabled)
+ misc = load_binfmt_misc();
+ if (!misc->enabled)
return retval;
- /* to keep locking time low, we copy the interpreter string */
- read_lock(&entries_lock);
- fmt = check_file(bprm);
- if (fmt)
- dget(fmt->dentry);
- read_unlock(&entries_lock);
+ fmt = get_binfmt_handler(misc, bprm);
if (!fmt)
return retval;
@@ -198,7 +264,16 @@ static int load_misc_binary(struct linux_binprm *bprm)
retval = 0;
ret:
- dput(fmt->dentry);
+
+ /*
+ * If we actually put the node here all concurrent calls to
+ * load_misc_binary() will have finished. We also know
+ * that for the refcount to be zero someone must have concurrently
+ * removed the binary type handler from the list and it's our job to
+ * free it.
+ */
+ put_binfmt_handler(fmt);
+
return retval;
}
@@ -287,7 +362,7 @@ static Node *create_entry(const char __user *buffer, size_t count)
err = -ENOMEM;
memsize = sizeof(Node) + count + 8;
- e = kmalloc(memsize, GFP_KERNEL);
+ e = kmalloc(memsize, GFP_KERNEL_ACCOUNT);
if (!e)
goto out;
@@ -399,7 +474,7 @@ static Node *create_entry(const char __user *buffer, size_t count)
if (e->mask) {
int i;
- char *masked = kmalloc(e->size, GFP_KERNEL);
+ char *masked = kmalloc(e->size, GFP_KERNEL_ACCOUNT);
print_hex_dump_bytes(
KBUILD_MODNAME ": register: mask[decoded]: ",
@@ -547,35 +622,114 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
if (inode) {
inode->i_ino = get_next_ino();
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
return inode;
}
+/**
+ * i_binfmt_misc - retrieve struct binfmt_misc from a binfmt_misc inode
+ * @inode: inode of the relevant binfmt_misc instance
+ *
+ * This helper retrieves struct binfmt_misc from a binfmt_misc inode. This can
+ * be done without any memory barriers because we are guaranteed that
+ * user_ns->binfmt_misc is fully initialized. It was fully initialized when the
+ * binfmt_misc mount was first created.
+ *
+ * Return: struct binfmt_misc of the relevant binfmt_misc instance
+ */
+static struct binfmt_misc *i_binfmt_misc(struct inode *inode)
+{
+ return inode->i_sb->s_user_ns->binfmt_misc;
+}
+
+/**
+ * bm_evict_inode - cleanup data associated with @inode
+ * @inode: inode to which the data is attached
+ *
+ * Clean up the binary type handler data associated with @inode if a binary type
+ * entry is removed or the filesystem is unmounted and the super block is
+ * shut down.
+ *
+ * If the ->evict call was not caused by a super block shutdown but by a write
+ * that removed the entry or all entries via bm_{entry,status}_write(), the entry
+ * will have already been removed from the list. We keep the list_empty() check
+ * to make that explicit.
+ */
static void bm_evict_inode(struct inode *inode)
{
Node *e = inode->i_private;
- if (e && e->flags & MISC_FMT_OPEN_FILE)
- filp_close(e->interp_file, NULL);
-
clear_inode(inode);
- kfree(e);
+
+ if (e) {
+ struct binfmt_misc *misc;
+
+ misc = i_binfmt_misc(inode);
+ write_lock(&misc->entries_lock);
+ if (!list_empty(&e->list))
+ list_del_init(&e->list);
+ write_unlock(&misc->entries_lock);
+ put_binfmt_handler(e);
+ }
}
-static void kill_node(Node *e)
+/**
+ * unlink_binfmt_dentry - remove the dentry for the binary type handler
+ * @dentry: dentry associated with the binary type handler
+ *
+ * Do the actual filesystem work to remove a dentry for a registered binary
+ * type handler. Since binfmt_misc only allows simple files to be created
+ * directly under the root dentry of the filesystem we ensure that we are
+ * indeed passed a dentry directly beneath the root dentry, that the inode
+ * associated with the root dentry is locked, and that it is a regular file we
+ * are asked to remove.
+ */
+static void unlink_binfmt_dentry(struct dentry *dentry)
{
- struct dentry *dentry;
+ struct dentry *parent = dentry->d_parent;
+ struct inode *inode, *parent_inode;
- write_lock(&entries_lock);
- list_del_init(&e->list);
- write_unlock(&entries_lock);
+ /* All entries are immediate descendants of the root dentry. */
+ if (WARN_ON_ONCE(dentry->d_sb->s_root != parent))
+ return;
- dentry = e->dentry;
- drop_nlink(d_inode(dentry));
- d_drop(dentry);
- dput(dentry);
- simple_release_fs(&bm_mnt, &entry_count);
+ /* We only expect to be called on regular files. */
+ inode = d_inode(dentry);
+ if (WARN_ON_ONCE(!S_ISREG(inode->i_mode)))
+ return;
+
+ /* The parent inode must be locked. */
+ parent_inode = d_inode(parent);
+ if (WARN_ON_ONCE(!inode_is_locked(parent_inode)))
+ return;
+
+ if (simple_positive(dentry)) {
+ dget(dentry);
+ simple_unlink(parent_inode, dentry);
+ d_delete(dentry);
+ dput(dentry);
+ }
+}
+
+/**
+ * remove_binfmt_handler - remove a binary type handler
+ * @misc: handle to binfmt_misc instance
+ * @e: binary type handler to remove
+ *
+ * Remove a binary type handler from the list of binary type handlers and
+ * remove its associated dentry. This is called from
+ * bm_{entry,status}_write(). In the future, we might want to think about
+ * adding a proper ->unlink() method to binfmt_misc instead of forcing callers
+ * to use writes to files in order to delete binary type handlers. But it has
+ * worked for so long that it's not a pressing issue.
+ */
+static void remove_binfmt_handler(struct binfmt_misc *misc, Node *e)
+{
+ write_lock(&misc->entries_lock);
+ list_del_init(&e->list);
+ write_unlock(&misc->entries_lock);
+ unlink_binfmt_dentry(e->dentry);
}
/* /<entry> */
@@ -602,8 +756,8 @@ bm_entry_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- struct dentry *root;
- Node *e = file_inode(file)->i_private;
+ struct inode *inode = file_inode(file);
+ Node *e = inode->i_private;
int res = parse_command(buffer, count);
switch (res) {
@@ -617,13 +771,22 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
break;
case 3:
/* Delete this handler. */
- root = file_inode(file)->i_sb->s_root;
- inode_lock(d_inode(root));
+ inode = d_inode(inode->i_sb->s_root);
+ inode_lock(inode);
+ /*
+ * In order to add new elements to or remove elements from the list
+ * via bm_{entry,register,status}_write(), inode_lock() on the
+ * root inode must be held.
+ * The lock is exclusive, ensuring that the list can't be
+ * modified. Only load_misc_binary() can access the list
+ * concurrently, and it does so read-only. So we only need to take
+ * the write lock when we actually remove the entry from the list.
+ */
if (!list_empty(&e->list))
- kill_node(e);
+ remove_binfmt_handler(i_binfmt_misc(inode), e);
- inode_unlock(d_inode(root));
+ inode_unlock(inode);
break;
default:
return res;
@@ -647,6 +810,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
struct inode *inode;
struct super_block *sb = file_inode(file)->i_sb;
struct dentry *root = sb->s_root, *dentry;
+ struct binfmt_misc *misc;
int err = 0;
struct file *f = NULL;
@@ -656,7 +820,18 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
return PTR_ERR(e);
if (e->flags & MISC_FMT_OPEN_FILE) {
+ const struct cred *old_cred;
+
+ /*
+ * Now that we support unprivileged binfmt_misc mounts, make
+ * sure we use the credentials that the register @file was
+ * opened with to also open the interpreter. Before that this
+ * didn't matter much as only a privileged process could open
+ * the register file.
+ */
+ old_cred = override_creds(file->f_cred);
f = open_exec(e->interpreter);
+ revert_creds(old_cred);
if (IS_ERR(f)) {
pr_notice("register: failed to install interpreter file %s\n",
e->interpreter);
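
The credential handling above is a save/override/restore bracket: take the creds the register file was opened with, install them around open_exec(), then put the old ones back. A rough user-space analogue of the same discipline, using effective UIDs instead of struct cred (run_as and target_uid are illustrative names, and restoring the UID typically requires sufficient privilege):

#include <unistd.h>
#include <stdio.h>

/* Run cb() with the effective UID temporarily switched to target_uid,
 * then restore the caller's effective UID -- the same bracket shape as
 * override_creds()/revert_creds() in the hunk above. */
static int run_as(uid_t target_uid, int (*cb)(void))
{
	uid_t old = geteuid();
	int ret;

	if (seteuid(target_uid) != 0)
		return -1;
	ret = cb();
	if (seteuid(old) != 0)
		perror("seteuid restore");
	return ret;
}
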
@@ -682,21 +857,16 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
if (!inode)
goto out2;
- err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count);
- if (err) {
- iput(inode);
- inode = NULL;
- goto out2;
- }
-
+ refcount_set(&e->users, 1);
e->dentry = dget(dentry);
inode->i_private = e;
inode->i_fop = &bm_entry_operations;
d_instantiate(dentry, inode);
- write_lock(&entries_lock);
- list_add(&e->list, &entries);
- write_unlock(&entries_lock);
+ misc = i_binfmt_misc(inode);
+ write_lock(&misc->entries_lock);
+ list_add(&e->list, &misc->entries);
+ write_unlock(&misc->entries_lock);
err = 0;
out2:
@@ -723,35 +893,50 @@ static const struct file_operations bm_register_operations = {
static ssize_t
bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
{
- char *s = enabled ? "enabled\n" : "disabled\n";
+ struct binfmt_misc *misc;
+ char *s;
+ misc = i_binfmt_misc(file_inode(file));
+ s = misc->enabled ? "enabled\n" : "disabled\n";
return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
}
static ssize_t bm_status_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
+ struct binfmt_misc *misc;
int res = parse_command(buffer, count);
- struct dentry *root;
+ Node *e, *next;
+ struct inode *inode;
+ misc = i_binfmt_misc(file_inode(file));
switch (res) {
case 1:
/* Disable all handlers. */
- enabled = 0;
+ misc->enabled = false;
break;
case 2:
/* Enable all handlers. */
- enabled = 1;
+ misc->enabled = true;
break;
case 3:
/* Delete all handlers. */
- root = file_inode(file)->i_sb->s_root;
- inode_lock(d_inode(root));
+ inode = d_inode(file_inode(file)->i_sb->s_root);
+ inode_lock(inode);
- while (!list_empty(&entries))
- kill_node(list_first_entry(&entries, Node, list));
+ /*
+ * In order to add new elements to or remove elements from the list
+ * via bm_{entry,register,status}_write(), inode_lock() on the
+ * root inode must be held.
+ * The lock is exclusive, ensuring that the list can't be
+ * modified. Only load_misc_binary() can access the list
+ * concurrently, and it does so read-only. So we only need to take
+ * the write lock when we actually remove the entry from the list.
+ */
+ list_for_each_entry_safe(e, next, &misc->entries, list)
+ remove_binfmt_handler(misc, e);
- inode_unlock(d_inode(root));
+ inode_unlock(inode);
break;
default:
return res;
@@ -768,32 +953,100 @@ static const struct file_operations bm_status_operations = {
/* Superblock handling */
+static void bm_put_super(struct super_block *sb)
+{
+ struct user_namespace *user_ns = sb->s_fs_info;
+
+ sb->s_fs_info = NULL;
+ put_user_ns(user_ns);
+}
+
static const struct super_operations s_ops = {
.statfs = simple_statfs,
.evict_inode = bm_evict_inode,
+ .put_super = bm_put_super,
};
static int bm_fill_super(struct super_block *sb, struct fs_context *fc)
{
int err;
+ struct user_namespace *user_ns = sb->s_user_ns;
+ struct binfmt_misc *misc;
static const struct tree_descr bm_files[] = {
[2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO},
[3] = {"register", &bm_register_operations, S_IWUSR},
/* last one */ {""}
};
+ if (WARN_ON(user_ns != current_user_ns()))
+ return -EINVAL;
+
+ /*
+ * Lazily allocate a new binfmt_misc instance for this namespace, i.e.
+ * do it here during the first mount of binfmt_misc. We don't need to
+ * waste memory for every user namespace allocation. It's likely much
+ * more common to not mount a separate binfmt_misc instance than it is
+ * to mount one.
+ *
+ * While multiple superblocks can exist, they are keyed by userns in
+ * s_fs_info for binfmt_misc. Hence, the vfs guarantees that
+ * bm_fill_super() is called exactly once whenever a binfmt_misc
+ * superblock for a userns is created. This in turn lets us conclude
+ * that when a binfmt_misc superblock is created for the first time for
+ * a userns there's no one racing us. Therefore we don't need any
+ * barriers when we dereference binfmt_misc.
+ */
+ misc = user_ns->binfmt_misc;
+ if (!misc) {
+ /*
+ * If it turns out that most user namespaces actually want to
+ * register their own binary type handler and therefore all
+ * create their own separate binfmt_misc mounts, we should
+ * consider turning this into a kmem cache.
+ */
+ misc = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL);
+ if (!misc)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&misc->entries);
+ rwlock_init(&misc->entries_lock);
+
+ /* Pairs with smp_load_acquire() in load_binfmt_misc(). */
+ smp_store_release(&user_ns->binfmt_misc, misc);
+ }
+
+ /*
+ * When the binfmt_misc superblock for this userns is shut down,
+ * ->enabled might have been set to false and we don't reinitialize
+ * ->enabled again in put_super() as someone might already be mounting
+ * binfmt_misc again. It also would be pointless since by the time
+ * ->put_super() is called we know that the binary type list for this
+ * binfmt_misc mount is empty, making load_misc_binary() return
+ * -ENOEXEC independent of whether ->enabled is true. Instead, if
+ * someone mounts binfmt_misc for the first time or again we simply
+ * reset ->enabled to true.
+ */
+ misc->enabled = true;
+
err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
if (!err)
sb->s_op = &s_ops;
return err;
}
+static void bm_free(struct fs_context *fc)
+{
+ if (fc->s_fs_info)
+ put_user_ns(fc->s_fs_info);
+}
+
static int bm_get_tree(struct fs_context *fc)
{
- return get_tree_single(fc, bm_fill_super);
+ return get_tree_keyed(fc, bm_fill_super, get_user_ns(fc->user_ns));
}
static const struct fs_context_operations bm_context_ops = {
+ .free = bm_free,
.get_tree = bm_get_tree,
};
@@ -812,6 +1065,7 @@ static struct file_system_type bm_fs_type = {
.owner = THIS_MODULE,
.name = "binfmt_misc",
.init_fs_context = bm_init_fs_context,
+ .fs_flags = FS_USERNS_MOUNT,
.kill_sb = kill_litter_super,
};
MODULE_ALIAS_FS("binfmt_misc");
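
With FS_USERNS_MOUNT set and the superblock keyed by user namespace via get_tree_keyed(), an unprivileged user can mount a private binfmt_misc instance inside a new user namespace. The following is a rough user-space sketch of that flow, not a definitive recipe: it assumes a kernel carrying this series and procfs mounted at /proc, and some configurations may additionally require writing uid_map/setgroups before mounting.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (unshare(CLONE_NEWUSER | CLONE_NEWNS) != 0) {
		perror("unshare");
		return 1;
	}
	/* Keep new mounts from propagating back to the parent namespace. */
	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) != 0) {
		perror("make / private");
		return 1;
	}
	/* The new userns owns this superblock; handlers registered here are
	 * invisible to the initial namespace. */
	if (mount("binfmt_misc", "/proc/sys/fs/binfmt_misc", "binfmt_misc",
		  0, NULL) != 0) {
		perror("mount binfmt_misc");
		return 1;
	}
	puts("per-namespace binfmt_misc mounted");
	return 0;
}
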
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index a25c9910d90b..4fb925e8c981 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -48,27 +48,6 @@ config BTRFS_FS_POSIX_ACL
If you don't know what Access Control Lists are, say N
-config BTRFS_FS_CHECK_INTEGRITY
- bool "Btrfs with integrity check tool compiled in (DEPRECATED)"
- depends on BTRFS_FS
- help
- This feature has been deprecated and will be removed in 6.7.
-
- Adds code that examines all block write requests (including
- writes of the super block). The goal is to verify that the
- state of the filesystem on disk is always consistent, i.e.,
- after a power-loss or kernel panic event the filesystem is
- in a consistent state.
-
- If the integrity check tool is included and activated in
- the mount options, plenty of kernel memory is used, and
- plenty of additional CPU cycles are spent. Enabling this
- functionality is not intended for normal use.
-
- In most cases, unless you are a btrfs developer who needs
- to verify the integrity of (super)-block write requests
- during the run of a regression test, say N
-
config BTRFS_FS_RUN_SANITY_TESTS
bool "Btrfs will run sanity tests upon loading"
depends on BTRFS_FS
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 90d53209755b..525af975f61c 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -33,10 +33,9 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
- lru_cache.o
+ lru_cache.o raid-stripe-tree.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
-btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
btrfs-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 8cfc8214109c..aa0844535644 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -4,6 +4,7 @@
#define BTRFS_ACCESSORS_H
#include <linux/stddef.h>
+#include <asm/unaligned.h>
struct btrfs_map_token {
struct extent_buffer *eb;
@@ -305,6 +306,14 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
+BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8);
+BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64);
+BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_encoding,
+ struct btrfs_stripe_extent, encoding, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64);
+
/* struct btrfs_dev_extent */
BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64);
BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
@@ -349,6 +358,9 @@ BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 3
BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32);
+BTRFS_SETGET_FUNCS(extent_owner_ref_root_id, struct btrfs_extent_owner_ref,
+ root_id, 64);
+
BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
type, 8);
BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
@@ -365,6 +377,8 @@ static inline u32 btrfs_extent_inline_ref_size(int type)
if (type == BTRFS_EXTENT_DATA_REF_KEY)
return sizeof(struct btrfs_extent_data_ref) +
offsetof(struct btrfs_extent_inline_ref, offset);
+ if (type == BTRFS_EXTENT_OWNER_REF_KEY)
+ return sizeof(struct btrfs_extent_inline_ref);
return 0;
}
@@ -966,6 +980,8 @@ BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
flags, 64);
BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item,
rescan, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_enable_gen, struct btrfs_qgroup_status_item,
+ enable_gen, 64);
/* btrfs_qgroup_info_item */
BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index ce083e99ef68..9e261aac671e 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -9,6 +9,7 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/freezer.h>
+#include <trace/events/btrfs.h>
#include "async-thread.h"
#include "ctree.h"
@@ -242,7 +243,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
break;
trace_btrfs_ordered_sched(work);
spin_unlock_irqrestore(lock, flags);
- work->ordered_func(work);
+ work->ordered_func(work, false);
/* now take the lock again and drop our item from the list */
spin_lock_irqsave(lock, flags);
@@ -277,7 +278,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
* We don't want to call the ordered free functions with
* the lock held.
*/
- work->ordered_free(work);
+ work->ordered_func(work, true);
/* NB: work must not be dereferenced past this point. */
trace_btrfs_all_work_done(wq->fs_info, work);
}
@@ -285,7 +286,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
spin_unlock_irqrestore(lock, flags);
if (free_self) {
- self->ordered_free(self);
+ self->ordered_func(self, true);
/* NB: self must not be dereferenced past this point. */
trace_btrfs_all_work_done(wq->fs_info, self);
}
@@ -300,7 +301,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
/*
* We should not touch things inside work in the following cases:
- * 1) after work->func() if it has no ordered_free
+ * 1) after work->func() if it has no ordered_func(..., true) to free
* Since the struct is freed in work->func().
* 2) after setting WORK_DONE_BIT
* The work may be freed in other threads almost instantly.
@@ -329,11 +330,10 @@ static void btrfs_work_helper(struct work_struct *normal_work)
}
void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
- btrfs_func_t ordered_func, btrfs_func_t ordered_free)
+ btrfs_ordered_func_t ordered_func)
{
work->func = func;
work->ordered_func = ordered_func;
- work->ordered_free = ordered_free;
INIT_WORK(&work->normal_work, btrfs_work_helper);
INIT_LIST_HEAD(&work->ordered_list);
work->flags = 0;
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 30f66c5e2e6e..62b8a0d57898 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -13,11 +13,11 @@ struct btrfs_fs_info;
struct btrfs_workqueue;
struct btrfs_work;
typedef void (*btrfs_func_t)(struct btrfs_work *arg);
+typedef void (*btrfs_ordered_func_t)(struct btrfs_work *arg, bool);
struct btrfs_work {
btrfs_func_t func;
- btrfs_func_t ordered_func;
- btrfs_func_t ordered_free;
+ btrfs_ordered_func_t ordered_func;
/* Don't touch things below */
struct work_struct normal_work;
@@ -35,7 +35,7 @@ struct btrfs_workqueue *btrfs_alloc_ordered_workqueue(
struct btrfs_fs_info *fs_info, const char *name,
unsigned int flags);
void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
- btrfs_func_t ordered_func, btrfs_func_t ordered_free);
+ btrfs_ordered_func_t ordered_func);
void btrfs_queue_work(struct btrfs_workqueue *wq,
struct btrfs_work *work);
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
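
The API change above folds the old ordered_free callback into ordered_func by adding a bool parameter: the same callback is invoked once to run the ordered portion and once more with do_free == true to release the work item. A small, self-contained user-space illustration of that convention (struct work, my_ordered and payload are illustrative names, not btrfs code):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct work {
	void (*ordered_func)(struct work *w, bool do_free);
	int payload;
};

/* One callback now serves both the "run ordered part" and the
 * "free the work item" roles, selected by do_free. */
static void my_ordered(struct work *w, bool do_free)
{
	if (do_free) {
		free(w);
		return;
	}
	printf("ordered work: payload=%d\n", w->payload);
}

int main(void)
{
	struct work *w = malloc(sizeof(*w));

	if (!w)
		return 1;
	w->ordered_func = my_ordered;
	w->payload = 42;
	w->ordered_func(w, false);	/* run the ordered portion */
	w->ordered_func(w, true);	/* final call frees the item */
	return 0;
}

This mirrors how run_one_async_done() in the bio.c hunk below frees the async_submit_bio when called with do_free set, replacing the separate run_one_async_free() callback.
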
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index a4a809efc92f..beed7e459dab 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1129,6 +1129,9 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
count, sc, GFP_NOFS);
break;
}
+ case BTRFS_EXTENT_OWNER_REF_KEY:
+ ASSERT(btrfs_fs_incompat(ctx->fs_info, SIMPLE_QUOTA));
+ break;
default:
WARN_ON(1);
}
@@ -2998,7 +3001,7 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
}
void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_backref_cache *cache, int is_reloc)
+ struct btrfs_backref_cache *cache, bool is_reloc)
{
int i;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 71d535e03dca..ab4ca0eda605 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -247,7 +247,7 @@ struct prelim_ref {
struct rb_node rbnode;
u64 root_id;
struct btrfs_key key_for_search;
- int level;
+ u8 level;
int count;
struct extent_inode_elem *inode_list;
u64 parent;
@@ -440,11 +440,11 @@ struct btrfs_backref_cache {
* Reloction backref cache require more info for reloc root compared
* to generic backref cache.
*/
- unsigned int is_reloc;
+ bool is_reloc;
};
void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_backref_cache *cache, int is_reloc);
+ struct btrfs_backref_cache *cache, bool is_reloc);
struct btrfs_backref_node *btrfs_backref_alloc_node(
struct btrfs_backref_cache *cache, u64 bytenr, int level);
struct btrfs_backref_edge *btrfs_backref_alloc_edge(
@@ -533,9 +533,9 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
void btrfs_backref_release_cache(struct btrfs_backref_cache *cache);
static inline void btrfs_backref_panic(struct btrfs_fs_info *fs_info,
- u64 bytenr, int errno)
+ u64 bytenr, int error)
{
- btrfs_panic(fs_info, errno,
+ btrfs_panic(fs_info, error,
"Inconsistency in backref cache found at offset %llu",
bytenr);
}
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 12b12443efaa..4f3b693a16b1 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -10,11 +10,11 @@
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
-#include "check-integrity.h"
#include "dev-replace.h"
#include "rcu-string.h"
#include "zoned.h"
#include "file-item.h"
+#include "raid-stripe-tree.h"
static struct bio_set btrfs_bioset;
static struct bio_set btrfs_clone_bioset;
@@ -416,6 +416,9 @@ static void btrfs_orig_write_end_io(struct bio *bio)
else
bio->bi_status = BLK_STS_OK;
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
+ stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
btrfs_orig_bbio_end_io(bbio);
btrfs_put_bioc(bioc);
}
@@ -427,6 +430,8 @@ static void btrfs_clone_write_end_io(struct bio *bio)
if (bio->bi_status) {
atomic_inc(&stripe->bioc->error);
btrfs_log_dev_io_error(bio, stripe->dev);
+ } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
}
/* Pass on control to the original bio this one was cloned from */
@@ -463,8 +468,6 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
dev->devid, bio->bi_iter.bi_size);
- btrfsic_check_bio(bio);
-
if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
blkcg_punt_bio_submit(bio);
else
@@ -490,6 +493,7 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
bio->bi_private = &bioc->stripes[dev_nr];
bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
bioc->stripes[dev_nr].bioc = bioc;
+ bioc->size = bio->bi_iter.bi_size;
btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}
@@ -499,6 +503,8 @@ static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
if (!bioc) {
/* Single mirror read/write fast path. */
btrfs_bio(bio)->mirror_num = mirror_num;
+ if (bio_op(bio) != REQ_OP_READ)
+ btrfs_bio(bio)->orig_physical = smap->physical;
bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
if (bio_op(bio) != REQ_OP_READ)
btrfs_bio(bio)->orig_physical = smap->physical;
@@ -568,13 +574,20 @@ static void run_one_async_start(struct btrfs_work *work)
*
* At IO completion time the csums attached on the ordered extent record are
* inserted into the tree.
+ *
+ * If called with @do_free == true, then it will free the work struct.
*/
-static void run_one_async_done(struct btrfs_work *work)
+static void run_one_async_done(struct btrfs_work *work, bool do_free)
{
struct async_submit_bio *async =
container_of(work, struct async_submit_bio, work);
struct bio *bio = &async->bbio->bio;
+ if (do_free) {
+ kfree(container_of(work, struct async_submit_bio, work));
+ return;
+ }
+
/* If an error occurred we just want to clean up the bio and move on. */
if (bio->bi_status) {
btrfs_orig_bbio_end_io(async->bbio);
@@ -590,11 +603,6 @@ static void run_one_async_done(struct btrfs_work *work)
__btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}
-static void run_one_async_free(struct btrfs_work *work)
-{
- kfree(container_of(work, struct async_submit_bio, work));
-}
-
static bool should_async_write(struct btrfs_bio *bbio)
{
/* Submit synchronously if the checksum implementation is fast. */
@@ -636,8 +644,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
async->smap = *smap;
async->mirror_num = mirror_num;
- btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
- run_one_async_free);
+ btrfs_init_work(&async->work, run_one_async_start, run_one_async_done);
btrfs_queue_work(fs_info->workers, &async->work);
return true;
}
@@ -657,9 +664,11 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
blk_status_t ret;
int error;
+ smap.is_scrub = !bbio->inode;
+
btrfs_bio_counter_inc_blocked(fs_info);
error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
- &bioc, &smap, &mirror_num, 1);
+ &bioc, &smap, &mirror_num);
if (error) {
ret = errno_to_blk_status(error);
goto fail;
@@ -691,6 +700,18 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
bio->bi_opf |= REQ_OP_ZONE_APPEND;
}
+ if (is_data_bbio(bbio) && bioc &&
+ btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
+ /*
+ * No locking for the list update, as we only add to
+ * the list in the I/O submission path, and list
+ * iteration only happens in the completion path, which
+ * can't happen until after the last submission.
+ */
+ btrfs_get_bioc(bioc);
+ list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
+ }
+
/*
* Csum items for reloc roots have already been cloned at this
* point, so they are handled as part of the no-checksum case.
@@ -779,8 +800,6 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
__bio_add_page(&bio, page, length, pg_offset);
-
- btrfsic_check_bio(&bio);
ret = submit_bio_wait(&bio);
if (ret) {
/* try to remap that extent elsewhere? */
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index b2e5107b7cec..6e5dc68ff661 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -935,7 +935,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
caching_ctl->block_group = cache;
refcount_set(&caching_ctl->count, 2);
atomic_set(&caching_ctl->progress, 0);
- btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
+ btrfs_init_work(&caching_ctl->work, caching_thread, NULL);
spin_lock(&cache->lock);
if (cache->cached != BTRFS_CACHE_NO) {
@@ -1286,7 +1286,7 @@ out:
/* Once for the lookup reference */
btrfs_put_block_group(block_group);
if (remove_rsv)
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
btrfs_free_path(path);
return ret;
}
@@ -2601,7 +2601,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans,
btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
btrfs_set_dev_extent_length(leaf, extent, num_bytes);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
return ret;
@@ -2709,7 +2709,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
/* Already aborted the transaction if it failed. */
next:
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
list_del_init(&block_group->bg_list);
clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
}
@@ -2819,8 +2819,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
#endif
list_add_tail(&cache->bg_list, &trans->new_bgs);
- trans->delayed_ref_updates++;
- btrfs_update_delayed_refs_rsv(trans);
+ btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info);
set_avail_alloc_bits(fs_info, type);
return cache;
@@ -3025,7 +3024,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
cache->global_root_id);
btrfs_set_stack_block_group_flags(&bgi, cache->flags);
write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
fail:
btrfs_release_path(path);
/*
@@ -3051,7 +3050,6 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_root *root = fs_info->tree_root;
struct inode *inode = NULL;
struct extent_changeset *data_reserved = NULL;
u64 alloc_hint = 0;
@@ -3103,7 +3101,7 @@ again:
* time.
*/
BTRFS_I(inode)->generation = 0;
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret) {
/*
* So theoretically we could recover from this, simply set the
@@ -3370,7 +3368,7 @@ again:
if (should_put)
btrfs_put_block_group(cache);
if (drop_reserve)
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
/*
* Avoid blocking other tasks for too long. It might even save
* us from writing caches for block groups that are going to be
@@ -3474,8 +3472,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
cache_save_setup(cache, trans, path);
if (!ret)
- ret = btrfs_run_delayed_refs(trans,
- (unsigned long) -1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
cache->io_ctl.inode = NULL;
@@ -3518,7 +3515,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
/* If its not on the io list, we need to put the block group */
if (should_put)
btrfs_put_block_group(cache);
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
spin_lock(&cur_trans->dirty_bgs_lock);
}
spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -3543,12 +3540,12 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, bool alloc)
{
struct btrfs_fs_info *info = trans->fs_info;
- struct btrfs_block_group *cache = NULL;
- u64 total = num_bytes;
+ struct btrfs_space_info *space_info;
+ struct btrfs_block_group *cache;
u64 old_val;
- u64 byte_in_group;
+ bool reclaim = false;
+ bool bg_already_dirty = true;
int factor;
- int ret = 0;
/* Block accounting for super block */
spin_lock(&info->delalloc_root_lock);
@@ -3560,97 +3557,86 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
btrfs_set_super_bytes_used(info->super_copy, old_val);
spin_unlock(&info->delalloc_root_lock);
- while (total) {
- struct btrfs_space_info *space_info;
- bool reclaim = false;
-
- cache = btrfs_lookup_block_group(info, bytenr);
- if (!cache) {
- ret = -ENOENT;
- break;
- }
- space_info = cache->space_info;
- factor = btrfs_bg_type_to_factor(cache->flags);
+ cache = btrfs_lookup_block_group(info, bytenr);
+ if (!cache)
+ return -ENOENT;
- /*
- * If this block group has free space cache written out, we
- * need to make sure to load it if we are removing space. This
- * is because we need the unpinning stage to actually add the
- * space back to the block group, otherwise we will leak space.
- */
- if (!alloc && !btrfs_block_group_done(cache))
- btrfs_cache_block_group(cache, true);
+ /* An extent cannot span multiple block groups. */
+ ASSERT(bytenr + num_bytes <= cache->start + cache->length);
- byte_in_group = bytenr - cache->start;
- WARN_ON(byte_in_group > cache->length);
+ space_info = cache->space_info;
+ factor = btrfs_bg_type_to_factor(cache->flags);
- spin_lock(&space_info->lock);
- spin_lock(&cache->lock);
+ /*
+ * If this block group has free space cache written out, we need to make
+ * sure to load it if we are removing space. This is because we need
+ * the unpinning stage to actually add the space back to the block group,
+ * otherwise we will leak space.
+ */
+ if (!alloc && !btrfs_block_group_done(cache))
+ btrfs_cache_block_group(cache, true);
- if (btrfs_test_opt(info, SPACE_CACHE) &&
- cache->disk_cache_state < BTRFS_DC_CLEAR)
- cache->disk_cache_state = BTRFS_DC_CLEAR;
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
- old_val = cache->used;
- num_bytes = min(total, cache->length - byte_in_group);
- if (alloc) {
- old_val += num_bytes;
- cache->used = old_val;
- cache->reserved -= num_bytes;
- space_info->bytes_reserved -= num_bytes;
- space_info->bytes_used += num_bytes;
- space_info->disk_used += num_bytes * factor;
- spin_unlock(&cache->lock);
- spin_unlock(&space_info->lock);
- } else {
- old_val -= num_bytes;
- cache->used = old_val;
- cache->pinned += num_bytes;
- btrfs_space_info_update_bytes_pinned(info, space_info,
- num_bytes);
- space_info->bytes_used -= num_bytes;
- space_info->disk_used -= num_bytes * factor;
+ if (btrfs_test_opt(info, SPACE_CACHE) &&
+ cache->disk_cache_state < BTRFS_DC_CLEAR)
+ cache->disk_cache_state = BTRFS_DC_CLEAR;
- reclaim = should_reclaim_block_group(cache, num_bytes);
+ old_val = cache->used;
+ if (alloc) {
+ old_val += num_bytes;
+ cache->used = old_val;
+ cache->reserved -= num_bytes;
+ space_info->bytes_reserved -= num_bytes;
+ space_info->bytes_used += num_bytes;
+ space_info->disk_used += num_bytes * factor;
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
+ } else {
+ old_val -= num_bytes;
+ cache->used = old_val;
+ cache->pinned += num_bytes;
+ btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes);
+ space_info->bytes_used -= num_bytes;
+ space_info->disk_used -= num_bytes * factor;
- spin_unlock(&cache->lock);
- spin_unlock(&space_info->lock);
+ reclaim = should_reclaim_block_group(cache, num_bytes);
- set_extent_bit(&trans->transaction->pinned_extents,
- bytenr, bytenr + num_bytes - 1,
- EXTENT_DIRTY, NULL);
- }
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
- spin_lock(&trans->transaction->dirty_bgs_lock);
- if (list_empty(&cache->dirty_list)) {
- list_add_tail(&cache->dirty_list,
- &trans->transaction->dirty_bgs);
- trans->delayed_ref_updates++;
- btrfs_get_block_group(cache);
- }
- spin_unlock(&trans->transaction->dirty_bgs_lock);
+ set_extent_bit(&trans->transaction->pinned_extents, bytenr,
+ bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
+ }
- /*
- * No longer have used bytes in this block group, queue it for
- * deletion. We do this after adding the block group to the
- * dirty list to avoid races between cleaner kthread and space
- * cache writeout.
- */
- if (!alloc && old_val == 0) {
- if (!btrfs_test_opt(info, DISCARD_ASYNC))
- btrfs_mark_bg_unused(cache);
- } else if (!alloc && reclaim) {
- btrfs_mark_bg_to_reclaim(cache);
- }
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (list_empty(&cache->dirty_list)) {
+ list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs);
+ bg_already_dirty = false;
+ btrfs_get_block_group(cache);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
- btrfs_put_block_group(cache);
- total -= num_bytes;
- bytenr += num_bytes;
+ /*
+ * No longer have used bytes in this block group, queue it for deletion.
+ * We do this after adding the block group to the dirty list to avoid
+ * races between cleaner kthread and space cache writeout.
+ */
+ if (!alloc && old_val == 0) {
+ if (!btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_mark_bg_unused(cache);
+ } else if (!alloc && reclaim) {
+ btrfs_mark_bg_to_reclaim(cache);
}
+ btrfs_put_block_group(cache);
+
/* Modified block groups are accounted for in the delayed_refs_rsv. */
- btrfs_update_delayed_refs_rsv(trans);
- return ret;
+ if (!bg_already_dirty)
+ btrfs_inc_delayed_refs_rsv_bg_updates(info);
+
+ return 0;
}
/*
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 77684c5e0c8b..ceb5f586a2d5 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -221,7 +221,8 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
if (num_bytes == 0)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
+ num_bytes, flush);
if (!ret)
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
@@ -261,7 +262,8 @@ int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
if (!ret)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
+ num_bytes, flush);
if (!ret) {
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
return 0;
@@ -279,10 +281,10 @@ u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *target = NULL;
/*
- * If we are the delayed_rsv then push to the global rsv, otherwise dump
- * into the delayed rsv if it is not full.
+ * If we are a delayed block reserve then push to the global rsv,
+ * otherwise dump into the global delayed reserve if it is not full.
*/
- if (block_rsv == delayed_rsv)
+ if (block_rsv->type == BTRFS_BLOCK_RSV_DELOPS)
target = global_rsv;
else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv))
target = delayed_rsv;
@@ -354,6 +356,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
min_items++;
}
+ if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) {
+ num_bytes += btrfs_root_used(&fs_info->stripe_root->root_item);
+ min_items++;
+ }
+
/*
* But we also want to reserve enough space so we can do the fallback
* global reserve for an unlink, which is an additional
@@ -405,6 +412,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
case BTRFS_EXTENT_TREE_OBJECTID:
case BTRFS_FREE_SPACE_TREE_OBJECTID:
case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
+ case BTRFS_RAID_STRIPE_TREE_OBJECTID:
root->block_rsv = &fs_info->delayed_refs_rsv;
break;
case BTRFS_ROOT_TREE_OBJECTID:
@@ -517,8 +525,8 @@ again:
block_rsv->type, ret);
}
try_reserve:
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize,
- BTRFS_RESERVE_NO_FLUSH);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
+ blocksize, BTRFS_RESERVE_NO_FLUSH);
if (!ret)
return block_rsv;
/*
@@ -539,7 +547,7 @@ try_reserve:
* one last time to force a reservation if there's enough actual space
* on disk to make the reservation.
*/
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize,
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, blocksize,
BTRFS_RESERVE_FLUSH_EMERGENCY);
if (!ret)
return block_rsv;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index bda1fdbba666..5572ae52444e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -8,6 +8,8 @@
#include <linux/hash.h>
#include <linux/refcount.h>
+#include <linux/fscrypt.h>
+#include <trace/events/btrfs.h>
#include "extent_map.h"
#include "extent_io.h"
#include "ordered-data.h"
@@ -79,11 +81,21 @@ struct btrfs_inode {
*/
struct btrfs_key location;
+ /* Cached value of inode property 'compression'. */
+ u8 prop_compress;
+
+ /*
+ * Force compression on the file using the defrag ioctl, could be
+ * different from prop_compress and takes precedence if set.
+ */
+ u8 defrag_compress;
+
/*
* Lock for counters and all fields used to determine if the inode is in
* the log or not (last_trans, last_sub_trans, last_log_commit,
- * logged_trans), to access/update new_delalloc_bytes and to update the
- * VFS' inode number of bytes used.
+ * logged_trans), to access/update delalloc_bytes, new_delalloc_bytes,
+ * defrag_bytes, disk_i_size, outstanding_extents, csum_bytes and to
+ * update the VFS' inode number of bytes used.
*/
spinlock_t lock;
@@ -102,8 +114,18 @@ struct btrfs_inode {
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;
+ /*
+ * Counters to keep track of the number of extent item's we may use due
+ * to delalloc and such. outstanding_extents is the number of extent
+ * items we think we'll end up using, and reserved_extents is the number
+ * of extent items we've reserved metadata for. Protected by 'lock'.
+ */
+ unsigned outstanding_extents;
+
/* used to order data wrt metadata */
- struct btrfs_ordered_inode_tree ordered_tree;
+ spinlock_t ordered_tree_lock;
+ struct rb_root ordered_tree;
+ struct rb_node *ordered_tree_last;
/* list of all the delalloc inodes in the FS. There are times we need
* to write all the delalloc pages to disk, and this list is used
@@ -122,28 +144,31 @@ struct btrfs_inode {
u64 generation;
/*
- * transid of the trans_handle that last modified this inode
+ * ID of the transaction handle that last modified this inode.
+ * Protected by 'lock'.
*/
u64 last_trans;
/*
- * transid that last logged this inode
+ * ID of the transaction that last logged this inode.
+ * Protected by 'lock'.
*/
u64 logged_trans;
/*
- * log transid when this inode was last modified
+ * Log transaction ID when this inode was last modified.
+ * Protected by 'lock'.
*/
int last_sub_trans;
- /* a local copy of root's last_log_commit */
+ /* A local copy of root's last_log_commit. Protected by 'lock'. */
int last_log_commit;
union {
/*
* Total number of bytes pending delalloc, used by stat to
* calculate the real block usage of the file. This is used
- * only for files.
+ * only for files. Protected by 'lock'.
*/
u64 delalloc_bytes;
/*
@@ -161,7 +186,7 @@ struct btrfs_inode {
* Total number of bytes pending delalloc that fall within a file
* range that is either a hole or beyond EOF (and no prealloc extent
* exists in the range). This is always <= delalloc_bytes and this
- * is used only for files.
+ * is used only for files. Protected by 'lock'.
*/
u64 new_delalloc_bytes;
/*
@@ -172,15 +197,15 @@ struct btrfs_inode {
};
/*
- * total number of bytes pending defrag, used by stat to check whether
- * it needs COW.
+ * Total number of bytes pending defrag, used by stat to check whether
+ * it needs COW. Protected by 'lock'.
*/
u64 defrag_bytes;
/*
- * the size of the file stored in the metadata on disk. data=ordered
+ * The size of the file stored in the metadata on disk. data=ordered
* means the in-memory i_size might be larger than the size on disk
- * because not all the blocks are written yet.
+ * because not all the blocks are written yet. Protected by 'lock'.
*/
u64 disk_i_size;
@@ -214,7 +239,7 @@ struct btrfs_inode {
/*
* Number of bytes outstanding that are going to need csums. This is
- * used in ENOSPC accounting.
+ * used in ENOSPC accounting. Protected by 'lock'.
*/
u64 csum_bytes;
@@ -223,30 +248,13 @@ struct btrfs_inode {
/* Read-only compatibility flags, upper half of inode_item::flags */
u32 ro_flags;
- /*
- * Counters to keep track of the number of extent item's we may use due
- * to delalloc and such. outstanding_extents is the number of extent
- * items we think we'll end up using, and reserved_extents is the number
- * of extent items we've reserved metadata for.
- */
- unsigned outstanding_extents;
-
struct btrfs_block_rsv block_rsv;
- /*
- * Cached values of inode properties
- */
- unsigned prop_compress; /* per-file compression algorithm */
- /*
- * Force compression on the file using the defrag ioctl, could be
- * different from prop_compress and takes precedence if set
- */
- unsigned defrag_compress;
-
struct btrfs_delayed_node *delayed_node;
/* File creation time. */
- struct timespec64 i_otime;
+ u64 i_otime_sec;
+ u32 i_otime_nsec;
/* Hook into fs_info->delayed_iputs */
struct list_head delayed_iput;
@@ -387,7 +395,7 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
spin_lock(&inode->lock);
if (inode->logged_trans == generation &&
inode->last_sub_trans <= inode->last_log_commit &&
- inode->last_sub_trans <= inode->root->last_log_commit)
+ inode->last_sub_trans <= btrfs_get_root_last_log_commit(inode->root))
ret = true;
spin_unlock(&inode->lock);
return ret;
@@ -481,9 +489,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset,
u64 start, u64 end);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode);
+ struct btrfs_inode *inode);
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode);
+ struct btrfs_inode *inode);
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode);
int btrfs_orphan_cleanup(struct btrfs_root *root);
int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size);
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
deleted file mode 100644
index 3caf339c4bb3..000000000000
--- a/fs/btrfs/check-integrity.c
+++ /dev/null
@@ -1,2871 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) STRATO AG 2011. All rights reserved.
- */
-
-/*
- * This module can be used to catch cases when the btrfs kernel
- * code executes write requests to the disk that bring the file
- * system in an inconsistent state. In such a state, a power-loss
- * or kernel panic event would cause that the data on disk is
- * lost or at least damaged.
- *
- * Code is added that examines all block write requests during
- * runtime (including writes of the super block). Three rules
- * are verified and an error is printed on violation of the
- * rules:
- * 1. It is not allowed to write a disk block which is
- * currently referenced by the super block (either directly
- * or indirectly).
- * 2. When a super block is written, it is verified that all
- * referenced (directly or indirectly) blocks fulfill the
- * following requirements:
- * 2a. All referenced blocks have either been present when
- * the file system was mounted, (i.e., they have been
- * referenced by the super block) or they have been
- * written since then and the write completion callback
- * was called and no write error was indicated and a
- * FLUSH request to the device where these blocks are
- * located was received and completed.
- * 2b. All referenced blocks need to have a generation
- * number which is equal to the parent's number.
- *
- * One issue that was found using this module was that the log
- * tree on disk became temporarily corrupted because disk blocks
- * that had been in use for the log tree had been freed and
- * reused too early, while being referenced by the written super
- * block.
- *
- * The search term in the kernel log that can be used to filter
- * on the existence of detected integrity issues is
- * "btrfs: attempt".
- *
- * The integrity check is enabled via mount options. These
- * mount options are only supported if the integrity check
- * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
- *
- * Example #1, apply integrity checks to all metadata:
- * mount /dev/sdb1 /mnt -o check_int
- *
- * Example #2, apply integrity checks to all metadata and
- * to data extents:
- * mount /dev/sdb1 /mnt -o check_int_data
- *
- * Example #3, apply integrity checks to all metadata and dump
- * the tree that the super block references to kernel messages
- * each time after a super block was written:
- * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
- *
- * If the integrity check tool is included and activated in
- * the mount options, plenty of kernel memory is used, and
- * plenty of additional CPU cycles are spent. Enabling this
- * functionality is not intended for normal use. In most
- * cases, unless you are a btrfs developer who needs to verify
- * the integrity of (super)-block write requests, do not
- * enable the config option BTRFS_FS_CHECK_INTEGRITY to
- * include and compile the integrity check tool.
- *
- * Expect millions of lines of information in the kernel log with an
- * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the
- * kernel config to at least 26 (which is 64MB). Usually the value is
- * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be
- * changed like this before LOG_BUF_SHIFT can be set to a high value:
- * config LOG_BUF_SHIFT
- * int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
- * range 12 30
- */
-
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/mutex.h>
-#include <linux/blkdev.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <crypto/hash.h>
-#include "messages.h"
-#include "ctree.h"
-#include "disk-io.h"
-#include "transaction.h"
-#include "extent_io.h"
-#include "volumes.h"
-#include "print-tree.h"
-#include "locking.h"
-#include "check-integrity.h"
-#include "rcu-string.h"
-#include "compression.h"
-#include "accessors.h"
-
-#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
-#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
-#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
-#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
-#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
-#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
-#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
-#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters,
- * excluding " [...]" */
-#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
-
-/*
- * The definition of the bitmask fields for the print_mask.
- * They are specified with the mount option check_integrity_print_mask.
- */
-#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001
-#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002
-#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004
-#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008
-#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010
-#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020
-#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040
-#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080
-#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100
-#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200
-#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400
-#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800
-#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000
-#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE 0x00002000
-
-struct btrfsic_dev_state;
-struct btrfsic_state;
-
-struct btrfsic_block {
- u32 magic_num; /* only used for debug purposes */
- unsigned int is_metadata:1; /* if it is meta-data, not data-data */
- unsigned int is_superblock:1; /* if it is one of the superblocks */
- unsigned int is_iodone:1; /* if is done by lower subsystem */
- unsigned int iodone_w_error:1; /* error was indicated to endio */
- unsigned int never_written:1; /* block was added because it was
- * referenced, not because it was
- * written */
- unsigned int mirror_num; /* large enough to hold
- * BTRFS_SUPER_MIRROR_MAX */
- struct btrfsic_dev_state *dev_state;
- u64 dev_bytenr; /* key, physical byte num on disk */
- u64 logical_bytenr; /* logical byte num on disk */
- u64 generation;
- struct btrfs_disk_key disk_key; /* extra info to print in case of
- * issues, will not always be correct */
- struct list_head collision_resolving_node; /* list node */
- struct list_head all_blocks_node; /* list node */
-
- /* the following two lists contain block_link items */
- struct list_head ref_to_list; /* list */
- struct list_head ref_from_list; /* list */
- struct btrfsic_block *next_in_same_bio;
- void *orig_bio_private;
- bio_end_io_t *orig_bio_end_io;
- blk_opf_t submit_bio_bh_rw;
- u64 flush_gen; /* only valid if !never_written */
-};
-
-/*
- * Elements of this type are allocated dynamically and required because
- * each block object can refer to and can be ref from multiple blocks.
- * The key to lookup them in the hashtable is the dev_bytenr of
- * the block ref to plus the one from the block referred from.
- * The fact that they are searchable via a hashtable and that a
- * ref_cnt is maintained is not required for the btrfs integrity
- * check algorithm itself, it is only used to make the output more
- * beautiful in case that an error is detected (an error is defined
- * as a write operation to a block while that block is still referenced).
- */
-struct btrfsic_block_link {
- u32 magic_num; /* only used for debug purposes */
- u32 ref_cnt;
- struct list_head node_ref_to; /* list node */
- struct list_head node_ref_from; /* list node */
- struct list_head collision_resolving_node; /* list node */
- struct btrfsic_block *block_ref_to;
- struct btrfsic_block *block_ref_from;
- u64 parent_generation;
-};
-
-struct btrfsic_dev_state {
- u32 magic_num; /* only used for debug purposes */
- struct block_device *bdev;
- struct btrfsic_state *state;
- struct list_head collision_resolving_node; /* list node */
- struct btrfsic_block dummy_block_for_bio_bh_flush;
- u64 last_flush_gen;
-};
-
-struct btrfsic_block_hashtable {
- struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
-};
-
-struct btrfsic_block_link_hashtable {
- struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
-};
-
-struct btrfsic_dev_state_hashtable {
- struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
-};
-
-struct btrfsic_block_data_ctx {
- u64 start; /* virtual bytenr */
- u64 dev_bytenr; /* physical bytenr on device */
- u32 len;
- struct btrfsic_dev_state *dev;
- char **datav;
- struct page **pagev;
- void *mem_to_free;
-};
-
-/* This structure is used to implement recursion without occupying
- * any stack space, refer to btrfsic_process_metablock() */
-struct btrfsic_stack_frame {
- u32 magic;
- u32 nr;
- int error;
- int i;
- int limit_nesting;
- int num_copies;
- int mirror_num;
- struct btrfsic_block *block;
- struct btrfsic_block_data_ctx *block_ctx;
- struct btrfsic_block *next_block;
- struct btrfsic_block_data_ctx next_block_ctx;
- struct btrfs_header *hdr;
- struct btrfsic_stack_frame *prev;
-};
-
-/* Some state per mounted filesystem */
-struct btrfsic_state {
- u32 print_mask;
- int include_extent_data;
- struct list_head all_blocks_list;
- struct btrfsic_block_hashtable block_hashtable;
- struct btrfsic_block_link_hashtable block_link_hashtable;
- struct btrfs_fs_info *fs_info;
- u64 max_superblock_generation;
- struct btrfsic_block *latest_superblock;
- u32 metablock_size;
- u32 datablock_size;
-};
-
-static int btrfsic_process_metablock(struct btrfsic_state *state,
- struct btrfsic_block *block,
- struct btrfsic_block_data_ctx *block_ctx,
- int limit_nesting, int force_iodone_flag);
-static void btrfsic_read_from_block_data(
- struct btrfsic_block_data_ctx *block_ctx,
- void *dst, u32 offset, size_t len);
-static int btrfsic_create_link_to_next_block(
- struct btrfsic_state *state,
- struct btrfsic_block *block,
- struct btrfsic_block_data_ctx
- *block_ctx, u64 next_bytenr,
- int limit_nesting,
- struct btrfsic_block_data_ctx *next_block_ctx,
- struct btrfsic_block **next_blockp,
- int force_iodone_flag,
- int *num_copiesp, int *mirror_nump,
- struct btrfs_disk_key *disk_key,
- u64 parent_generation);
-static int btrfsic_handle_extent_data(struct btrfsic_state *state,
- struct btrfsic_block *block,
- struct btrfsic_block_data_ctx *block_ctx,
- u32 item_offset, int force_iodone_flag);
-static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
- struct btrfsic_block_data_ctx *block_ctx_out,
- int mirror_num);
-static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
-static int btrfsic_read_block(struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *block_ctx);
-static int btrfsic_process_written_superblock(
- struct btrfsic_state *state,
- struct btrfsic_block *const block,
- struct btrfs_super_block *const super_hdr);
-static void btrfsic_bio_end_io(struct bio *bp);
-static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
- const struct btrfsic_block *block,
- int recursion_level);
-static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
- struct btrfsic_block *const block,
- int recursion_level);
-static void btrfsic_print_add_link(const struct btrfsic_state *state,
- const struct btrfsic_block_link *l);
-static void btrfsic_print_rem_link(const struct btrfsic_state *state,
- const struct btrfsic_block_link *l);
-static char btrfsic_get_block_type(const struct btrfsic_state *state,
- const struct btrfsic_block *block);
-static void btrfsic_dump_tree(const struct btrfsic_state *state);
-static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
- const struct btrfsic_block *block,
- int indent_level);
-static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
- struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *next_block_ctx,
- struct btrfsic_block *next_block,
- struct btrfsic_block *from_block,
- u64 parent_generation);
-static struct btrfsic_block *btrfsic_block_lookup_or_add(
- struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *block_ctx,
- const char *additional_string,
- int is_metadata,
- int is_iodone,
- int never_written,
- int mirror_num,
- int *was_created);
-static int btrfsic_process_superblock_dev_mirror(
- struct btrfsic_state *state,
- struct btrfsic_dev_state *dev_state,
- struct btrfs_device *device,
- int superblock_mirror_num,
- struct btrfsic_dev_state **selected_dev_state,
- struct btrfs_super_block *selected_super);
-static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev);
-static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
- u64 bytenr,
- struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr);
-
-static struct mutex btrfsic_mutex;
-static int btrfsic_is_initialized;
-static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
-
-
-static void btrfsic_block_init(struct btrfsic_block *b)
-{
- b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
- b->dev_state = NULL;
- b->dev_bytenr = 0;
- b->logical_bytenr = 0;
- b->generation = BTRFSIC_GENERATION_UNKNOWN;
- b->disk_key.objectid = 0;
- b->disk_key.type = 0;
- b->disk_key.offset = 0;
- b->is_metadata = 0;
- b->is_superblock = 0;
- b->is_iodone = 0;
- b->iodone_w_error = 0;
- b->never_written = 0;
- b->mirror_num = 0;
- b->next_in_same_bio = NULL;
- b->orig_bio_private = NULL;
- b->orig_bio_end_io = NULL;
- INIT_LIST_HEAD(&b->collision_resolving_node);
- INIT_LIST_HEAD(&b->all_blocks_node);
- INIT_LIST_HEAD(&b->ref_to_list);
- INIT_LIST_HEAD(&b->ref_from_list);
- b->submit_bio_bh_rw = 0;
- b->flush_gen = 0;
-}
-
-static struct btrfsic_block *btrfsic_block_alloc(void)
-{
- struct btrfsic_block *b;
-
- b = kzalloc(sizeof(*b), GFP_NOFS);
- if (NULL != b)
- btrfsic_block_init(b);
-
- return b;
-}
-
-static void btrfsic_block_free(struct btrfsic_block *b)
-{
- BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
- kfree(b);
-}
-
-static void btrfsic_block_link_init(struct btrfsic_block_link *l)
-{
- l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
- l->ref_cnt = 1;
- INIT_LIST_HEAD(&l->node_ref_to);
- INIT_LIST_HEAD(&l->node_ref_from);
- INIT_LIST_HEAD(&l->collision_resolving_node);
- l->block_ref_to = NULL;
- l->block_ref_from = NULL;
-}
-
-static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
-{
- struct btrfsic_block_link *l;
-
- l = kzalloc(sizeof(*l), GFP_NOFS);
- if (NULL != l)
- btrfsic_block_link_init(l);
-
- return l;
-}
-
-static void btrfsic_block_link_free(struct btrfsic_block_link *l)
-{
- BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
- kfree(l);
-}
-
-static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
-{
- ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
- ds->bdev = NULL;
- ds->state = NULL;
- INIT_LIST_HEAD(&ds->collision_resolving_node);
- ds->last_flush_gen = 0;
- btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
- ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
- ds->dummy_block_for_bio_bh_flush.dev_state = ds;
-}
-
-static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
-{
- struct btrfsic_dev_state *ds;
-
- ds = kzalloc(sizeof(*ds), GFP_NOFS);
- if (NULL != ds)
- btrfsic_dev_state_init(ds);
-
- return ds;
-}
-
-static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
-{
- BUG_ON(!(NULL == ds ||
- BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
- kfree(ds);
-}
-
-static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
-{
- int i;
-
- for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
- INIT_LIST_HEAD(h->table + i);
-}
-
-static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
- struct btrfsic_block_hashtable *h)
-{
- const unsigned int hashval =
- (((unsigned int)(b->dev_bytenr >> 16)) ^
- ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
- (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
-
- list_add(&b->collision_resolving_node, h->table + hashval);
-}
-
-static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
-{
- list_del(&b->collision_resolving_node);
-}
-
-static struct btrfsic_block *btrfsic_block_hashtable_lookup(
- struct block_device *bdev,
- u64 dev_bytenr,
- struct btrfsic_block_hashtable *h)
-{
- const unsigned int hashval =
- (((unsigned int)(dev_bytenr >> 16)) ^
- ((unsigned int)((uintptr_t)bdev))) &
- (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
- struct btrfsic_block *b;
-
- list_for_each_entry(b, h->table + hashval, collision_resolving_node) {
- if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
- return b;
- }
-
- return NULL;
-}
-
-static void btrfsic_block_link_hashtable_init(
- struct btrfsic_block_link_hashtable *h)
-{
- int i;
-
- for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
- INIT_LIST_HEAD(h->table + i);
-}
-
-static void btrfsic_block_link_hashtable_add(
- struct btrfsic_block_link *l,
- struct btrfsic_block_link_hashtable *h)
-{
- const unsigned int hashval =
- (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
- ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
- ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
- ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
- & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
-
- BUG_ON(NULL == l->block_ref_to);
- BUG_ON(NULL == l->block_ref_from);
- list_add(&l->collision_resolving_node, h->table + hashval);
-}
-
-static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
-{
- list_del(&l->collision_resolving_node);
-}
-
-static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
- struct block_device *bdev_ref_to,
- u64 dev_bytenr_ref_to,
- struct block_device *bdev_ref_from,
- u64 dev_bytenr_ref_from,
- struct btrfsic_block_link_hashtable *h)
-{
- const unsigned int hashval =
- (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
- ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
- ((unsigned int)((uintptr_t)bdev_ref_to)) ^
- ((unsigned int)((uintptr_t)bdev_ref_from))) &
- (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
- struct btrfsic_block_link *l;
-
- list_for_each_entry(l, h->table + hashval, collision_resolving_node) {
- BUG_ON(NULL == l->block_ref_to);
- BUG_ON(NULL == l->block_ref_from);
- if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
- l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
- l->block_ref_from->dev_state->bdev == bdev_ref_from &&
- l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
- return l;
- }
-
- return NULL;
-}
-
-static void btrfsic_dev_state_hashtable_init(
- struct btrfsic_dev_state_hashtable *h)
-{
- int i;
-
- for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
- INIT_LIST_HEAD(h->table + i);
-}
-
-static void btrfsic_dev_state_hashtable_add(
- struct btrfsic_dev_state *ds,
- struct btrfsic_dev_state_hashtable *h)
-{
- const unsigned int hashval =
- (((unsigned int)((uintptr_t)ds->bdev->bd_dev)) &
- (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
-
- list_add(&ds->collision_resolving_node, h->table + hashval);
-}
-
-static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
-{
- list_del(&ds->collision_resolving_node);
-}
-
-static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev,
- struct btrfsic_dev_state_hashtable *h)
-{
- const unsigned int hashval =
- dev & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1);
- struct btrfsic_dev_state *ds;
-
- list_for_each_entry(ds, h->table + hashval, collision_resolving_node) {
- if (ds->bdev->bd_dev == dev)
- return ds;
- }
-
- return NULL;
-}
-
-static int btrfsic_process_superblock(struct btrfsic_state *state,
- struct btrfs_fs_devices *fs_devices)
-{
- struct btrfs_super_block *selected_super;
- struct list_head *dev_head = &fs_devices->devices;
- struct btrfs_device *device;
- struct btrfsic_dev_state *selected_dev_state = NULL;
- int ret = 0;
- int pass;
-
- selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
- if (!selected_super)
- return -ENOMEM;
-
- list_for_each_entry(device, dev_head, dev_list) {
- int i;
- struct btrfsic_dev_state *dev_state;
-
- if (!device->bdev || !device->name)
- continue;
-
- dev_state = btrfsic_dev_state_lookup(device->bdev->bd_dev);
- BUG_ON(NULL == dev_state);
- for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
- ret = btrfsic_process_superblock_dev_mirror(
- state, dev_state, device, i,
- &selected_dev_state, selected_super);
- if (0 != ret && 0 == i) {
- kfree(selected_super);
- return ret;
- }
- }
- }
-
- if (NULL == state->latest_superblock) {
- pr_info("btrfsic: no superblock found!\n");
- kfree(selected_super);
- return -1;
- }
-
- for (pass = 0; pass < 3; pass++) {
- int num_copies;
- int mirror_num;
- u64 next_bytenr;
-
- switch (pass) {
- case 0:
- next_bytenr = btrfs_super_root(selected_super);
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("root@%llu\n", next_bytenr);
- break;
- case 1:
- next_bytenr = btrfs_super_chunk_root(selected_super);
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("chunk@%llu\n", next_bytenr);
- break;
- case 2:
- next_bytenr = btrfs_super_log_root(selected_super);
- if (0 == next_bytenr)
- continue;
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("log@%llu\n", next_bytenr);
- break;
- }
-
- num_copies = btrfs_num_copies(state->fs_info, next_bytenr,
- state->metablock_size);
- if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
- pr_info("num_copies(log_bytenr=%llu) = %d\n",
- next_bytenr, num_copies);
-
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- struct btrfsic_block *next_block;
- struct btrfsic_block_data_ctx tmp_next_block_ctx;
- struct btrfsic_block_link *l;
-
- ret = btrfsic_map_block(state, next_bytenr,
- state->metablock_size,
- &tmp_next_block_ctx,
- mirror_num);
- if (ret) {
- pr_info("btrfsic: btrfsic_map_block(root @%llu, mirror %d) failed!\n",
- next_bytenr, mirror_num);
- kfree(selected_super);
- return -1;
- }
-
- next_block = btrfsic_block_hashtable_lookup(
- tmp_next_block_ctx.dev->bdev,
- tmp_next_block_ctx.dev_bytenr,
- &state->block_hashtable);
- BUG_ON(NULL == next_block);
-
- l = btrfsic_block_link_hashtable_lookup(
- tmp_next_block_ctx.dev->bdev,
- tmp_next_block_ctx.dev_bytenr,
- state->latest_superblock->dev_state->
- bdev,
- state->latest_superblock->dev_bytenr,
- &state->block_link_hashtable);
- BUG_ON(NULL == l);
-
- ret = btrfsic_read_block(state, &tmp_next_block_ctx);
- if (ret < (int)PAGE_SIZE) {
- pr_info("btrfsic: read @logical %llu failed!\n",
- tmp_next_block_ctx.start);
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- kfree(selected_super);
- return -1;
- }
-
- ret = btrfsic_process_metablock(state,
- next_block,
- &tmp_next_block_ctx,
- BTRFS_MAX_LEVEL + 3, 1);
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- }
- }
-
- kfree(selected_super);
- return ret;
-}
-
-static int btrfsic_process_superblock_dev_mirror(
- struct btrfsic_state *state,
- struct btrfsic_dev_state *dev_state,
- struct btrfs_device *device,
- int superblock_mirror_num,
- struct btrfsic_dev_state **selected_dev_state,
- struct btrfs_super_block *selected_super)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- struct btrfs_super_block *super_tmp;
- u64 dev_bytenr;
- struct btrfsic_block *superblock_tmp;
- int pass;
- struct block_device *const superblock_bdev = device->bdev;
- struct page *page;
- struct address_space *mapping = superblock_bdev->bd_inode->i_mapping;
- int ret = 0;
-
- /* super block bytenr is always the unmapped device bytenr */
- dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
- if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
- return -1;
-
- page = read_cache_page_gfp(mapping, dev_bytenr >> PAGE_SHIFT, GFP_NOFS);
- if (IS_ERR(page))
- return -1;
-
- super_tmp = page_address(page);
-
- if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
- btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
- memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
- btrfs_super_nodesize(super_tmp) != state->metablock_size ||
- btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
- ret = 0;
- goto out;
- }
-
- superblock_tmp =
- btrfsic_block_hashtable_lookup(superblock_bdev,
- dev_bytenr,
- &state->block_hashtable);
- if (NULL == superblock_tmp) {
- superblock_tmp = btrfsic_block_alloc();
- if (NULL == superblock_tmp) {
- ret = -1;
- goto out;
- }
- /* for superblock, only the dev_bytenr makes sense */
- superblock_tmp->dev_bytenr = dev_bytenr;
- superblock_tmp->dev_state = dev_state;
- superblock_tmp->logical_bytenr = dev_bytenr;
- superblock_tmp->generation = btrfs_super_generation(super_tmp);
- superblock_tmp->is_metadata = 1;
- superblock_tmp->is_superblock = 1;
- superblock_tmp->is_iodone = 1;
- superblock_tmp->never_written = 0;
- superblock_tmp->mirror_num = 1 + superblock_mirror_num;
- if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- btrfs_info_in_rcu(fs_info,
- "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)",
- superblock_bdev,
- btrfs_dev_name(device), dev_bytenr,
- dev_state->bdev, dev_bytenr,
- superblock_mirror_num);
- list_add(&superblock_tmp->all_blocks_node,
- &state->all_blocks_list);
- btrfsic_block_hashtable_add(superblock_tmp,
- &state->block_hashtable);
- }
-
- /* select the one with the highest generation field */
- if (btrfs_super_generation(super_tmp) >
- state->max_superblock_generation ||
- 0 == state->max_superblock_generation) {
- memcpy(selected_super, super_tmp, sizeof(*selected_super));
- *selected_dev_state = dev_state;
- state->max_superblock_generation =
- btrfs_super_generation(super_tmp);
- state->latest_superblock = superblock_tmp;
- }
-
- for (pass = 0; pass < 3; pass++) {
- u64 next_bytenr;
- int num_copies;
- int mirror_num;
- const char *additional_string = NULL;
- struct btrfs_disk_key tmp_disk_key;
-
- tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
- tmp_disk_key.offset = 0;
- switch (pass) {
- case 0:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_ROOT_TREE_OBJECTID);
- additional_string = "initial root ";
- next_bytenr = btrfs_super_root(super_tmp);
- break;
- case 1:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_CHUNK_TREE_OBJECTID);
- additional_string = "initial chunk ";
- next_bytenr = btrfs_super_chunk_root(super_tmp);
- break;
- case 2:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_TREE_LOG_OBJECTID);
- additional_string = "initial log ";
- next_bytenr = btrfs_super_log_root(super_tmp);
- if (0 == next_bytenr)
- continue;
- break;
- }
-
- num_copies = btrfs_num_copies(fs_info, next_bytenr,
- state->metablock_size);
- if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
- pr_info("num_copies(log_bytenr=%llu) = %d\n",
- next_bytenr, num_copies);
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- struct btrfsic_block *next_block;
- struct btrfsic_block_data_ctx tmp_next_block_ctx;
- struct btrfsic_block_link *l;
-
- if (btrfsic_map_block(state, next_bytenr,
- state->metablock_size,
- &tmp_next_block_ctx,
- mirror_num)) {
- pr_info("btrfsic: btrfsic_map_block(bytenr @%llu, mirror %d) failed!\n",
- next_bytenr, mirror_num);
- ret = -1;
- goto out;
- }
-
- next_block = btrfsic_block_lookup_or_add(
- state, &tmp_next_block_ctx,
- additional_string, 1, 1, 0,
- mirror_num, NULL);
- if (NULL == next_block) {
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- ret = -1;
- goto out;
- }
-
- next_block->disk_key = tmp_disk_key;
- next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
- l = btrfsic_block_link_lookup_or_add(
- state, &tmp_next_block_ctx,
- next_block, superblock_tmp,
- BTRFSIC_GENERATION_UNKNOWN);
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- if (NULL == l) {
- ret = -1;
- goto out;
- }
- }
- }
- if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
- btrfsic_dump_tree_sub(state, superblock_tmp, 0);
-
-out:
- put_page(page);
- return ret;
-}
-
-static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
-{
- struct btrfsic_stack_frame *sf;
-
- sf = kzalloc(sizeof(*sf), GFP_NOFS);
- if (sf)
- sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
- return sf;
-}
-
-static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
-{
- BUG_ON(!(NULL == sf ||
- BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
- kfree(sf);
-}
-
-static noinline_for_stack int btrfsic_process_metablock(
- struct btrfsic_state *state,
- struct btrfsic_block *const first_block,
- struct btrfsic_block_data_ctx *const first_block_ctx,
- int first_limit_nesting, int force_iodone_flag)
-{
- struct btrfsic_stack_frame initial_stack_frame = { 0 };
- struct btrfsic_stack_frame *sf;
- struct btrfsic_stack_frame *next_stack;
- struct btrfs_header *const first_hdr =
- (struct btrfs_header *)first_block_ctx->datav[0];
-
- BUG_ON(!first_hdr);
- sf = &initial_stack_frame;
- sf->error = 0;
- sf->i = -1;
- sf->limit_nesting = first_limit_nesting;
- sf->block = first_block;
- sf->block_ctx = first_block_ctx;
- sf->next_block = NULL;
- sf->hdr = first_hdr;
- sf->prev = NULL;
-
-continue_with_new_stack_frame:
- sf->block->generation = btrfs_stack_header_generation(sf->hdr);
- if (0 == sf->hdr->level) {
- struct btrfs_leaf *const leafhdr =
- (struct btrfs_leaf *)sf->hdr;
-
- if (-1 == sf->i) {
- sf->nr = btrfs_stack_header_nritems(&leafhdr->header);
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("leaf %llu items %d generation %llu owner %llu\n",
- sf->block_ctx->start, sf->nr,
- btrfs_stack_header_generation(
- &leafhdr->header),
- btrfs_stack_header_owner(
- &leafhdr->header));
- }
-
-continue_with_current_leaf_stack_frame:
- if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
- sf->i++;
- sf->num_copies = 0;
- }
-
- if (sf->i < sf->nr) {
- struct btrfs_item disk_item;
- u32 disk_item_offset =
- (uintptr_t)(leafhdr->items + sf->i) -
- (uintptr_t)leafhdr;
- struct btrfs_disk_key *disk_key;
- u8 type;
- u32 item_offset;
- u32 item_size;
-
- if (disk_item_offset + sizeof(struct btrfs_item) >
- sf->block_ctx->len) {
-leaf_item_out_of_bounds_error:
- pr_info(
-			"btrfsic: leaf item out of bounds at logical %llu, dev %pg\n",
- sf->block_ctx->start,
- sf->block_ctx->dev->bdev);
- goto one_stack_frame_backwards;
- }
- btrfsic_read_from_block_data(sf->block_ctx,
- &disk_item,
- disk_item_offset,
- sizeof(struct btrfs_item));
- item_offset = btrfs_stack_item_offset(&disk_item);
- item_size = btrfs_stack_item_size(&disk_item);
- disk_key = &disk_item.key;
- type = btrfs_disk_key_type(disk_key);
-
- if (BTRFS_ROOT_ITEM_KEY == type) {
- struct btrfs_root_item root_item;
- u32 root_item_offset;
- u64 next_bytenr;
-
- root_item_offset = item_offset +
- offsetof(struct btrfs_leaf, items);
- if (root_item_offset + item_size >
- sf->block_ctx->len)
-					goto leaf_item_out_of_bounds_error;
- btrfsic_read_from_block_data(
- sf->block_ctx, &root_item,
- root_item_offset,
- item_size);
- next_bytenr = btrfs_root_bytenr(&root_item);
-
- sf->error =
- btrfsic_create_link_to_next_block(
- state,
- sf->block,
- sf->block_ctx,
- next_bytenr,
- sf->limit_nesting,
- &sf->next_block_ctx,
- &sf->next_block,
- force_iodone_flag,
- &sf->num_copies,
- &sf->mirror_num,
- disk_key,
- btrfs_root_generation(
- &root_item));
- if (sf->error)
- goto one_stack_frame_backwards;
-
- if (NULL != sf->next_block) {
- struct btrfs_header *const next_hdr =
- (struct btrfs_header *)
- sf->next_block_ctx.datav[0];
-
- next_stack =
- btrfsic_stack_frame_alloc();
- if (NULL == next_stack) {
- sf->error = -1;
- btrfsic_release_block_ctx(
- &sf->
- next_block_ctx);
- goto one_stack_frame_backwards;
- }
-
- next_stack->i = -1;
- next_stack->block = sf->next_block;
- next_stack->block_ctx =
- &sf->next_block_ctx;
- next_stack->next_block = NULL;
- next_stack->hdr = next_hdr;
- next_stack->limit_nesting =
- sf->limit_nesting - 1;
- next_stack->prev = sf;
- sf = next_stack;
- goto continue_with_new_stack_frame;
- }
- } else if (BTRFS_EXTENT_DATA_KEY == type &&
- state->include_extent_data) {
- sf->error = btrfsic_handle_extent_data(
- state,
- sf->block,
- sf->block_ctx,
- item_offset,
- force_iodone_flag);
- if (sf->error)
- goto one_stack_frame_backwards;
- }
-
- goto continue_with_current_leaf_stack_frame;
- }
- } else {
- struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
-
- if (-1 == sf->i) {
- sf->nr = btrfs_stack_header_nritems(&nodehdr->header);
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("node %llu level %d items %d generation %llu owner %llu\n",
- sf->block_ctx->start,
- nodehdr->header.level, sf->nr,
- btrfs_stack_header_generation(
- &nodehdr->header),
- btrfs_stack_header_owner(
- &nodehdr->header));
- }
-
-continue_with_current_node_stack_frame:
- if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
- sf->i++;
- sf->num_copies = 0;
- }
-
- if (sf->i < sf->nr) {
- struct btrfs_key_ptr key_ptr;
- u32 key_ptr_offset;
- u64 next_bytenr;
-
- key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) -
- (uintptr_t)nodehdr;
- if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
- sf->block_ctx->len) {
- pr_info(
-			"btrfsic: node item out of bounds at logical %llu, dev %pg\n",
- sf->block_ctx->start,
- sf->block_ctx->dev->bdev);
- goto one_stack_frame_backwards;
- }
- btrfsic_read_from_block_data(
- sf->block_ctx, &key_ptr, key_ptr_offset,
- sizeof(struct btrfs_key_ptr));
- next_bytenr = btrfs_stack_key_blockptr(&key_ptr);
-
- sf->error = btrfsic_create_link_to_next_block(
- state,
- sf->block,
- sf->block_ctx,
- next_bytenr,
- sf->limit_nesting,
- &sf->next_block_ctx,
- &sf->next_block,
- force_iodone_flag,
- &sf->num_copies,
- &sf->mirror_num,
- &key_ptr.key,
- btrfs_stack_key_generation(&key_ptr));
- if (sf->error)
- goto one_stack_frame_backwards;
-
- if (NULL != sf->next_block) {
- struct btrfs_header *const next_hdr =
- (struct btrfs_header *)
- sf->next_block_ctx.datav[0];
-
- next_stack = btrfsic_stack_frame_alloc();
- if (NULL == next_stack) {
- sf->error = -1;
- goto one_stack_frame_backwards;
- }
-
- next_stack->i = -1;
- next_stack->block = sf->next_block;
- next_stack->block_ctx = &sf->next_block_ctx;
- next_stack->next_block = NULL;
- next_stack->hdr = next_hdr;
- next_stack->limit_nesting =
- sf->limit_nesting - 1;
- next_stack->prev = sf;
- sf = next_stack;
- goto continue_with_new_stack_frame;
- }
-
- goto continue_with_current_node_stack_frame;
- }
- }
-
-one_stack_frame_backwards:
- if (NULL != sf->prev) {
- struct btrfsic_stack_frame *const prev = sf->prev;
-
- /* the one for the initial block is freed in the caller */
- btrfsic_release_block_ctx(sf->block_ctx);
-
- if (sf->error) {
- prev->error = sf->error;
- btrfsic_stack_frame_free(sf);
- sf = prev;
- goto one_stack_frame_backwards;
- }
-
- btrfsic_stack_frame_free(sf);
- sf = prev;
- goto continue_with_new_stack_frame;
- } else {
- BUG_ON(&initial_stack_frame != sf);
- }
-
- return sf->error;
-}
-
-static void btrfsic_read_from_block_data(
- struct btrfsic_block_data_ctx *block_ctx,
- void *dstv, u32 offset, size_t len)
-{
- size_t cur;
- size_t pgoff;
- char *kaddr;
- char *dst = (char *)dstv;
- size_t start_offset = offset_in_page(block_ctx->start);
- unsigned long i = (start_offset + offset) >> PAGE_SHIFT;
-
- WARN_ON(offset + len > block_ctx->len);
- pgoff = offset_in_page(start_offset + offset);
-
- while (len > 0) {
- cur = min(len, ((size_t)PAGE_SIZE - pgoff));
- BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE));
- kaddr = block_ctx->datav[i];
- memcpy(dst, kaddr + pgoff, cur);
-
- dst += cur;
- len -= cur;
- pgoff = 0;
- i++;
- }
-}
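
The helper above maps a byte offset within a block whose start need not be page aligned onto a page index and an in-page offset. A small sketch of that arithmetic, assuming a hypothetical PAGE_SZ of 4096 and an invented function name, is shown below as an editorial aid.

#include <stddef.h>
#include <stdint.h>

#define PAGE_SZ	4096u	/* hypothetical page size for the sketch */

static void map_offset(uint64_t block_start, uint32_t offset,
		       size_t *page_index, size_t *page_offset)
{
	size_t start_in_page = (size_t)(block_start % PAGE_SZ);

	*page_index  = (start_in_page + offset) / PAGE_SZ;
	*page_offset = (start_in_page + offset) % PAGE_SZ;
}

/*
 * Example: for a block that begins 512 bytes into its first page, a read
 * at offset 4000 lands at page index (512 + 4000) / 4096 = 1, in-page
 * offset 416.
 */
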
-
-static int btrfsic_create_link_to_next_block(
- struct btrfsic_state *state,
- struct btrfsic_block *block,
- struct btrfsic_block_data_ctx *block_ctx,
- u64 next_bytenr,
- int limit_nesting,
- struct btrfsic_block_data_ctx *next_block_ctx,
- struct btrfsic_block **next_blockp,
- int force_iodone_flag,
- int *num_copiesp, int *mirror_nump,
- struct btrfs_disk_key *disk_key,
- u64 parent_generation)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- struct btrfsic_block *next_block = NULL;
- int ret;
- struct btrfsic_block_link *l;
- int did_alloc_block_link;
- int block_was_created;
-
- *next_blockp = NULL;
- if (0 == *num_copiesp) {
- *num_copiesp = btrfs_num_copies(fs_info, next_bytenr,
- state->metablock_size);
- if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
- pr_info("num_copies(log_bytenr=%llu) = %d\n",
- next_bytenr, *num_copiesp);
- *mirror_nump = 1;
- }
-
- if (*mirror_nump > *num_copiesp)
- return 0;
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("btrfsic_create_link_to_next_block(mirror_num=%d)\n",
- *mirror_nump);
- ret = btrfsic_map_block(state, next_bytenr,
- state->metablock_size,
- next_block_ctx, *mirror_nump);
- if (ret) {
- pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
- next_bytenr, *mirror_nump);
- btrfsic_release_block_ctx(next_block_ctx);
- *next_blockp = NULL;
- return -1;
- }
-
- next_block = btrfsic_block_lookup_or_add(state,
- next_block_ctx, "referenced ",
- 1, force_iodone_flag,
- !force_iodone_flag,
- *mirror_nump,
- &block_was_created);
- if (NULL == next_block) {
- btrfsic_release_block_ctx(next_block_ctx);
- *next_blockp = NULL;
- return -1;
- }
- if (block_was_created) {
- l = NULL;
- next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
- } else {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
- if (next_block->logical_bytenr != next_bytenr &&
- !(!next_block->is_metadata &&
- 0 == next_block->logical_bytenr))
- pr_info(
-"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
- next_bytenr, next_block_ctx->dev->bdev,
- next_block_ctx->dev_bytenr, *mirror_nump,
- btrfsic_get_block_type(state,
- next_block),
- next_block->logical_bytenr);
- else
- pr_info(
- "referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n",
- next_bytenr, next_block_ctx->dev->bdev,
- next_block_ctx->dev_bytenr, *mirror_nump,
- btrfsic_get_block_type(state,
- next_block));
- }
- next_block->logical_bytenr = next_bytenr;
-
- next_block->mirror_num = *mirror_nump;
- l = btrfsic_block_link_hashtable_lookup(
- next_block_ctx->dev->bdev,
- next_block_ctx->dev_bytenr,
- block_ctx->dev->bdev,
- block_ctx->dev_bytenr,
- &state->block_link_hashtable);
- }
-
- next_block->disk_key = *disk_key;
- if (NULL == l) {
- l = btrfsic_block_link_alloc();
- if (NULL == l) {
- btrfsic_release_block_ctx(next_block_ctx);
- *next_blockp = NULL;
- return -1;
- }
-
- did_alloc_block_link = 1;
- l->block_ref_to = next_block;
- l->block_ref_from = block;
- l->ref_cnt = 1;
- l->parent_generation = parent_generation;
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_add_link(state, l);
-
- list_add(&l->node_ref_to, &block->ref_to_list);
- list_add(&l->node_ref_from, &next_block->ref_from_list);
-
- btrfsic_block_link_hashtable_add(l,
- &state->block_link_hashtable);
- } else {
- did_alloc_block_link = 0;
- if (0 == limit_nesting) {
- l->ref_cnt++;
- l->parent_generation = parent_generation;
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_add_link(state, l);
- }
- }
-
- if (limit_nesting > 0 && did_alloc_block_link) {
- ret = btrfsic_read_block(state, next_block_ctx);
- if (ret < (int)next_block_ctx->len) {
- pr_info("btrfsic: read block @logical %llu failed!\n",
- next_bytenr);
- btrfsic_release_block_ctx(next_block_ctx);
- *next_blockp = NULL;
- return -1;
- }
-
- *next_blockp = next_block;
- } else {
- *next_blockp = NULL;
- }
- (*mirror_nump)++;
-
- return 0;
-}
-
-static int btrfsic_handle_extent_data(
- struct btrfsic_state *state,
- struct btrfsic_block *block,
- struct btrfsic_block_data_ctx *block_ctx,
- u32 item_offset, int force_iodone_flag)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- struct btrfs_file_extent_item file_extent_item;
- u64 file_extent_item_offset;
- u64 next_bytenr;
- u64 num_bytes;
- u64 generation;
- struct btrfsic_block_link *l;
- int ret;
-
- file_extent_item_offset = offsetof(struct btrfs_leaf, items) +
- item_offset;
- if (file_extent_item_offset +
- offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
- block_ctx->len) {
-		pr_info("btrfsic: file item out of bounds at logical %llu, dev %pg\n",
- block_ctx->start, block_ctx->dev->bdev);
- return -1;
- }
-
- btrfsic_read_from_block_data(block_ctx, &file_extent_item,
- file_extent_item_offset,
- offsetof(struct btrfs_file_extent_item, disk_num_bytes));
- if (BTRFS_FILE_EXTENT_REG != file_extent_item.type ||
- btrfs_stack_file_extent_disk_bytenr(&file_extent_item) == 0) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
- pr_info("extent_data: type %u, disk_bytenr = %llu\n",
- file_extent_item.type,
- btrfs_stack_file_extent_disk_bytenr(
- &file_extent_item));
- return 0;
- }
-
- if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
- block_ctx->len) {
-		pr_info("btrfsic: file item out of bounds at logical %llu, dev %pg\n",
- block_ctx->start, block_ctx->dev->bdev);
- return -1;
- }
- btrfsic_read_from_block_data(block_ctx, &file_extent_item,
- file_extent_item_offset,
- sizeof(struct btrfs_file_extent_item));
- next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item);
- if (btrfs_stack_file_extent_compression(&file_extent_item) ==
- BTRFS_COMPRESS_NONE) {
- next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item);
- num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item);
- } else {
- num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item);
- }
- generation = btrfs_stack_file_extent_generation(&file_extent_item);
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
- pr_info("extent_data: type %u, disk_bytenr = %llu, offset = %llu, num_bytes = %llu\n",
- file_extent_item.type,
- btrfs_stack_file_extent_disk_bytenr(&file_extent_item),
- btrfs_stack_file_extent_offset(&file_extent_item),
- num_bytes);
- while (num_bytes > 0) {
- u32 chunk_len;
- int num_copies;
- int mirror_num;
-
- if (num_bytes > state->datablock_size)
- chunk_len = state->datablock_size;
- else
- chunk_len = num_bytes;
-
- num_copies = btrfs_num_copies(fs_info, next_bytenr,
- state->datablock_size);
- if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
- pr_info("num_copies(log_bytenr=%llu) = %d\n",
- next_bytenr, num_copies);
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- struct btrfsic_block_data_ctx next_block_ctx;
- struct btrfsic_block *next_block;
- int block_was_created;
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("btrfsic_handle_extent_data(mirror_num=%d)\n",
- mirror_num);
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
- pr_info("\tdisk_bytenr = %llu, num_bytes %u\n",
- next_bytenr, chunk_len);
- ret = btrfsic_map_block(state, next_bytenr,
- chunk_len, &next_block_ctx,
- mirror_num);
- if (ret) {
- pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
- next_bytenr, mirror_num);
- return -1;
- }
-
- next_block = btrfsic_block_lookup_or_add(
- state,
- &next_block_ctx,
- "referenced ",
- 0,
- force_iodone_flag,
- !force_iodone_flag,
- mirror_num,
- &block_was_created);
- if (NULL == next_block) {
- btrfsic_release_block_ctx(&next_block_ctx);
- return -1;
- }
- if (!block_was_created) {
- if ((state->print_mask &
- BTRFSIC_PRINT_MASK_VERBOSE) &&
- next_block->logical_bytenr != next_bytenr &&
- !(!next_block->is_metadata &&
- 0 == next_block->logical_bytenr)) {
- pr_info(
-"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n",
- next_bytenr,
- next_block_ctx.dev->bdev,
- next_block_ctx.dev_bytenr,
- mirror_num,
- next_block->logical_bytenr);
- }
- next_block->logical_bytenr = next_bytenr;
- next_block->mirror_num = mirror_num;
- }
-
- l = btrfsic_block_link_lookup_or_add(state,
- &next_block_ctx,
- next_block, block,
- generation);
- btrfsic_release_block_ctx(&next_block_ctx);
- if (NULL == l)
- return -1;
- }
-
- next_bytenr += chunk_len;
- num_bytes -= chunk_len;
- }
-
- return 0;
-}
-
-static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
- struct btrfsic_block_data_ctx *block_ctx_out,
- int mirror_num)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- int ret;
- u64 length;
- struct btrfs_io_context *bioc = NULL;
- struct btrfs_io_stripe smap, *map;
- struct btrfs_device *device;
-
- length = len;
- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, bytenr, &length, &bioc,
- NULL, &mirror_num, 0);
- if (ret) {
- block_ctx_out->start = 0;
- block_ctx_out->dev_bytenr = 0;
- block_ctx_out->len = 0;
- block_ctx_out->dev = NULL;
- block_ctx_out->datav = NULL;
- block_ctx_out->pagev = NULL;
- block_ctx_out->mem_to_free = NULL;
-
- return ret;
- }
-
- if (bioc)
- map = &bioc->stripes[0];
- else
- map = &smap;
-
- device = map->dev;
- if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ||
- !device->bdev || !device->name)
- block_ctx_out->dev = NULL;
- else
- block_ctx_out->dev = btrfsic_dev_state_lookup(
- device->bdev->bd_dev);
- block_ctx_out->dev_bytenr = map->physical;
- block_ctx_out->start = bytenr;
- block_ctx_out->len = len;
- block_ctx_out->datav = NULL;
- block_ctx_out->pagev = NULL;
- block_ctx_out->mem_to_free = NULL;
-
- kfree(bioc);
- if (NULL == block_ctx_out->dev) {
- ret = -ENXIO;
- pr_info("btrfsic: error, cannot lookup dev (#1)!\n");
- }
-
- return ret;
-}
-
-static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
-{
- if (block_ctx->mem_to_free) {
- unsigned int num_pages;
-
- BUG_ON(!block_ctx->datav);
- BUG_ON(!block_ctx->pagev);
- num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
- PAGE_SHIFT;
- /* Pages must be unmapped in reverse order */
- while (num_pages > 0) {
- num_pages--;
- if (block_ctx->datav[num_pages])
- block_ctx->datav[num_pages] = NULL;
- if (block_ctx->pagev[num_pages]) {
- __free_page(block_ctx->pagev[num_pages]);
- block_ctx->pagev[num_pages] = NULL;
- }
- }
-
- kfree(block_ctx->mem_to_free);
- block_ctx->mem_to_free = NULL;
- block_ctx->pagev = NULL;
- block_ctx->datav = NULL;
- }
-}
-
-static int btrfsic_read_block(struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *block_ctx)
-{
- unsigned int num_pages;
- unsigned int i;
- size_t size;
- u64 dev_bytenr;
- int ret;
-
- BUG_ON(block_ctx->datav);
- BUG_ON(block_ctx->pagev);
- BUG_ON(block_ctx->mem_to_free);
- if (!PAGE_ALIGNED(block_ctx->dev_bytenr)) {
- pr_info("btrfsic: read_block() with unaligned bytenr %llu\n",
- block_ctx->dev_bytenr);
- return -1;
- }
-
- num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
- PAGE_SHIFT;
- size = sizeof(*block_ctx->datav) + sizeof(*block_ctx->pagev);
- block_ctx->mem_to_free = kcalloc(num_pages, size, GFP_NOFS);
- if (!block_ctx->mem_to_free)
- return -ENOMEM;
- block_ctx->datav = block_ctx->mem_to_free;
- block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
- ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev);
- if (ret)
- return ret;
-
- dev_bytenr = block_ctx->dev_bytenr;
- for (i = 0; i < num_pages;) {
- struct bio *bio;
- unsigned int j;
-
- bio = bio_alloc(block_ctx->dev->bdev, num_pages - i,
- REQ_OP_READ, GFP_NOFS);
- bio->bi_iter.bi_sector = dev_bytenr >> SECTOR_SHIFT;
-
- for (j = i; j < num_pages; j++) {
- ret = bio_add_page(bio, block_ctx->pagev[j],
- PAGE_SIZE, 0);
- if (PAGE_SIZE != ret)
- break;
- }
- if (j == i) {
- pr_info("btrfsic: error, failed to add a single page!\n");
- return -1;
- }
- if (submit_bio_wait(bio)) {
- pr_info("btrfsic: read error at logical %llu dev %pg!\n",
- block_ctx->start, block_ctx->dev->bdev);
- bio_put(bio);
- return -1;
- }
- bio_put(bio);
- dev_bytenr += (j - i) * PAGE_SIZE;
- i = j;
- }
- for (i = 0; i < num_pages; i++)
- block_ctx->datav[i] = page_address(block_ctx->pagev[i]);
-
- return block_ctx->len;
-}
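
btrfsic_read_block() above fills an array of pages by issuing reads that advance the physical byte number in page-sized steps. Below is a rough userspace analogue of that loop using pread() on an ordinary file descriptor; it is an editorial sketch with invented names and minimal error handling, not the removed kernel code.

#include <stdint.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

#define PAGE_SZ	4096u	/* hypothetical page size for the sketch */

/*
 * Read num_pages page-sized chunks starting at dev_bytenr. Returns 0 on
 * success, -1 on error; on failure the caller frees whatever buffers
 * were already allocated.
 */
static int read_pages(int fd, uint64_t dev_bytenr, char **pagev,
		      unsigned int num_pages)
{
	unsigned int i;

	for (i = 0; i < num_pages; i++) {
		pagev[i] = malloc(PAGE_SZ);
		if (!pagev[i])
			return -1;
		if (pread(fd, pagev[i], PAGE_SZ,
			  (off_t)(dev_bytenr + (uint64_t)i * PAGE_SZ)) !=
		    (ssize_t)PAGE_SZ)
			return -1;
	}

	return 0;
}
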
-
-static void btrfsic_dump_database(struct btrfsic_state *state)
-{
- const struct btrfsic_block *b_all;
-
- BUG_ON(NULL == state);
-
- pr_info("all_blocks_list:\n");
- list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) {
- const struct btrfsic_block_link *l;
-
- pr_info("%c-block @%llu (%pg/%llu/%d)\n",
- btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->bdev,
- b_all->dev_bytenr, b_all->mirror_num);
-
- list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) {
- pr_info(
- " %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n",
- btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->bdev,
- b_all->dev_bytenr, b_all->mirror_num,
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
- }
-
- list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) {
- pr_info(
- " %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
- btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->bdev,
- b_all->dev_bytenr, b_all->mirror_num,
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_from),
- l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->bdev,
- l->block_ref_from->dev_bytenr,
- l->block_ref_from->mirror_num);
- }
-
- pr_info("\n");
- }
-}
-
-/*
- * Test whether the disk block contains a tree block (leaf or node)
- * (note that this test fails for the super block)
- */
-static noinline_for_stack int btrfsic_test_for_metadata(
- struct btrfsic_state *state,
- char **datav, unsigned int num_pages)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- struct btrfs_header *h;
- u8 csum[BTRFS_CSUM_SIZE];
- unsigned int i;
-
- if (num_pages * PAGE_SIZE < state->metablock_size)
- return 1; /* not metadata */
- num_pages = state->metablock_size >> PAGE_SHIFT;
- h = (struct btrfs_header *)datav[0];
-
- if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE))
- return 1;
-
- shash->tfm = fs_info->csum_shash;
- crypto_shash_init(shash);
-
- for (i = 0; i < num_pages; i++) {
- u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
- size_t sublen = i ? PAGE_SIZE :
- (PAGE_SIZE - BTRFS_CSUM_SIZE);
-
- crypto_shash_update(shash, data, sublen);
- }
- crypto_shash_final(shash, csum);
- if (memcmp(csum, h->csum, fs_info->csum_size))
- return 1;
-
- return 0; /* is metadata */
-}
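
The test above hashes the candidate block across its pages while skipping the checksum bytes stored at the start of the first page, then compares the result with the stored value. The editorial sketch below shows only the "skip the embedded checksum field" pattern; a trivial additive sum stands in for the real hash, and CSUM_BYTES and all names are assumptions.

#include <stddef.h>
#include <stdint.h>

#define PAGE_SZ		4096u	/* hypothetical page size */
#define CSUM_BYTES	32u	/* hypothetical size of the stored checksum */

static uint64_t sum_block(char * const *datav, unsigned int num_pages)
{
	uint64_t sum = 0;
	unsigned int i;

	for (i = 0; i < num_pages; i++) {
		const unsigned char *data = (const unsigned char *)datav[i];
		size_t len = PAGE_SZ;
		size_t j;

		if (i == 0) {		/* skip the embedded checksum area */
			data += CSUM_BYTES;
			len -= CSUM_BYTES;
		}
		for (j = 0; j < len; j++)
			sum += data[j];
	}

	return sum;
}
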
-
-static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr, char **mapped_datav,
- unsigned int num_pages,
- struct bio *bio, int *bio_is_patched,
- blk_opf_t submit_bio_bh_rw)
-{
- int is_metadata;
- struct btrfsic_block *block;
- struct btrfsic_block_data_ctx block_ctx;
- int ret;
- struct btrfsic_state *state = dev_state->state;
- struct block_device *bdev = dev_state->bdev;
- unsigned int processed_len;
-
- if (NULL != bio_is_patched)
- *bio_is_patched = 0;
-
-again:
- if (num_pages == 0)
- return;
-
- processed_len = 0;
- is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav,
- num_pages));
-
- block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
- &state->block_hashtable);
- if (NULL != block) {
- u64 bytenr = 0;
- struct btrfsic_block_link *l, *tmp;
-
- if (block->is_superblock) {
- bytenr = btrfs_super_bytenr((struct btrfs_super_block *)
- mapped_datav[0]);
- if (num_pages * PAGE_SIZE <
- BTRFS_SUPER_INFO_SIZE) {
- pr_info("btrfsic: cannot work with too short bios!\n");
- return;
- }
- is_metadata = 1;
- BUG_ON(!PAGE_ALIGNED(BTRFS_SUPER_INFO_SIZE));
- processed_len = BTRFS_SUPER_INFO_SIZE;
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
- pr_info("[before new superblock is written]:\n");
- btrfsic_dump_tree_sub(state, block, 0);
- }
- }
- if (is_metadata) {
- if (!block->is_superblock) {
- if (num_pages * PAGE_SIZE <
- state->metablock_size) {
- pr_info("btrfsic: cannot work with too short bios!\n");
- return;
- }
- processed_len = state->metablock_size;
- bytenr = btrfs_stack_header_bytenr(
- (struct btrfs_header *)
- mapped_datav[0]);
- btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
- dev_state,
- dev_bytenr);
- }
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
- if (block->logical_bytenr != bytenr &&
- !(!block->is_metadata &&
- block->logical_bytenr == 0))
- pr_info(
-"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
- bytenr, dev_state->bdev,
- dev_bytenr,
- block->mirror_num,
- btrfsic_get_block_type(state,
- block),
- block->logical_bytenr);
- else
- pr_info(
- "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
- bytenr, dev_state->bdev,
- dev_bytenr, block->mirror_num,
- btrfsic_get_block_type(state,
- block));
- }
- block->logical_bytenr = bytenr;
- } else {
- if (num_pages * PAGE_SIZE <
- state->datablock_size) {
- pr_info("btrfsic: cannot work with too short bios!\n");
- return;
- }
- processed_len = state->datablock_size;
- bytenr = block->logical_bytenr;
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info(
- "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
- bytenr, dev_state->bdev, dev_bytenr,
- block->mirror_num,
- btrfsic_get_block_type(state, block));
- }
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("ref_to_list: %cE, ref_from_list: %cE\n",
- list_empty(&block->ref_to_list) ? ' ' : '!',
- list_empty(&block->ref_from_list) ? ' ' : '!');
- if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
- pr_info(
-"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n",
- btrfsic_get_block_type(state, block), bytenr,
- dev_state->bdev, dev_bytenr, block->mirror_num,
- block->generation,
- btrfs_disk_key_objectid(&block->disk_key),
- block->disk_key.type,
- btrfs_disk_key_offset(&block->disk_key),
- btrfs_stack_header_generation(
- (struct btrfs_header *) mapped_datav[0]),
- state->max_superblock_generation);
- btrfsic_dump_tree(state);
- }
-
- if (!block->is_iodone && !block->never_written) {
- pr_info(
-"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n",
- btrfsic_get_block_type(state, block), bytenr,
- dev_state->bdev, dev_bytenr, block->mirror_num,
- block->generation,
- btrfs_stack_header_generation(
- (struct btrfs_header *)
- mapped_datav[0]));
- /* it would not be safe to go on */
- btrfsic_dump_tree(state);
- goto continue_loop;
- }
-
-		/*
-		 * Clear all references of this block. Do not free the block
-		 * itself even if it is not referenced anymore, because it
-		 * still carries valuable information, e.g. whether it was
-		 * ever written and whether its IO completed.
-		 */
- list_for_each_entry_safe(l, tmp, &block->ref_to_list,
- node_ref_to) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_rem_link(state, l);
- l->ref_cnt--;
- if (0 == l->ref_cnt) {
- list_del(&l->node_ref_to);
- list_del(&l->node_ref_from);
- btrfsic_block_link_hashtable_remove(l);
- btrfsic_block_link_free(l);
- }
- }
-
- block_ctx.dev = dev_state;
- block_ctx.dev_bytenr = dev_bytenr;
- block_ctx.start = bytenr;
- block_ctx.len = processed_len;
- block_ctx.pagev = NULL;
- block_ctx.mem_to_free = NULL;
- block_ctx.datav = mapped_datav;
-
- if (is_metadata || state->include_extent_data) {
- block->never_written = 0;
- block->iodone_w_error = 0;
- if (NULL != bio) {
- block->is_iodone = 0;
- BUG_ON(NULL == bio_is_patched);
- if (!*bio_is_patched) {
- block->orig_bio_private =
- bio->bi_private;
- block->orig_bio_end_io =
- bio->bi_end_io;
- block->next_in_same_bio = NULL;
- bio->bi_private = block;
- bio->bi_end_io = btrfsic_bio_end_io;
- *bio_is_patched = 1;
- } else {
- struct btrfsic_block *chained_block =
- (struct btrfsic_block *)
- bio->bi_private;
-
- BUG_ON(NULL == chained_block);
- block->orig_bio_private =
- chained_block->orig_bio_private;
- block->orig_bio_end_io =
- chained_block->orig_bio_end_io;
- block->next_in_same_bio = chained_block;
- bio->bi_private = block;
- }
- } else {
- block->is_iodone = 1;
- block->orig_bio_private = NULL;
- block->orig_bio_end_io = NULL;
- block->next_in_same_bio = NULL;
- }
- }
-
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = submit_bio_bh_rw;
- if (is_metadata) {
- block->logical_bytenr = bytenr;
- block->is_metadata = 1;
- if (block->is_superblock) {
- BUG_ON(PAGE_SIZE !=
- BTRFS_SUPER_INFO_SIZE);
- ret = btrfsic_process_written_superblock(
- state,
- block,
- (struct btrfs_super_block *)
- mapped_datav[0]);
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
- pr_info("[after new superblock is written]:\n");
- btrfsic_dump_tree_sub(state, block, 0);
- }
- } else {
- block->mirror_num = 0; /* unknown */
- ret = btrfsic_process_metablock(
- state,
- block,
- &block_ctx,
- 0, 0);
- }
- if (ret)
- pr_info("btrfsic: btrfsic_process_metablock(root @%llu) failed!\n",
- dev_bytenr);
- } else {
- block->is_metadata = 0;
- block->mirror_num = 0; /* unknown */
- block->generation = BTRFSIC_GENERATION_UNKNOWN;
- if (!state->include_extent_data
- && list_empty(&block->ref_from_list)) {
-				/*
-				 * The disk block is overwritten with extent
-				 * data (not metadata) and we are configured
-				 * not to include extent data: take the
-				 * chance and free the block's memory.
-				 */
- btrfsic_block_hashtable_remove(block);
- list_del(&block->all_blocks_node);
- btrfsic_block_free(block);
- }
- }
- btrfsic_release_block_ctx(&block_ctx);
- } else {
- /* block has not been found in hash table */
- u64 bytenr;
-
- if (!is_metadata) {
- processed_len = state->datablock_size;
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info(
- "written block (%pg/%llu/?) !found in hash table, D\n",
- dev_state->bdev, dev_bytenr);
- if (!state->include_extent_data) {
- /* ignore that written D block */
- goto continue_loop;
- }
-
-			/*
-			 * This is getting ugly for the include_extent_data
-			 * case...
-			 */
- bytenr = 0; /* unknown */
- } else {
- processed_len = state->metablock_size;
- bytenr = btrfs_stack_header_bytenr(
- (struct btrfs_header *)
- mapped_datav[0]);
- btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
- dev_bytenr);
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info(
- "written block @%llu (%pg/%llu/?) !found in hash table, M\n",
- bytenr, dev_state->bdev, dev_bytenr);
- }
-
- block_ctx.dev = dev_state;
- block_ctx.dev_bytenr = dev_bytenr;
- block_ctx.start = bytenr;
- block_ctx.len = processed_len;
- block_ctx.pagev = NULL;
- block_ctx.mem_to_free = NULL;
- block_ctx.datav = mapped_datav;
-
- block = btrfsic_block_alloc();
- if (NULL == block) {
- btrfsic_release_block_ctx(&block_ctx);
- goto continue_loop;
- }
- block->dev_state = dev_state;
- block->dev_bytenr = dev_bytenr;
- block->logical_bytenr = bytenr;
- block->is_metadata = is_metadata;
- block->never_written = 0;
- block->iodone_w_error = 0;
- block->mirror_num = 0; /* unknown */
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = submit_bio_bh_rw;
- if (NULL != bio) {
- block->is_iodone = 0;
- BUG_ON(NULL == bio_is_patched);
- if (!*bio_is_patched) {
- block->orig_bio_private = bio->bi_private;
- block->orig_bio_end_io = bio->bi_end_io;
- block->next_in_same_bio = NULL;
- bio->bi_private = block;
- bio->bi_end_io = btrfsic_bio_end_io;
- *bio_is_patched = 1;
- } else {
- struct btrfsic_block *chained_block =
- (struct btrfsic_block *)
- bio->bi_private;
-
- BUG_ON(NULL == chained_block);
- block->orig_bio_private =
- chained_block->orig_bio_private;
- block->orig_bio_end_io =
- chained_block->orig_bio_end_io;
- block->next_in_same_bio = chained_block;
- bio->bi_private = block;
- }
- } else {
- block->is_iodone = 1;
- block->orig_bio_private = NULL;
- block->orig_bio_end_io = NULL;
- block->next_in_same_bio = NULL;
- }
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("new written %c-block @%llu (%pg/%llu/%d)\n",
- is_metadata ? 'M' : 'D',
- block->logical_bytenr, block->dev_state->bdev,
- block->dev_bytenr, block->mirror_num);
- list_add(&block->all_blocks_node, &state->all_blocks_list);
- btrfsic_block_hashtable_add(block, &state->block_hashtable);
-
- if (is_metadata) {
- ret = btrfsic_process_metablock(state, block,
- &block_ctx, 0, 0);
- if (ret)
- pr_info("btrfsic: process_metablock(root @%llu) failed!\n",
- dev_bytenr);
- }
- btrfsic_release_block_ctx(&block_ctx);
- }
-
-continue_loop:
- BUG_ON(!processed_len);
- dev_bytenr += processed_len;
- mapped_datav += processed_len >> PAGE_SHIFT;
- num_pages -= processed_len >> PAGE_SHIFT;
- goto again;
-}
-
-static void btrfsic_bio_end_io(struct bio *bp)
-{
- struct btrfsic_block *block = bp->bi_private;
- int iodone_w_error;
-
-	/*
-	 * The mutex is not held! This is not safe if the IO has not yet
-	 * completed on umount.
-	 */
- iodone_w_error = 0;
- if (bp->bi_status)
- iodone_w_error = 1;
-
- BUG_ON(NULL == block);
- bp->bi_private = block->orig_bio_private;
- bp->bi_end_io = block->orig_bio_end_io;
-
- do {
- struct btrfsic_block *next_block;
- struct btrfsic_dev_state *const dev_state = block->dev_state;
-
- if ((dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n",
- bp->bi_status,
- btrfsic_get_block_type(dev_state->state, block),
- block->logical_bytenr, dev_state->bdev,
- block->dev_bytenr, block->mirror_num);
- next_block = block->next_in_same_bio;
- block->iodone_w_error = iodone_w_error;
- if (block->submit_bio_bh_rw & REQ_PREFLUSH) {
- dev_state->last_flush_gen++;
- if ((dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bio_end_io() new %pg flush_gen=%llu\n",
- dev_state->bdev,
- dev_state->last_flush_gen);
- }
- if (block->submit_bio_bh_rw & REQ_FUA)
- block->flush_gen = 0; /* FUA completed means block is
- * on disk */
- block->is_iodone = 1; /* for FLUSH, this releases the block */
- block = next_block;
- } while (NULL != block);
-
- bp->bi_end_io(bp);
-}
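
btrfsic_bio_end_io() above is the unwinding half of a callback-interception pattern: the original bi_private and bi_end_io were saved when the bio was patched, and here they are restored before the original completion handler is chained to. A minimal, hypothetical sketch of that pattern in plain C follows; struct request, struct tracker and the function names are invented for illustration.

#include <stdio.h>

struct request {
	void (*end_io)(struct request *req);
	void *private;
};

struct tracker {
	void (*orig_end_io)(struct request *req);
	void *orig_private;
};

static void tracking_end_io(struct request *req)
{
	struct tracker *t = req->private;

	printf("request completed, doing integrity bookkeeping\n");

	/* restore the original completion state and chain to it */
	req->private = t->orig_private;
	req->end_io = t->orig_end_io;
	req->end_io(req);
}

static void patch_request(struct request *req, struct tracker *t)
{
	/* save the original handler, then install the wrapper */
	t->orig_end_io = req->end_io;
	t->orig_private = req->private;
	req->private = t;
	req->end_io = tracking_end_io;
}
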
-
-static int btrfsic_process_written_superblock(
- struct btrfsic_state *state,
- struct btrfsic_block *const superblock,
- struct btrfs_super_block *const super_hdr)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- int pass;
-
- superblock->generation = btrfs_super_generation(super_hdr);
- if (!(superblock->generation > state->max_superblock_generation ||
- 0 == state->max_superblock_generation)) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- pr_info(
- "btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n",
- superblock->logical_bytenr,
- superblock->dev_state->bdev,
- superblock->dev_bytenr, superblock->mirror_num,
- btrfs_super_generation(super_hdr),
- state->max_superblock_generation);
- } else {
- if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- pr_info(
- "btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n",
- superblock->logical_bytenr,
- superblock->dev_state->bdev,
- superblock->dev_bytenr, superblock->mirror_num,
- btrfs_super_generation(super_hdr),
- state->max_superblock_generation);
-
- state->max_superblock_generation =
- btrfs_super_generation(super_hdr);
- state->latest_superblock = superblock;
- }
-
- for (pass = 0; pass < 3; pass++) {
- int ret;
- u64 next_bytenr;
- struct btrfsic_block *next_block;
- struct btrfsic_block_data_ctx tmp_next_block_ctx;
- struct btrfsic_block_link *l;
- int num_copies;
- int mirror_num;
- const char *additional_string = NULL;
- struct btrfs_disk_key tmp_disk_key = {0};
-
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_ROOT_ITEM_KEY);
- btrfs_set_disk_key_objectid(&tmp_disk_key, 0);
-
- switch (pass) {
- case 0:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_ROOT_TREE_OBJECTID);
- additional_string = "root ";
- next_bytenr = btrfs_super_root(super_hdr);
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("root@%llu\n", next_bytenr);
- break;
- case 1:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_CHUNK_TREE_OBJECTID);
- additional_string = "chunk ";
- next_bytenr = btrfs_super_chunk_root(super_hdr);
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("chunk@%llu\n", next_bytenr);
- break;
- case 2:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_TREE_LOG_OBJECTID);
- additional_string = "log ";
- next_bytenr = btrfs_super_log_root(super_hdr);
- if (0 == next_bytenr)
- continue;
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("log@%llu\n", next_bytenr);
- break;
- }
-
- num_copies = btrfs_num_copies(fs_info, next_bytenr,
- BTRFS_SUPER_INFO_SIZE);
- if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
- pr_info("num_copies(log_bytenr=%llu) = %d\n",
- next_bytenr, num_copies);
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- int was_created;
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("btrfsic_process_written_superblock(mirror_num=%d)\n", mirror_num);
- ret = btrfsic_map_block(state, next_bytenr,
- BTRFS_SUPER_INFO_SIZE,
- &tmp_next_block_ctx,
- mirror_num);
- if (ret) {
- pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
- next_bytenr, mirror_num);
- return -1;
- }
-
- next_block = btrfsic_block_lookup_or_add(
- state,
- &tmp_next_block_ctx,
- additional_string,
- 1, 0, 1,
- mirror_num,
- &was_created);
- if (NULL == next_block) {
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- return -1;
- }
-
- next_block->disk_key = tmp_disk_key;
- if (was_created)
- next_block->generation =
- BTRFSIC_GENERATION_UNKNOWN;
- l = btrfsic_block_link_lookup_or_add(
- state,
- &tmp_next_block_ctx,
- next_block,
- superblock,
- BTRFSIC_GENERATION_UNKNOWN);
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- if (NULL == l)
- return -1;
- }
- }
-
- if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)))
- btrfsic_dump_tree(state);
-
- return 0;
-}
-
-static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
- struct btrfsic_block *const block,
- int recursion_level)
-{
- const struct btrfsic_block_link *l;
- int ret = 0;
-
- if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
-		/*
-		 * Note that this situation can happen and does not
-		 * indicate an error in regular cases. It happens
-		 * when disk blocks are freed and later reused.
-		 * The check-integrity module is not aware of any
-		 * block free operations; it only recognizes block
-		 * write operations. Therefore it keeps the linkage
-		 * information for a block until that block is
-		 * rewritten. This can temporarily cause incorrect
-		 * and even circular linkage information, which
-		 * causes no harm unless such blocks are referenced
-		 * by the most recent super block.
-		 */
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("btrfsic: abort cyclic linkage (case 1).\n");
-
- return ret;
- }
-
- /*
- * This algorithm is recursive because the amount of used stack
- * space is very small and the max recursion depth is limited.
- */
- list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info(
- "rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n",
- recursion_level,
- btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->bdev,
- block->dev_bytenr, block->mirror_num,
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
- if (l->block_ref_to->never_written) {
- pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n",
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
- ret = -1;
- } else if (!l->block_ref_to->is_iodone) {
- pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n",
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
- ret = -1;
- } else if (l->block_ref_to->iodone_w_error) {
- pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n",
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
- ret = -1;
- } else if (l->parent_generation !=
- l->block_ref_to->generation &&
- BTRFSIC_GENERATION_UNKNOWN !=
- l->parent_generation &&
- BTRFSIC_GENERATION_UNKNOWN !=
- l->block_ref_to->generation) {
- pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n",
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num,
- l->block_ref_to->generation,
- l->parent_generation);
- ret = -1;
- } else if (l->block_ref_to->flush_gen >
- l->block_ref_to->dev_state->last_flush_gen) {
- pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n",
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num, block->flush_gen,
- l->block_ref_to->dev_state->last_flush_gen);
- ret = -1;
- } else if (-1 == btrfsic_check_all_ref_blocks(state,
- l->block_ref_to,
- recursion_level +
- 1)) {
- ret = -1;
- }
- }
-
- return ret;
-}
-
-static int btrfsic_is_block_ref_by_superblock(
- const struct btrfsic_state *state,
- const struct btrfsic_block *block,
- int recursion_level)
-{
- const struct btrfsic_block_link *l;
-
- if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
- /* refer to comment at "abort cyclic linkage (case 1)" */
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("btrfsic: abort cyclic linkage (case 2).\n");
-
- return 0;
- }
-
- /*
- * This algorithm is recursive because the amount of used stack space
- * is very small and the max recursion depth is limited.
- */
- list_for_each_entry(l, &block->ref_from_list, node_ref_from) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info(
- "rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
- recursion_level,
- btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->bdev,
- block->dev_bytenr, block->mirror_num,
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_from),
- l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->bdev,
- l->block_ref_from->dev_bytenr,
- l->block_ref_from->mirror_num);
- if (l->block_ref_from->is_superblock &&
- state->latest_superblock->dev_bytenr ==
- l->block_ref_from->dev_bytenr &&
- state->latest_superblock->dev_state->bdev ==
- l->block_ref_from->dev_state->bdev)
- return 1;
- else if (btrfsic_is_block_ref_by_superblock(state,
- l->block_ref_from,
- recursion_level +
- 1))
- return 1;
- }
-
- return 0;
-}
-
-static void btrfsic_print_add_link(const struct btrfsic_state *state,
- const struct btrfsic_block_link *l)
-{
- pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_from),
- l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->bdev,
- l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
-}
-
-static void btrfsic_print_rem_link(const struct btrfsic_state *state,
- const struct btrfsic_block_link *l)
-{
- pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_from),
- l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->bdev,
- l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
-}
-
-static char btrfsic_get_block_type(const struct btrfsic_state *state,
- const struct btrfsic_block *block)
-{
- if (block->is_superblock &&
- state->latest_superblock->dev_bytenr == block->dev_bytenr &&
- state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
- return 'S';
- else if (block->is_superblock)
- return 's';
- else if (block->is_metadata)
- return 'M';
- else
- return 'D';
-}
-
-static void btrfsic_dump_tree(const struct btrfsic_state *state)
-{
- btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
-}
-
-static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
- const struct btrfsic_block *block,
- int indent_level)
-{
- const struct btrfsic_block_link *l;
- int indent_add;
- static char buf[80];
- int cursor_position;
-
- /*
- * Should better fill an on-stack buffer with a complete line and
- * dump it at once when it is time to print a newline character.
- */
-
- /*
- * This algorithm is recursive because the amount of used stack space
- * is very small and the max recursion depth is limited.
- */
- indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)",
- btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->bdev,
- block->dev_bytenr, block->mirror_num);
- if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
- printk("[...]\n");
- return;
- }
- printk(buf);
- indent_level += indent_add;
- if (list_empty(&block->ref_to_list)) {
- printk("\n");
- return;
- }
- if (block->mirror_num > 1 &&
- !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
- printk(" [...]\n");
- return;
- }
-
- cursor_position = indent_level;
- list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
- while (cursor_position < indent_level) {
- printk(" ");
- cursor_position++;
- }
- if (l->ref_cnt > 1)
- indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
- else
- indent_add = sprintf(buf, " --> ");
- if (indent_level + indent_add >
- BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
- printk("[...]\n");
- cursor_position = 0;
- continue;
- }
-
- printk(buf);
-
- btrfsic_dump_tree_sub(state, l->block_ref_to,
- indent_level + indent_add);
- cursor_position = 0;
- }
-}
-
-static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
- struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *next_block_ctx,
- struct btrfsic_block *next_block,
- struct btrfsic_block *from_block,
- u64 parent_generation)
-{
- struct btrfsic_block_link *l;
-
- l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
- next_block_ctx->dev_bytenr,
- from_block->dev_state->bdev,
- from_block->dev_bytenr,
- &state->block_link_hashtable);
- if (NULL == l) {
- l = btrfsic_block_link_alloc();
- if (!l)
- return NULL;
-
- l->block_ref_to = next_block;
- l->block_ref_from = from_block;
- l->ref_cnt = 1;
- l->parent_generation = parent_generation;
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_add_link(state, l);
-
- list_add(&l->node_ref_to, &from_block->ref_to_list);
- list_add(&l->node_ref_from, &next_block->ref_from_list);
-
- btrfsic_block_link_hashtable_add(l,
- &state->block_link_hashtable);
- } else {
- l->ref_cnt++;
- l->parent_generation = parent_generation;
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_add_link(state, l);
- }
-
- return l;
-}
-
-static struct btrfsic_block *btrfsic_block_lookup_or_add(
- struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *block_ctx,
- const char *additional_string,
- int is_metadata,
- int is_iodone,
- int never_written,
- int mirror_num,
- int *was_created)
-{
- struct btrfsic_block *block;
-
- block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
- block_ctx->dev_bytenr,
- &state->block_hashtable);
- if (NULL == block) {
- struct btrfsic_dev_state *dev_state;
-
- block = btrfsic_block_alloc();
- if (!block)
- return NULL;
-
- dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev);
- if (NULL == dev_state) {
- pr_info("btrfsic: error, lookup dev_state failed!\n");
- btrfsic_block_free(block);
- return NULL;
- }
- block->dev_state = dev_state;
- block->dev_bytenr = block_ctx->dev_bytenr;
- block->logical_bytenr = block_ctx->start;
- block->is_metadata = is_metadata;
- block->is_iodone = is_iodone;
- block->never_written = never_written;
- block->mirror_num = mirror_num;
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n",
- additional_string,
- btrfsic_get_block_type(state, block),
- block->logical_bytenr, dev_state->bdev,
- block->dev_bytenr, mirror_num);
- list_add(&block->all_blocks_node, &state->all_blocks_list);
- btrfsic_block_hashtable_add(block, &state->block_hashtable);
- if (NULL != was_created)
- *was_created = 1;
- } else {
- if (NULL != was_created)
- *was_created = 0;
- }
-
- return block;
-}
-
-static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
- u64 bytenr,
- struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- struct btrfsic_block_data_ctx block_ctx;
- int num_copies;
- int mirror_num;
- int match = 0;
- int ret;
-
- num_copies = btrfs_num_copies(fs_info, bytenr, state->metablock_size);
-
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- ret = btrfsic_map_block(state, bytenr, state->metablock_size,
- &block_ctx, mirror_num);
- if (ret) {
- pr_info("btrfsic: btrfsic_map_block(logical @%llu, mirror %d) failed!\n",
- bytenr, mirror_num);
- continue;
- }
-
- if (dev_state->bdev == block_ctx.dev->bdev &&
- dev_bytenr == block_ctx.dev_bytenr) {
- match++;
- btrfsic_release_block_ctx(&block_ctx);
- break;
- }
- btrfsic_release_block_ctx(&block_ctx);
- }
-
- if (WARN_ON(!match)) {
- pr_info(
-"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n",
- bytenr, dev_state->bdev, dev_bytenr);
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- ret = btrfsic_map_block(state, bytenr,
- state->metablock_size,
- &block_ctx, mirror_num);
- if (ret)
- continue;
-
- pr_info("read logical bytenr @%llu maps to (%pg/%llu/%d)\n",
- bytenr, block_ctx.dev->bdev,
- block_ctx.dev_bytenr, mirror_num);
- }
- }
-}
-
-static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev)
-{
- return btrfsic_dev_state_hashtable_lookup(dev,
- &btrfsic_dev_state_hashtable);
-}
-
-static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
-{
- unsigned int segs = bio_segments(bio);
- u64 dev_bytenr = 512 * bio->bi_iter.bi_sector;
- u64 cur_bytenr = dev_bytenr;
- struct bvec_iter iter;
- struct bio_vec bvec;
- char **mapped_datav;
- int bio_is_patched = 0;
- int i = 0;
-
- if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info(
-"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
- bio_op(bio), bio->bi_opf, segs,
- bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
-
- mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS);
- if (!mapped_datav)
- return;
-
- bio_for_each_segment(bvec, bio, iter) {
- BUG_ON(bvec.bv_len != PAGE_SIZE);
- mapped_datav[i] = page_address(bvec.bv_page);
- i++;
-
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
- pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
- i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
- cur_bytenr += bvec.bv_len;
- }
-
- btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs,
- bio, &bio_is_patched, bio->bi_opf);
- kfree(mapped_datav);
-}
-
-static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
-{
- if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
- bio_op(bio), bio->bi_opf, bio->bi_bdev);
-
- if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
- struct btrfsic_block *const block =
- &dev_state->dummy_block_for_bio_bh_flush;
-
- block->is_iodone = 0;
- block->never_written = 0;
- block->iodone_w_error = 0;
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = bio->bi_opf;
- block->orig_bio_private = bio->bi_private;
- block->orig_bio_end_io = bio->bi_end_io;
- block->next_in_same_bio = NULL;
- bio->bi_private = block;
- bio->bi_end_io = btrfsic_bio_end_io;
- } else if ((dev_state->state->print_mask &
- (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
- BTRFSIC_PRINT_MASK_VERBOSE))) {
- pr_info(
-"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
- dev_state->bdev);
- }
-}
-
-void btrfsic_check_bio(struct bio *bio)
-{
- struct btrfsic_dev_state *dev_state;
-
- if (!btrfsic_is_initialized)
- return;
-
- /*
- * We can be called before btrfsic_mount, so there might not be a
- * dev_state.
- */
- dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
- mutex_lock(&btrfsic_mutex);
- if (dev_state) {
- if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio))
- btrfsic_check_write_bio(bio, dev_state);
- else if (bio->bi_opf & REQ_PREFLUSH)
- btrfsic_check_flush_bio(bio, dev_state);
- }
- mutex_unlock(&btrfsic_mutex);
-}
-
-int btrfsic_mount(struct btrfs_fs_info *fs_info,
- struct btrfs_fs_devices *fs_devices,
- int including_extent_data, u32 print_mask)
-{
- int ret;
- struct btrfsic_state *state;
- struct list_head *dev_head = &fs_devices->devices;
- struct btrfs_device *device;
-
- if (!PAGE_ALIGNED(fs_info->nodesize)) {
- pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n",
- fs_info->nodesize, PAGE_SIZE);
- return -1;
- }
- if (!PAGE_ALIGNED(fs_info->sectorsize)) {
- pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n",
- fs_info->sectorsize, PAGE_SIZE);
- return -1;
- }
- state = kvzalloc(sizeof(*state), GFP_KERNEL);
- if (!state)
- return -ENOMEM;
-
- if (!btrfsic_is_initialized) {
- mutex_init(&btrfsic_mutex);
- btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
- btrfsic_is_initialized = 1;
- }
- mutex_lock(&btrfsic_mutex);
- state->fs_info = fs_info;
- state->print_mask = print_mask;
- state->include_extent_data = including_extent_data;
- state->metablock_size = fs_info->nodesize;
- state->datablock_size = fs_info->sectorsize;
- INIT_LIST_HEAD(&state->all_blocks_list);
- btrfsic_block_hashtable_init(&state->block_hashtable);
- btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
- state->max_superblock_generation = 0;
- state->latest_superblock = NULL;
-
- list_for_each_entry(device, dev_head, dev_list) {
- struct btrfsic_dev_state *ds;
-
- if (!device->bdev || !device->name)
- continue;
-
- ds = btrfsic_dev_state_alloc();
- if (NULL == ds) {
- mutex_unlock(&btrfsic_mutex);
- return -ENOMEM;
- }
- ds->bdev = device->bdev;
- ds->state = state;
- btrfsic_dev_state_hashtable_add(ds,
- &btrfsic_dev_state_hashtable);
- }
-
- ret = btrfsic_process_superblock(state, fs_devices);
- if (0 != ret) {
- mutex_unlock(&btrfsic_mutex);
- btrfsic_unmount(fs_devices);
- return ret;
- }
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
- btrfsic_dump_database(state);
- if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
- btrfsic_dump_tree(state);
-
- mutex_unlock(&btrfsic_mutex);
- return 0;
-}
-
-void btrfsic_unmount(struct btrfs_fs_devices *fs_devices)
-{
- struct btrfsic_block *b_all, *tmp_all;
- struct btrfsic_state *state;
- struct list_head *dev_head = &fs_devices->devices;
- struct btrfs_device *device;
-
- if (!btrfsic_is_initialized)
- return;
-
- mutex_lock(&btrfsic_mutex);
-
- state = NULL;
- list_for_each_entry(device, dev_head, dev_list) {
- struct btrfsic_dev_state *ds;
-
- if (!device->bdev || !device->name)
- continue;
-
- ds = btrfsic_dev_state_hashtable_lookup(
- device->bdev->bd_dev,
- &btrfsic_dev_state_hashtable);
- if (NULL != ds) {
- state = ds->state;
- btrfsic_dev_state_hashtable_remove(ds);
- btrfsic_dev_state_free(ds);
- }
- }
-
- if (NULL == state) {
- pr_info("btrfsic: error, cannot find state information on umount!\n");
- mutex_unlock(&btrfsic_mutex);
- return;
- }
-
- /*
- * Don't care about keeping the lists' state up to date,
- * just free all memory that was allocated dynamically.
- * Free the blocks and the block_links.
- */
- list_for_each_entry_safe(b_all, tmp_all, &state->all_blocks_list,
- all_blocks_node) {
- struct btrfsic_block_link *l, *tmp;
-
- list_for_each_entry_safe(l, tmp, &b_all->ref_to_list,
- node_ref_to) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_rem_link(state, l);
-
- l->ref_cnt--;
- if (0 == l->ref_cnt)
- btrfsic_block_link_free(l);
- }
-
- if (b_all->is_iodone || b_all->never_written)
- btrfsic_block_free(b_all);
- else
- pr_info(
-"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n",
- btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->bdev,
- b_all->dev_bytenr, b_all->mirror_num);
- }
-
- mutex_unlock(&btrfsic_mutex);
-
- kvfree(state);
-}
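
The two checkers deleted above, btrfsic_check_all_ref_blocks() and btrfsic_is_block_ref_by_superblock(), walk the block link graph recursively and simply give up once the depth reaches 3 + BTRFS_MAX_LEVEL, because stale links left behind by freed-and-reused blocks can form temporary cycles. A minimal userspace sketch of that depth-capped walk follows; the types and names are illustrative, not the kernel's.

/* Depth-capped recursive walk over a block reference graph (sketch only). */
#include <stdbool.h>
#include <stddef.h>

#define MAX_DEPTH 11    /* stands in for 3 + BTRFS_MAX_LEVEL (8) */

struct block {
        bool never_written;
        size_t nr_refs;
        struct block **refs;            /* blocks this block references */
};

/* Return 0 if every reachable block has been written, -1 otherwise. */
static int check_refs(const struct block *b, int depth)
{
        int ret = 0;

        /* Treat too-deep chains as stale, possibly cyclic linkage, not as an error. */
        if (depth >= MAX_DEPTH)
                return 0;

        for (size_t i = 0; i < b->nr_refs; i++) {
                const struct block *to = b->refs[i];

                if (to->never_written || check_refs(to, depth + 1) == -1)
                        ret = -1;
        }
        return ret;
}

int main(void)
{
        struct block leaf = { .never_written = false, .nr_refs = 0, .refs = NULL };
        struct block *children[] = { &leaf };
        struct block root = { .never_written = false, .nr_refs = 1, .refs = children };

        return check_refs(&root, 0) ? 1 : 0;
}
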
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
deleted file mode 100644
index e4c8aed7996f..000000000000
--- a/fs/btrfs/check-integrity.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) STRATO AG 2011. All rights reserved.
- */
-
-#ifndef BTRFS_CHECK_INTEGRITY_H
-#define BTRFS_CHECK_INTEGRITY_H
-
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-void btrfsic_check_bio(struct bio *bio);
-#else
-static inline void btrfsic_check_bio(struct bio *bio) { }
-#endif
-
-int btrfsic_mount(struct btrfs_fs_info *fs_info,
- struct btrfs_fs_devices *fs_devices,
- int including_extent_data, u32 print_mask);
-void btrfsic_unmount(struct btrfs_fs_devices *fs_devices);
-
-#endif
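
The header removed above also shows the usual config-gated stub pattern: with CONFIG_BTRFS_FS_CHECK_INTEGRITY disabled, btrfsic_check_bio() becomes an empty static inline, so call sites stay free of #ifdefs and the compiler can drop the calls entirely. A generic sketch of the same pattern, with invented names (CONFIG_MY_FEATURE, my_feature_hook):

/* my_feature.h -- hypothetical header using the same stub pattern. */
#ifndef MY_FEATURE_H
#define MY_FEATURE_H

#ifdef CONFIG_MY_FEATURE
void my_feature_hook(void *data);               /* real implementation built elsewhere */
#else
static inline void my_feature_hook(void *data)
{
        (void)data;                             /* feature compiled out: no-op */
}
#endif

#endif /* MY_FEATURE_H */
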
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8818ed5c390f..19b22b4653c8 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -193,12 +193,12 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb)
unsigned long index = cb->start >> PAGE_SHIFT;
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct folio_batch fbatch;
- const int errno = blk_status_to_errno(cb->bbio.bio.bi_status);
+ const int error = blk_status_to_errno(cb->bbio.bio.bi_status);
int i;
int ret;
- if (errno)
- mapping_set_error(inode->i_mapping, errno);
+ if (error)
+ mapping_set_error(inode->i_mapping, error);
folio_batch_init(&fbatch);
while (index <= end_index) {
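
The errno -> error rename in end_compressed_writeback() looks like naming hygiene: errno is a reserved identifier that <errno.h> may define as a macro, so a local variable of that name is easy to misread. A small userspace sketch of the convention the code follows (0 on success, negative errno value on failure), with a local that does not clash:

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical helper in kernel style: returns 0 on success, -EIO on failure. */
static int do_write(int fail)
{
        return fail ? -EIO : 0;
}

int main(void)
{
        const int error = do_write(1);  /* named "error", not "errno" */

        if (error)
                printf("write failed: %s (%d)\n", strerror(-error), error);
        return 0;
}
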
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 617d4827eec2..35c1d24d4a78 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -230,9 +230,9 @@ noinline void btrfs_release_path(struct btrfs_path *p)
* cause could be a bug, eg. due to ENOSPC, and not for common errors that are
* caused by external factors.
*/
-bool __cold abort_should_print_stack(int errno)
+bool __cold abort_should_print_stack(int error)
{
- switch (errno) {
+ switch (error) {
case -EIO:
case -EROFS:
case -ENOMEM:
@@ -316,6 +316,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
int ret = 0;
int level;
struct btrfs_disk_key disk_key;
+ u64 reloc_src_root = 0;
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
@@ -328,9 +329,11 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(buf, &disk_key, 0);
+ if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+ reloc_src_root = btrfs_header_owner(buf);
cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
&disk_key, level, buf->start, 0,
- BTRFS_NESTING_NEW_ROOT);
+ reloc_src_root, BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -359,7 +362,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
return ret;
}
- btrfs_mark_buffer_dirty(cow);
+ btrfs_mark_buffer_dirty(trans, cow);
*cow_ret = cow;
return 0;
}
@@ -429,7 +432,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (btrfs_block_can_be_shared(trans, root, buf)) {
ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
btrfs_header_level(buf), 1,
- &refs, &flags);
+ &refs, &flags, NULL);
if (ret)
return ret;
if (unlikely(refs == 0)) {
@@ -518,13 +521,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
* bytes the allocator should try to find free next to the block it returns.
* This is just a hint and may be ignored by the allocator.
*/
-static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf,
- struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret,
- u64 search_start, u64 empty_size,
- enum btrfs_lock_nesting nest)
+int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer *parent, int parent_slot,
+ struct extent_buffer **cow_ret,
+ u64 search_start, u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_disk_key disk_key;
@@ -533,6 +536,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
int last_ref = 0;
int unlock_orig = 0;
u64 parent_start = 0;
+ u64 reloc_src_root = 0;
if (*cow_ret == buf)
unlock_orig = 1;
@@ -551,12 +555,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
else
btrfs_node_key(buf, &disk_key, 0);
- if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
- parent_start = parent->start;
-
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ if (parent)
+ parent_start = parent->start;
+ reloc_src_root = btrfs_header_owner(buf);
+ }
cow = btrfs_alloc_tree_block(trans, root, parent_start,
root->root_key.objectid, &disk_key, level,
- search_start, empty_size, nest);
+ search_start, empty_size, reloc_src_root, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -627,7 +633,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
cow->start);
btrfs_set_node_ptr_generation(parent, parent_slot,
trans->transid);
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
if (last_ref) {
ret = btrfs_tree_mod_log_free_eb(buf);
if (ret) {
@@ -643,7 +649,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (unlock_orig)
btrfs_tree_unlock(buf);
free_extent_buffer_stale(buf);
- btrfs_mark_buffer_dirty(cow);
+ btrfs_mark_buffer_dirty(trans, cow);
*cow_ret = cow;
return 0;
}
@@ -679,11 +685,11 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
}
/*
- * cows a single block, see __btrfs_cow_block for the real work.
+ * COWs a single block, see btrfs_force_cow_block() for the real work.
* This version of it has extra checks so that a block isn't COWed more than
* once per transaction, as long as it hasn't been written yet
*/
-noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
+int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
@@ -723,7 +729,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
return 0;
}
- search_start = buf->start & ~((u64)SZ_1G - 1);
+ search_start = round_down(buf->start, SZ_1G);
/*
* Before CoWing this block for later modification, check if it's
@@ -732,8 +738,8 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
* Also we don't care about the error, as it's handled internally.
*/
btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
- ret = __btrfs_cow_block(trans, root, buf, parent,
- parent_slot, cow_ret, search_start, 0, nest);
+ ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
+ cow_ret, search_start, 0, nest);
trace_btrfs_cow_block(root, buf, *cow_ret);
@@ -742,49 +748,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
/*
- * helper function for defrag to decide if two blocks pointed to by a
- * node are actually close by
- */
-static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
-{
- if (blocknr < other && other - (blocknr + blocksize) < 32768)
- return 1;
- if (blocknr > other && blocknr - (other + blocksize) < 32768)
- return 1;
- return 0;
-}
-
-#ifdef __LITTLE_ENDIAN
-
-/*
- * Compare two keys, on little-endian the disk order is same as CPU order and
- * we can avoid the conversion.
- */
-static int comp_keys(const struct btrfs_disk_key *disk_key,
- const struct btrfs_key *k2)
-{
- const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
-
- return btrfs_comp_cpu_keys(k1, k2);
-}
-
-#else
-
-/*
- * compare two keys in a memcmp fashion
- */
-static int comp_keys(const struct btrfs_disk_key *disk,
- const struct btrfs_key *k2)
-{
- struct btrfs_key k1;
-
- btrfs_disk_key_to_cpu(&k1, disk);
-
- return btrfs_comp_cpu_keys(&k1, k2);
-}
-#endif
-
-/*
* same as comp_keys only with two btrfs_key's
*/
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
@@ -805,105 +768,6 @@ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_ke
}
/*
- * this is used by the defrag code to go through all the
- * leaves pointed to by a node and reallocate them so that
- * disk order is close to key order
- */
-int btrfs_realloc_node(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *parent,
- int start_slot, u64 *last_ret,
- struct btrfs_key *progress)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct extent_buffer *cur;
- u64 blocknr;
- u64 search_start = *last_ret;
- u64 last_block = 0;
- u64 other;
- u32 parent_nritems;
- int end_slot;
- int i;
- int err = 0;
- u32 blocksize;
- int progress_passed = 0;
- struct btrfs_disk_key disk_key;
-
- /*
- * COWing must happen through a running transaction, which always
- * matches the current fs generation (it's a transaction with a state
- * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
- * into error state to prevent the commit of any transaction.
- */
- if (unlikely(trans->transaction != fs_info->running_transaction ||
- trans->transid != fs_info->generation)) {
- btrfs_abort_transaction(trans, -EUCLEAN);
- btrfs_crit(fs_info,
-"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
- parent->start, btrfs_root_id(root), trans->transid,
- fs_info->running_transaction->transid,
- fs_info->generation);
- return -EUCLEAN;
- }
-
- parent_nritems = btrfs_header_nritems(parent);
- blocksize = fs_info->nodesize;
- end_slot = parent_nritems - 1;
-
- if (parent_nritems <= 1)
- return 0;
-
- for (i = start_slot; i <= end_slot; i++) {
- int close = 1;
-
- btrfs_node_key(parent, &disk_key, i);
- if (!progress_passed && comp_keys(&disk_key, progress) < 0)
- continue;
-
- progress_passed = 1;
- blocknr = btrfs_node_blockptr(parent, i);
- if (last_block == 0)
- last_block = blocknr;
-
- if (i > 0) {
- other = btrfs_node_blockptr(parent, i - 1);
- close = close_blocks(blocknr, other, blocksize);
- }
- if (!close && i < end_slot) {
- other = btrfs_node_blockptr(parent, i + 1);
- close = close_blocks(blocknr, other, blocksize);
- }
- if (close) {
- last_block = blocknr;
- continue;
- }
-
- cur = btrfs_read_node_slot(parent, i);
- if (IS_ERR(cur))
- return PTR_ERR(cur);
- if (search_start == 0)
- search_start = last_block;
-
- btrfs_tree_lock(cur);
- err = __btrfs_cow_block(trans, root, cur, parent, i,
- &cur, search_start,
- min(16 * blocksize,
- (end_slot - i) * blocksize),
- BTRFS_NESTING_COW);
- if (err) {
- btrfs_tree_unlock(cur);
- free_extent_buffer(cur);
- break;
- }
- search_start = cur->start;
- last_block = cur->start;
- *last_ret = search_start;
- btrfs_tree_unlock(cur);
- free_extent_buffer(cur);
- }
- return err;
-}
-
-/*
* Search for a key in the given extent_buffer.
*
* The lower boundary for the search is specified by the slot number @first_slot.
@@ -969,7 +833,7 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
tmp = &unaligned;
}
- ret = comp_keys(tmp, key);
+ ret = btrfs_comp_keys(tmp, key);
if (ret < 0)
low = mid + 1;
@@ -984,19 +848,19 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
return 1;
}
-static void root_add_used(struct btrfs_root *root, u32 size)
+static void root_add_used_bytes(struct btrfs_root *root)
{
spin_lock(&root->accounting_lock);
btrfs_set_root_used(&root->root_item,
- btrfs_root_used(&root->root_item) + size);
+ btrfs_root_used(&root->root_item) + root->fs_info->nodesize);
spin_unlock(&root->accounting_lock);
}
-static void root_sub_used(struct btrfs_root *root, u32 size)
+static void root_sub_used_bytes(struct btrfs_root *root)
{
spin_lock(&root->accounting_lock);
btrfs_set_root_used(&root->root_item,
- btrfs_root_used(&root->root_item) - size);
+ btrfs_root_used(&root->root_item) - root->fs_info->nodesize);
spin_unlock(&root->accounting_lock);
}
@@ -1112,7 +976,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* once for the path */
free_extent_buffer(mid);
- root_sub_used(root, mid->len);
+ root_sub_used_bytes(root);
btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
/* once for the root ptr */
free_extent_buffer_stale(mid);
@@ -1182,7 +1046,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
right = NULL;
goto out;
}
- root_sub_used(root, right->len);
+ root_sub_used_bytes(root);
btrfs_free_tree_block(trans, btrfs_root_id(root), right,
0, 1);
free_extent_buffer_stale(right);
@@ -1197,7 +1061,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto out;
}
btrfs_set_node_key(parent, &right_key, pslot + 1);
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
}
}
if (btrfs_header_nritems(mid) == 1) {
@@ -1240,7 +1104,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
mid = NULL;
goto out;
}
- root_sub_used(root, mid->len);
+ root_sub_used_bytes(root);
btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
free_extent_buffer_stale(mid);
mid = NULL;
@@ -1255,7 +1119,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto out;
}
btrfs_set_node_key(parent, &mid_key, pslot);
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
}
/* update the path */
@@ -1362,7 +1226,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
return ret;
}
btrfs_set_node_key(parent, &disk_key, pslot);
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
if (btrfs_header_nritems(left) > orig_slot) {
path->nodes[level] = left;
path->slots[level + 1] -= 1;
@@ -1422,7 +1286,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
return ret;
}
btrfs_set_node_key(parent, &disk_key, pslot + 1);
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
if (btrfs_header_nritems(mid) <= orig_slot) {
path->nodes[level] = right;
@@ -2006,7 +1870,7 @@ static int search_leaf(struct btrfs_trans_handle *trans,
* the extent buffer's header and we have recently accessed
* the header's level field.
*/
- ret = comp_keys(&first_key, key);
+ ret = btrfs_comp_keys(&first_key, key);
if (ret < 0) {
/*
* The first key is smaller than the key we want
@@ -2091,8 +1955,8 @@ static int search_leaf(struct btrfs_trans_handle *trans,
}
/*
- * btrfs_search_slot - look for a key in a tree and perform necessary
- * modifications to preserve tree invariants.
+ * Look for a key in a tree and perform necessary modifications to preserve
+ * tree invariants.
*
* @trans: Handle of transaction, used when modifying the tree
* @p: Holds all btree nodes along the search path
@@ -2515,7 +2379,7 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
*/
if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
- ret = comp_keys(&found_key, &orig_key);
+ ret = btrfs_comp_keys(&found_key, &orig_key);
if (ret == 0) {
if (path->slots[0] > 0) {
path->slots[0]--;
@@ -2530,7 +2394,7 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
}
btrfs_item_key(path->nodes[0], &found_key, 0);
- ret = comp_keys(&found_key, &key);
+ ret = btrfs_comp_keys(&found_key, &key);
/*
* We might have had an item with the previous key in the tree right
* before we released our path. And after we released our path, that
@@ -2678,7 +2542,8 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
* higher levels
*
*/
-static void fixup_low_keys(struct btrfs_path *path,
+static void fixup_low_keys(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
struct btrfs_disk_key *key, int level)
{
int i;
@@ -2695,7 +2560,7 @@ static void fixup_low_keys(struct btrfs_path *path,
BTRFS_MOD_LOG_KEY_REPLACE);
BUG_ON(ret < 0);
btrfs_set_node_key(t, key, tslot);
- btrfs_mark_buffer_dirty(path->nodes[i]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[i]);
if (tslot != 0)
break;
}
@@ -2707,10 +2572,11 @@ static void fixup_low_keys(struct btrfs_path *path,
* This function isn't completely safe. It's the caller's responsibility
* that the new key won't break the order
*/
-void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
const struct btrfs_key *new_key)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_disk_key disk_key;
struct extent_buffer *eb;
int slot;
@@ -2719,7 +2585,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
slot = path->slots[0];
if (slot > 0) {
btrfs_item_key(eb, &disk_key, slot - 1);
- if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
+ if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) {
btrfs_print_leaf(eb);
btrfs_crit(fs_info,
"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
@@ -2733,7 +2599,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
}
if (slot < btrfs_header_nritems(eb) - 1) {
btrfs_item_key(eb, &disk_key, slot + 1);
- if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
+ if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) {
btrfs_print_leaf(eb);
btrfs_crit(fs_info,
"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
@@ -2748,9 +2614,9 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
btrfs_cpu_key_to_disk(&disk_key, new_key);
btrfs_set_item_key(eb, &disk_key, slot);
- btrfs_mark_buffer_dirty(eb);
+ btrfs_mark_buffer_dirty(trans, eb);
if (slot == 0)
- fixup_low_keys(path, &disk_key, 1);
+ fixup_low_keys(trans, path, &disk_key, 1);
}
/*
@@ -2881,8 +2747,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
}
btrfs_set_header_nritems(src, src_nritems - push_items);
btrfs_set_header_nritems(dst, dst_nritems + push_items);
- btrfs_mark_buffer_dirty(src);
- btrfs_mark_buffer_dirty(dst);
+ btrfs_mark_buffer_dirty(trans, src);
+ btrfs_mark_buffer_dirty(trans, dst);
return ret;
}
@@ -2957,8 +2823,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(src, src_nritems - push_items);
btrfs_set_header_nritems(dst, dst_nritems + push_items);
- btrfs_mark_buffer_dirty(src);
- btrfs_mark_buffer_dirty(dst);
+ btrfs_mark_buffer_dirty(trans, src);
+ btrfs_mark_buffer_dirty(trans, dst);
return ret;
}
@@ -2974,7 +2840,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int level)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
u64 lower_gen;
struct extent_buffer *lower;
struct extent_buffer *c;
@@ -2993,11 +2858,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
&lower_key, level, root->node->start, 0,
- BTRFS_NESTING_NEW_ROOT);
+ 0, BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
return PTR_ERR(c);
- root_add_used(root, fs_info->nodesize);
+ root_add_used_bytes(root);
btrfs_set_header_nritems(c, 1);
btrfs_set_node_key(c, &lower_key, 0);
@@ -3007,7 +2872,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_set_node_ptr_generation(c, 0, lower_gen);
- btrfs_mark_buffer_dirty(c);
+ btrfs_mark_buffer_dirty(trans, c);
old = root->node;
ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
@@ -3079,7 +2944,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans,
WARN_ON(trans->transid == 0);
btrfs_set_node_ptr_generation(lower, slot, trans->transid);
btrfs_set_header_nritems(lower, nritems + 1);
- btrfs_mark_buffer_dirty(lower);
+ btrfs_mark_buffer_dirty(trans, lower);
return 0;
}
@@ -3137,11 +3002,11 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
&disk_key, level, c->start, 0,
- BTRFS_NESTING_SPLIT);
+ 0, BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
return PTR_ERR(split);
- root_add_used(root, fs_info->nodesize);
+ root_add_used_bytes(root);
ASSERT(btrfs_header_level(c) == level);
ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
@@ -3158,8 +3023,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(split, c_nritems - mid);
btrfs_set_header_nritems(c, mid);
- btrfs_mark_buffer_dirty(c);
- btrfs_mark_buffer_dirty(split);
+ btrfs_mark_buffer_dirty(trans, c);
+ btrfs_mark_buffer_dirty(trans, split);
ret = insert_ptr(trans, path, &disk_key, split->start,
path->slots[level + 1] + 1, level + 1);
@@ -3325,15 +3190,15 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(left, left_nritems);
if (left_nritems)
- btrfs_mark_buffer_dirty(left);
+ btrfs_mark_buffer_dirty(trans, left);
else
btrfs_clear_buffer_dirty(trans, left);
- btrfs_mark_buffer_dirty(right);
+ btrfs_mark_buffer_dirty(trans, right);
btrfs_item_key(right, &disk_key, 0);
btrfs_set_node_key(upper, &disk_key, slot + 1);
- btrfs_mark_buffer_dirty(upper);
+ btrfs_mark_buffer_dirty(trans, upper);
/* then fixup the leaf pointer in the path */
if (path->slots[0] >= left_nritems) {
@@ -3545,14 +3410,14 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
btrfs_set_token_item_offset(&token, i, push_space);
}
- btrfs_mark_buffer_dirty(left);
+ btrfs_mark_buffer_dirty(trans, left);
if (right_nritems)
- btrfs_mark_buffer_dirty(right);
+ btrfs_mark_buffer_dirty(trans, right);
else
btrfs_clear_buffer_dirty(trans, right);
btrfs_item_key(right, &disk_key, 0);
- fixup_low_keys(path, &disk_key, 1);
+ fixup_low_keys(trans, path, &disk_key, 1);
/* then fixup the leaf pointer in the path */
if (path->slots[0] < push_items) {
@@ -3683,8 +3548,8 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
if (ret < 0)
return ret;
- btrfs_mark_buffer_dirty(right);
- btrfs_mark_buffer_dirty(l);
+ btrfs_mark_buffer_dirty(trans, right);
+ btrfs_mark_buffer_dirty(trans, l);
BUG_ON(path->slots[0] != slot);
if (mid <= slot) {
@@ -3888,13 +3753,13 @@ again:
* use BTRFS_NESTING_NEW_ROOT.
*/
right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
- &disk_key, 0, l->start, 0,
+ &disk_key, 0, l->start, 0, 0,
num_doubles ? BTRFS_NESTING_NEW_ROOT :
BTRFS_NESTING_SPLIT);
if (IS_ERR(right))
return PTR_ERR(right);
- root_add_used(root, fs_info->nodesize);
+ root_add_used_bytes(root);
if (split == 0) {
if (mid <= slot) {
@@ -3925,7 +3790,7 @@ again:
path->nodes[0] = right;
path->slots[0] = 0;
if (path->slots[1] == 0)
- fixup_low_keys(path, &disk_key, 1);
+ fixup_low_keys(trans, path, &disk_key, 1);
}
/*
* We create a new leaf 'right' for the required ins_len and
@@ -4024,7 +3889,8 @@ err:
return ret;
}
-static noinline int split_item(struct btrfs_path *path,
+static noinline int split_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
const struct btrfs_key *new_key,
unsigned long split_offset)
{
@@ -4083,7 +3949,7 @@ static noinline int split_item(struct btrfs_path *path,
write_extent_buffer(leaf, buf + split_offset,
btrfs_item_ptr_offset(leaf, slot),
item_size - split_offset);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
BUG_ON(btrfs_leaf_free_space(leaf) < 0);
kfree(buf);
@@ -4117,7 +3983,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = split_item(path, new_key, split_offset);
+ ret = split_item(trans, path, new_key, split_offset);
return ret;
}
@@ -4127,7 +3993,8 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
* off the end of the item or if we shift the item to chop bytes off
* the front.
*/
-void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
+void btrfs_truncate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u32 new_size, int from_end)
{
int slot;
struct extent_buffer *leaf;
@@ -4203,11 +4070,11 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
btrfs_set_item_key(leaf, &disk_key, slot);
if (slot == 0)
- fixup_low_keys(path, &disk_key, 1);
+ fixup_low_keys(trans, path, &disk_key, 1);
}
btrfs_set_item_size(leaf, slot, new_size);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
btrfs_print_leaf(leaf);
@@ -4218,7 +4085,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
/*
* make the item pointed to by the path bigger, data_size is the added size.
*/
-void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
+void btrfs_extend_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u32 data_size)
{
int slot;
struct extent_buffer *leaf;
@@ -4268,7 +4136,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
data_end = old_data;
old_size = btrfs_item_size(leaf, slot);
btrfs_set_item_size(leaf, slot, old_size + data_size);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
btrfs_print_leaf(leaf);
@@ -4279,6 +4147,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
/*
* Make space in the node before inserting one or more items.
*
+ * @trans: transaction handle
* @root: root we are inserting items to
* @path: points to the leaf/slot where we are going to insert new items
* @batch: information about the batch of items to insert
@@ -4286,7 +4155,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
* Main purpose is to save stack depth by doing the bulk of the work in a
* function that doesn't call btrfs_search_slot
*/
-static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+static void setup_items_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
const struct btrfs_item_batch *batch)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4306,7 +4176,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
*/
if (path->slots[0] == 0) {
btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
- fixup_low_keys(path, &disk_key, 1);
+ fixup_low_keys(trans, path, &disk_key, 1);
}
btrfs_unlock_up_safe(path, 1);
@@ -4365,7 +4235,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
}
btrfs_set_header_nritems(leaf, nritems + batch->nr);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
btrfs_print_leaf(leaf);
@@ -4376,12 +4246,14 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
/*
* Insert a new item into a leaf.
*
+ * @trans: Transaction handle.
* @root: The root of the btree.
* @path: A path pointing to the target leaf and slot.
* @key: The key of the new item.
* @data_size: The size of the data associated with the new key.
*/
-void btrfs_setup_item_for_insert(struct btrfs_root *root,
+void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
struct btrfs_path *path,
const struct btrfs_key *key,
u32 data_size)
@@ -4393,7 +4265,7 @@ void btrfs_setup_item_for_insert(struct btrfs_root *root,
batch.total_data_size = data_size;
batch.nr = 1;
- setup_items_for_insert(root, path, &batch);
+ setup_items_for_insert(trans, root, path, &batch);
}
/*
@@ -4419,7 +4291,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
slot = path->slots[0];
BUG_ON(slot < 0);
- setup_items_for_insert(root, path, batch);
+ setup_items_for_insert(trans, root, path, batch);
return 0;
}
@@ -4444,7 +4316,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
leaf = path->nodes[0];
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
write_extent_buffer(leaf, data, ptr, data_size);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
}
btrfs_free_path(path);
return ret;
@@ -4475,7 +4347,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
return ret;
path->slots[0]++;
- btrfs_setup_item_for_insert(root, path, new_key, item_size);
+ btrfs_setup_item_for_insert(trans, root, path, new_key, item_size);
leaf = path->nodes[0];
memcpy_extent_buffer(leaf,
btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -4533,9 +4405,9 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_disk_key disk_key;
btrfs_node_key(parent, &disk_key, 0);
- fixup_low_keys(path, &disk_key, level + 1);
+ fixup_low_keys(trans, path, &disk_key, level + 1);
}
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
return 0;
}
@@ -4567,7 +4439,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
*/
btrfs_unlock_up_safe(path, 0);
- root_sub_used(root, leaf->len);
+ root_sub_used_bytes(root);
atomic_inc(&leaf->refs);
btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
@@ -4632,7 +4504,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_disk_key disk_key;
btrfs_item_key(leaf, &disk_key, 0);
- fixup_low_keys(path, &disk_key, 1);
+ fixup_low_keys(trans, path, &disk_key, 1);
}
/*
@@ -4697,11 +4569,11 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
* dirtied this buffer
*/
if (path->nodes[0] == leaf)
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
free_extent_buffer(leaf);
}
} else {
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
}
}
return ret;
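
One small cleanup in the ctree.c hunks above replaces the open-coded mask "buf->start & ~((u64)SZ_1G - 1)" with round_down(buf->start, SZ_1G). For a power-of-two alignment the two forms are equivalent; a standalone check of that equivalence, where ROUND_DOWN and SZ_1G are local stand-ins for the kernel macros:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SZ_1G                   (1024ULL * 1024 * 1024)
/* Generic definition; for a power-of-two 'a' it matches the bit-mask form. */
#define ROUND_DOWN(x, a)        ((x) / (a) * (a))

int main(void)
{
        uint64_t start = 5 * SZ_1G + 123456789;

        assert(ROUND_DOWN(start, SZ_1G) == (start & ~(SZ_1G - 1)));
        printf("search_start = %llu\n", (unsigned long long)ROUND_DOWN(start, SZ_1G));
        return 0;
}
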
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ff40acd63a37..196c005c31f6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -6,37 +6,10 @@
#ifndef BTRFS_CTREE_H
#define BTRFS_CTREE_H
-#include <linux/mm.h>
-#include <linux/sched/signal.h>
-#include <linux/highmem.h>
-#include <linux/fs.h>
-#include <linux/rwsem.h>
-#include <linux/semaphore.h>
-#include <linux/completion.h>
-#include <linux/backing-dev.h>
-#include <linux/wait.h>
-#include <linux/slab.h>
-#include <trace/events/btrfs.h>
-#include <asm/unaligned.h>
#include <linux/pagemap.h>
-#include <linux/btrfs.h>
-#include <linux/btrfs_tree.h>
-#include <linux/workqueue.h>
-#include <linux/security.h>
-#include <linux/sizes.h>
-#include <linux/dynamic_debug.h>
-#include <linux/refcount.h>
-#include <linux/crc32c.h>
-#include <linux/iomap.h>
-#include <linux/fscrypt.h>
-#include "extent-io-tree.h"
-#include "extent_io.h"
-#include "extent_map.h"
-#include "async-thread.h"
-#include "block-rsv.h"
#include "locking.h"
-#include "misc.h"
#include "fs.h"
+#include "accessors.h"
struct btrfs_trans_handle;
struct btrfs_transaction;
@@ -218,10 +191,22 @@ struct btrfs_root {
atomic_t log_commit[2];
/* Used only for log trees of subvolumes, not for the log root tree */
atomic_t log_batch;
+ /*
+ * Protected by the 'log_mutex' lock but can be read without holding
+ * that lock to avoid unnecessary lock contention, in which case it
+ * should be read using btrfs_get_root_log_transid() except if it's a
+ * log tree in which case it can be directly accessed. Updates to this
+ * field should always use btrfs_set_root_log_transid(), except for log
+ * trees where the field can be updated directly.
+ */
int log_transid;
/* No matter whether the commit succeeds or not. */
int log_transid_committed;
- /* Just be updated when the commit succeeds. */
+ /*
+ * Only updated when the commit succeeds. Use
+ * btrfs_get_root_last_log_commit() and btrfs_set_root_last_log_commit()
+ * to access this field.
+ */
int last_log_commit;
pid_t log_start_pid;
@@ -326,6 +311,9 @@ struct btrfs_root {
/* Used only by log trees, when logging csum items */
struct extent_io_tree log_csum_range;
+ /* Used in simple quotas, track root during relocation. */
+ u64 relocation_src_root;
+
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
u64 alloc_bytenr;
#endif
@@ -352,6 +340,26 @@ static inline u64 btrfs_root_id(const struct btrfs_root *root)
return root->root_key.objectid;
}
+static inline int btrfs_get_root_log_transid(const struct btrfs_root *root)
+{
+ return READ_ONCE(root->log_transid);
+}
+
+static inline void btrfs_set_root_log_transid(struct btrfs_root *root, int log_transid)
+{
+ WRITE_ONCE(root->log_transid, log_transid);
+}
+
+static inline int btrfs_get_root_last_log_commit(const struct btrfs_root *root)
+{
+ return READ_ONCE(root->last_log_commit);
+}
+
+static inline void btrfs_set_root_last_log_commit(struct btrfs_root *root, int commit_id)
+{
+ WRITE_ONCE(root->last_log_commit, commit_id);
+}
+
/*
* Structure that conveys information about an extent that is going to replace
* all the extents in a file range.
@@ -470,30 +478,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
((bytes) >> (fs_info)->sectorsize_bits)
-static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length)
-{
- return crc32c(crc, address, length);
-}
-
-static inline void btrfs_crc32c_final(u32 crc, u8 *result)
-{
- put_unaligned_le32(~crc, result);
-}
-
-static inline u64 btrfs_name_hash(const char *name, int len)
-{
- return crc32c((u32)~1, name, len);
-}
-
-/*
- * Figure the key offset of an extended inode ref
- */
-static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
- int len)
-{
- return (u64) crc32c(parent_objectid, name, len);
-}
-
static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
{
return mapping_gfp_constraint(mapping, ~__GFP_FS);
@@ -513,12 +497,42 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
const struct btrfs_key *key, int *slot);
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
+
+#ifdef __LITTLE_ENDIAN
+
+/*
+ * Compare two keys, on little-endian the disk order is the same as the CPU order and
+ * we can avoid the conversion.
+ */
+static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk_key,
+ const struct btrfs_key *k2)
+{
+ const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
+
+ return btrfs_comp_cpu_keys(k1, k2);
+}
+
+#else
+
+/* Compare two keys in a memcmp fashion. */
+static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk,
+ const struct btrfs_key *k2)
+{
+ struct btrfs_key k1;
+
+ btrfs_disk_key_to_cpu(&k1, disk);
+
+ return btrfs_comp_cpu_keys(&k1, k2);
+}
+
+#endif
+
int btrfs_previous_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid,
int type);
int btrfs_previous_extent_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid);
-void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
const struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
@@ -536,6 +550,13 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
enum btrfs_lock_nesting nest);
+int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer *parent, int parent_slot,
+ struct extent_buffer **cow_ret,
+ u64 search_start, u64 empty_size,
+ enum btrfs_lock_nesting nest);
int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
@@ -545,8 +566,10 @@ int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
struct extent_buffer *buf);
int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int level, int slot);
-void btrfs_extend_item(struct btrfs_path *path, u32 data_size);
-void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end);
+void btrfs_extend_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u32 data_size);
+void btrfs_truncate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u32 new_size, int from_end);
int btrfs_split_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -567,10 +590,6 @@ int btrfs_search_slot_for_read(struct btrfs_root *root,
const struct btrfs_key *key,
struct btrfs_path *p, int find_higher,
int return_any);
-int btrfs_realloc_node(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *parent,
- int start_slot, u64 *last_ret,
- struct btrfs_key *progress);
void btrfs_release_path(struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);
@@ -610,7 +629,8 @@ struct btrfs_item_batch {
int nr;
};
-void btrfs_setup_item_for_insert(struct btrfs_root *root,
+void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
struct btrfs_path *path,
const struct btrfs_key *key,
u32 data_size);
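
btrfs_comp_keys(), moved into ctree.h above, keeps a little-endian fast path: when the host byte order already matches the on-disk (little-endian) order, the disk key can be compared without any conversion, while big-endian hosts must byte-swap first. A userspace sketch of the same idea using glibc's <endian.h> helpers; the structs are illustrative and not the real btrfs key layout:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

struct disk_key { uint64_t objectid; }; /* stored little-endian on disk */
struct cpu_key  { uint64_t objectid; }; /* native byte order */

static int comp_keys(const struct disk_key *d, const struct cpu_key *k)
{
#if __BYTE_ORDER == __LITTLE_ENDIAN
        uint64_t v = d->objectid;               /* no conversion needed */
#else
        uint64_t v = le64toh(d->objectid);      /* byte-swap on big-endian hosts */
#endif
        if (v < k->objectid)
                return -1;
        return v > k->objectid ? 1 : 0;
}

int main(void)
{
        struct disk_key d = { .objectid = htole64(42) };
        struct cpu_key k = { .objectid = 42 };

        printf("comp_keys = %d\n", comp_keys(&d, &k));  /* prints 0: keys are equal */
        return 0;
}
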
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index f2ff4cbe8656..5244561e2016 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -338,13 +338,118 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
}
/*
+ * Check if two block addresses are close, used by defrag.
+ */
+static bool close_blocks(u64 blocknr, u64 other, u32 blocksize)
+{
+ if (blocknr < other && other - (blocknr + blocksize) < SZ_32K)
+ return true;
+ if (blocknr > other && blocknr - (other + blocksize) < SZ_32K)
+ return true;
+ return false;
+}
+
+/*
+ * Go through all the leaves pointed to by a node and reallocate them so that
+ * disk order is close to key order.
+ */
+static int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *parent,
+ int start_slot, u64 *last_ret,
+ struct btrfs_key *progress)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ const u32 blocksize = fs_info->nodesize;
+ const int end_slot = btrfs_header_nritems(parent) - 1;
+ u64 search_start = *last_ret;
+ u64 last_block = 0;
+ int ret = 0;
+ bool progress_passed = false;
+
+ /*
+ * COWing must happen through a running transaction, which always
+ * matches the current fs generation (it's a transaction with a state
+ * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
+ * into error state to prevent the commit of any transaction.
+ */
+ if (unlikely(trans->transaction != fs_info->running_transaction ||
+ trans->transid != fs_info->generation)) {
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ btrfs_crit(fs_info,
+"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
+ parent->start, btrfs_root_id(root), trans->transid,
+ fs_info->running_transaction->transid,
+ fs_info->generation);
+ return -EUCLEAN;
+ }
+
+ if (btrfs_header_nritems(parent) <= 1)
+ return 0;
+
+ for (int i = start_slot; i <= end_slot; i++) {
+ struct extent_buffer *cur;
+ struct btrfs_disk_key disk_key;
+ u64 blocknr;
+ u64 other;
+ bool close = true;
+
+ btrfs_node_key(parent, &disk_key, i);
+ if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0)
+ continue;
+
+ progress_passed = true;
+ blocknr = btrfs_node_blockptr(parent, i);
+ if (last_block == 0)
+ last_block = blocknr;
+
+ if (i > 0) {
+ other = btrfs_node_blockptr(parent, i - 1);
+ close = close_blocks(blocknr, other, blocksize);
+ }
+ if (!close && i < end_slot) {
+ other = btrfs_node_blockptr(parent, i + 1);
+ close = close_blocks(blocknr, other, blocksize);
+ }
+ if (close) {
+ last_block = blocknr;
+ continue;
+ }
+
+ cur = btrfs_read_node_slot(parent, i);
+ if (IS_ERR(cur))
+ return PTR_ERR(cur);
+ if (search_start == 0)
+ search_start = last_block;
+
+ btrfs_tree_lock(cur);
+ ret = btrfs_force_cow_block(trans, root, cur, parent, i,
+ &cur, search_start,
+ min(16 * blocksize,
+ (end_slot - i) * blocksize),
+ BTRFS_NESTING_COW);
+ if (ret) {
+ btrfs_tree_unlock(cur);
+ free_extent_buffer(cur);
+ break;
+ }
+ search_start = cur->start;
+ last_block = cur->start;
+ *last_ret = search_start;
+ btrfs_tree_unlock(cur);
+ free_extent_buffer(cur);
+ }
+ return ret;
+}
+
+/*
* Defrag all the leaves in a given btree.
* Read all the leaves and try to get key order to
* better reflect disk order
*/
-int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
{
struct btrfs_path *path = NULL;
struct btrfs_key key;
@@ -461,6 +566,45 @@ done:
}
/*
+ * Defrag a given btree. Every leaf in the btree is read and defragmented.
+ */
+int btrfs_defrag_root(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+
+ if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
+ return 0;
+
+ while (1) {
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+
+ ret = btrfs_defrag_leaves(trans, root);
+
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
+ cond_resched();
+
+ if (btrfs_fs_closing(fs_info) || ret != -EAGAIN)
+ break;
+
+ if (btrfs_defrag_cancelled(fs_info)) {
+ btrfs_debug(fs_info, "defrag_root cancelled");
+ ret = -EAGAIN;
+ break;
+ }
+ }
+ clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
+ return ret;
+}
+
+/*
* Defrag specific helper to get an extent map.
*
* Differences between this and btrfs_get_extent() are:
@@ -891,8 +1035,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
* very likely resulting in a larger extent after writeback is
* triggered (except in a case of free space fragmentation).
*/
- if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
- EXTENT_DELALLOC, 0, NULL))
+ if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
+ EXTENT_DELALLOC))
goto next;
/*
diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h
index 5305f2283b5e..5a62763528d1 100644
--- a/fs/btrfs/defrag.h
+++ b/fs/btrfs/defrag.h
@@ -12,7 +12,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u32 extent_thresh);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
-int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+int btrfs_defrag_root(struct btrfs_root *root);
static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
{
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 427abaf608b8..2833e8ef4c09 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -199,7 +199,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
start = round_down(start, fs_info->sectorsize);
btrfs_free_reserved_data_space_noquota(fs_info, len);
- btrfs_qgroup_free_data(inode, reserved, start, len);
+ btrfs_qgroup_free_data(inode, reserved, start, len, NULL);
}
/*
@@ -322,9 +322,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
} else {
if (current->journal_info)
flush = BTRFS_RESERVE_FLUSH_LIMIT;
-
- if (btrfs_transaction_in_commit(fs_info))
- schedule_timeout(1);
}
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
@@ -346,7 +343,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
noflush);
if (ret)
return ret;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
+ meta_reserve, flush);
if (ret) {
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
return ret;
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 90aaedce1548..7381241334e8 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -328,7 +328,8 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
}
/*
- * __btrfs_lookup_delayed_item - look up the delayed item by key
+ * Look up the delayed item by key.
+ *
* @delayed_node: pointer to the delayed node
* @index: the dir index value to lookup (offset of a dir index key)
*
@@ -517,7 +518,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
/*
* For insertions we track reserved metadata space by accounting
* for the number of leaves that will be used, based on the delayed
- * node's index_items_size field.
+ * node's curr_index_batch_size and index_item_leaves fields.
*/
if (item->type == BTRFS_DELAYED_DELETION_ITEM)
item->bytes_reserved = num_bytes;
@@ -1030,7 +1031,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode_item);
write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
sizeof(struct btrfs_inode_item));
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
goto out;
@@ -1378,8 +1379,7 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
return -ENOMEM;
async_work->delayed_root = delayed_root;
- btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL,
- NULL);
+ btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL);
async_work->nr = nr;
btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
@@ -1760,8 +1760,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
}
/*
- * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
- *
+ * Read dir info stored in the delayed tree.
*/
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
struct list_head *ins_list)
@@ -1834,24 +1833,22 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_stack_inode_block_group(inode_item, 0);
btrfs_set_stack_timespec_sec(&inode_item->atime,
- inode->i_atime.tv_sec);
+ inode_get_atime_sec(inode));
btrfs_set_stack_timespec_nsec(&inode_item->atime,
- inode->i_atime.tv_nsec);
+ inode_get_atime_nsec(inode));
btrfs_set_stack_timespec_sec(&inode_item->mtime,
- inode->i_mtime.tv_sec);
+ inode_get_mtime_sec(inode));
btrfs_set_stack_timespec_nsec(&inode_item->mtime,
- inode->i_mtime.tv_nsec);
+ inode_get_mtime_nsec(inode));
btrfs_set_stack_timespec_sec(&inode_item->ctime,
- inode_get_ctime(inode).tv_sec);
+ inode_get_ctime_sec(inode));
btrfs_set_stack_timespec_nsec(&inode_item->ctime,
- inode_get_ctime(inode).tv_nsec);
+ inode_get_ctime_nsec(inode));
- btrfs_set_stack_timespec_sec(&inode_item->otime,
- BTRFS_I(inode)->i_otime.tv_sec);
- btrfs_set_stack_timespec_nsec(&inode_item->otime,
- BTRFS_I(inode)->i_otime.tv_nsec);
+ btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec);
+ btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec);
}
int btrfs_fill_inode(struct inode *inode, u32 *rdev)
@@ -1891,19 +1888,17 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
- inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
- inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
+ inode_set_atime(inode, btrfs_stack_timespec_sec(&inode_item->atime),
+ btrfs_stack_timespec_nsec(&inode_item->atime));
- inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
- inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);
+ inode_set_mtime(inode, btrfs_stack_timespec_sec(&inode_item->mtime),
+ btrfs_stack_timespec_nsec(&inode_item->mtime));
inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime),
btrfs_stack_timespec_nsec(&inode_item->ctime));
- BTRFS_I(inode)->i_otime.tv_sec =
- btrfs_stack_timespec_sec(&inode_item->otime);
- BTRFS_I(inode)->i_otime.tv_nsec =
- btrfs_stack_timespec_nsec(&inode_item->otime);
+ BTRFS_I(inode)->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime);
+ BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime);
inode->i_generation = BTRFS_I(inode)->generation;
BTRFS_I(inode)->index_cnt = (u64)-1;
@@ -1914,9 +1909,9 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
}
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_delayed_node *delayed_node;
int ret = 0;
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 1da213197f55..5cceb31bbd16 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -135,7 +135,6 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode);
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode);
int btrfs_fill_inode(struct inode *inode, u32 *rdev);
int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 9fe4ccca50a0..891ea2fa263c 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -57,16 +57,20 @@ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
* Release a ref head's reservation.
*
* @fs_info: the filesystem
- * @nr: number of items to drop
+ * @nr_refs: number of delayed refs to drop
+ * @nr_csums: number of csum items to drop
*
* Drops the delayed ref head's count from the delayed refs rsv and free any
* excess reservation we had.
*/
-void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums)
{
struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
- const u64 num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr);
- u64 released = 0;
+ u64 num_bytes;
+ u64 released;
+
+ num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr_refs);
+ num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
if (released)
@@ -77,26 +81,118 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
/*
* Adjust the size of the delayed refs rsv.
*
- * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
- * it'll calculate the additional size and add it to the delayed_refs_rsv.
+ * This is to be called anytime we may have adjusted trans->delayed_ref_updates
+ * or trans->delayed_ref_csum_deletions, it'll calculate the additional size and
+ * add it to the delayed_refs_rsv.
*/
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_block_rsv *local_rsv = &trans->delayed_rsv;
u64 num_bytes;
+ u64 reserved_bytes;
- if (!trans->delayed_ref_updates)
+ num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates);
+ num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info,
+ trans->delayed_ref_csum_deletions);
+
+ if (num_bytes == 0)
return;
- num_bytes = btrfs_calc_delayed_ref_bytes(fs_info,
- trans->delayed_ref_updates);
+ /*
+ * Try to take num_bytes from the transaction's local delayed reserve.
+ * If not possible, try to take as much as is available. If the local
+ * reserve doesn't have enough reserved space, the delayed refs reserve
+ * will be refilled next time btrfs_delayed_refs_rsv_refill() is called
+ * by someone or, if a transaction commit is triggered before that, the
+ * global block reserve will be used. We want to minimize using the
+ * global block reserve for cases we can account for in advance, to
+ * avoid exhausting it and reaching -ENOSPC during a transaction commit.
+ */
+ spin_lock(&local_rsv->lock);
+ reserved_bytes = min(num_bytes, local_rsv->reserved);
+ local_rsv->reserved -= reserved_bytes;
+ local_rsv->full = (local_rsv->reserved >= local_rsv->size);
+ spin_unlock(&local_rsv->lock);
spin_lock(&delayed_rsv->lock);
delayed_rsv->size += num_bytes;
- delayed_rsv->full = false;
+ delayed_rsv->reserved += reserved_bytes;
+ delayed_rsv->full = (delayed_rsv->reserved >= delayed_rsv->size);
spin_unlock(&delayed_rsv->lock);
trans->delayed_ref_updates = 0;
+ trans->delayed_ref_csum_deletions = 0;
+}
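A hedged arithmetic sketch of the split performed above, with made-up numbers; only the min() split and the size/reserved bookkeeping are taken from the code in this hunk:

        /* Illustration only: the queued updates need 192K, while the
         * transaction's local reserve still holds 128K. */
        u64 num_bytes = 192 * 1024;
        u64 local_reserved = 128 * 1024;
        u64 moved = min(num_bytes, local_reserved);     /* 128K migrates */

        /* delayed_rsv->size grows by the full 192K but delayed_rsv->reserved
         * only by the 128K that was moved, so the remaining 64K is either
         * refilled later via btrfs_delayed_refs_rsv_refill() or, at worst,
         * covered by the global reserve at commit time. */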
+
+/*
+ * Adjust the size of the delayed refs block reserve for 1 block group item
+ * insertion, used after allocating a block group.
+ */
+void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+
+ spin_lock(&delayed_rsv->lock);
+ /*
+ * Inserting a block group item does not require changing the free space
+ * tree, only the extent tree or the block group tree, so this is all we
+ * need.
+ */
+ delayed_rsv->size += btrfs_calc_insert_metadata_size(fs_info, 1);
+ delayed_rsv->full = false;
+ spin_unlock(&delayed_rsv->lock);
+}
+
+/*
+ * Adjust the size of the delayed refs block reserve to release space for 1
+ * block group item insertion.
+ */
+void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+ const u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+ u64 released;
+
+ released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL);
+ if (released > 0)
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+ 0, released, 0);
+}
+
+/*
+ * Adjust the size of the delayed refs block reserve for 1 block group item
+ * update.
+ */
+void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+
+ spin_lock(&delayed_rsv->lock);
+ /*
+ * Updating a block group item does not result in new nodes/leaves and
+ * does not require changing the free space tree, only the extent tree
+ * or the block group tree, so this is all we need.
+ */
+ delayed_rsv->size += btrfs_calc_metadata_size(fs_info, 1);
+ delayed_rsv->full = false;
+ spin_unlock(&delayed_rsv->lock);
+}
+
+/*
+ * Adjust the size of the delayed refs block reserve to release space for 1
+ * block group item update.
+ */
+void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+ const u64 num_bytes = btrfs_calc_metadata_size(fs_info, 1);
+ u64 released;
+
+ released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL);
+ if (released > 0)
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+ 0, released, 0);
}
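A hedged sketch of how the two block group item helpers above are expected to pair up; the insert_block_group_item() call named here is only a placeholder for whatever actually writes the item:

        /* When a new block group is created, pre-account for the future
         * insertion of its block group item: */
        btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info);

        /* ... and once the item has been inserted (or the block group is
         * discarded on error), release that accounting again: */
        ret = insert_block_group_item(trans, block_group);      /* placeholder */
        btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);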
/*
@@ -154,6 +250,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush)
{
struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_space_info *space_info = block_rsv->space_info;
u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1);
u64 num_bytes = 0;
u64 refilled_bytes;
@@ -170,7 +267,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
if (!num_bytes)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush);
if (ret)
return ret;
@@ -199,8 +296,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
spin_unlock(&block_rsv->lock);
if (to_free > 0)
- btrfs_space_info_free_bytes_may_use(fs_info, block_rsv->space_info,
- to_free);
+ btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free);
if (refilled_bytes > 0)
trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0,
@@ -422,7 +518,8 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
return 0;
}
-static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs,
+static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref)
{
@@ -433,9 +530,11 @@ static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs,
list_del(&ref->add_list);
btrfs_put_delayed_ref(ref);
atomic_dec(&delayed_refs->num_entries);
+ btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
}
-static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs,
+static bool merge_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref,
u64 seq)
@@ -464,10 +563,10 @@ static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs,
mod = -next->ref_mod;
}
- drop_delayed_ref(delayed_refs, head, next);
+ drop_delayed_ref(fs_info, delayed_refs, head, next);
ref->ref_mod += mod;
if (ref->ref_mod == 0) {
- drop_delayed_ref(delayed_refs, head, ref);
+ drop_delayed_ref(fs_info, delayed_refs, head, ref);
done = true;
} else {
/*
@@ -505,7 +604,7 @@ again:
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
if (seq && ref->seq >= seq)
continue;
- if (merge_ref(delayed_refs, head, ref, seq))
+ if (merge_ref(fs_info, delayed_refs, head, ref, seq))
goto again;
}
}
@@ -584,10 +683,11 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
* Return true if the ref was merged into an existing one (and therefore can be
* freed by the caller).
*/
-static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
+static bool insert_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *ref)
{
+ struct btrfs_delayed_ref_root *root = &trans->transaction->delayed_refs;
struct btrfs_delayed_ref_node *exist;
int mod;
@@ -598,6 +698,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
list_add_tail(&ref->add_list, &href->ref_add_list);
atomic_inc(&root->num_entries);
spin_unlock(&href->lock);
+ trans->delayed_ref_updates++;
return false;
}
@@ -626,7 +727,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
/* remove existing tail if its ref_mod is zero */
if (exist->ref_mod == 0)
- drop_delayed_ref(root, href, exist);
+ drop_delayed_ref(trans->fs_info, root, href, exist);
spin_unlock(&href->lock);
return true;
}
@@ -647,6 +748,15 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
BUG_ON(existing->is_data != update->is_data);
spin_lock(&existing->lock);
+
+ /*
+ * When freeing an extent, we may not know the owning root when we
+ * first create the head_ref. However, some deref before the last deref
+ * will know it, so we just need to update the head_ref accordingly.
+ */
+ if (!existing->owning_root)
+ existing->owning_root = update->owning_root;
+
if (update->must_insert_reserved) {
/* if the extent was freed and then
* reallocated before the delayed ref
@@ -656,6 +766,7 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
* Set it again here
*/
existing->must_insert_reserved = update->must_insert_reserved;
+ existing->owning_root = update->owning_root;
/*
* update the num_bytes so we make sure the accounting
@@ -695,6 +806,8 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
/*
* If we are going from a positive ref mod to a negative or vice
* versa, we need to make sure to adjust pending_csums accordingly.
+ * We reserve bytes for csum deletion when adding or updating a ref head,
+ * see add_delayed_ref_head() for more details.
*/
if (existing->is_data) {
u64 csum_leaves =
@@ -703,11 +816,11 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
delayed_refs->pending_csums -= existing->num_bytes;
- btrfs_delayed_refs_rsv_release(fs_info, csum_leaves);
+ btrfs_delayed_refs_rsv_release(fs_info, 0, csum_leaves);
}
if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
delayed_refs->pending_csums += existing->num_bytes;
- trans->delayed_ref_updates += csum_leaves;
+ trans->delayed_ref_csum_deletions += csum_leaves;
}
}
@@ -718,7 +831,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
struct btrfs_qgroup_extent_record *qrecord,
u64 bytenr, u64 num_bytes, u64 ref_root,
u64 reserved, int action, bool is_data,
- bool is_system)
+ bool is_system, u64 owning_root)
{
int count_mod = 1;
bool must_insert_reserved = false;
@@ -758,7 +871,9 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
head_ref->bytenr = bytenr;
head_ref->num_bytes = num_bytes;
head_ref->ref_mod = count_mod;
+ head_ref->reserved_bytes = reserved;
head_ref->must_insert_reserved = must_insert_reserved;
+ head_ref->owning_root = owning_root;
head_ref->is_data = is_data;
head_ref->is_system = is_system;
head_ref->ref_tree = RB_ROOT_CACHED;
@@ -819,16 +934,21 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
+ /*
+ * We reserve the amount of bytes needed to delete csums when
+ * adding the ref head and not when adding individual drop refs
+ * since the csum items are deleted only after running the last
+ * delayed drop ref (the data extent's ref count drops to 0).
+ */
if (head_ref->is_data && head_ref->ref_mod < 0) {
delayed_refs->pending_csums += head_ref->num_bytes;
- trans->delayed_ref_updates +=
+ trans->delayed_ref_csum_deletions +=
btrfs_csum_bytes_to_leaves(trans->fs_info,
head_ref->num_bytes);
}
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
atomic_inc(&delayed_refs->num_entries);
- trans->delayed_ref_updates++;
}
if (qrecord_inserted_ret)
*qrecord_inserted_ret = qrecord_inserted;
@@ -837,8 +957,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
}
/*
- * init_delayed_ref_common - Initialize the structure which represents a
- * modification to a an extent.
+ * Initialize the structure which represents a modification to an extent.
*
* @fs_info: Internal to the mounted filesystem mount structure.
*
@@ -909,7 +1028,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
u64 parent = generic_ref->parent;
u8 ref_type;
- is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID);
+ is_system = (generic_ref->tree_ref.ref_root == BTRFS_CHUNK_TREE_OBJECTID);
ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
@@ -922,8 +1041,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- !generic_ref->skip_qgroup) {
+ if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
@@ -938,15 +1056,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
ref_type = BTRFS_TREE_BLOCK_REF_KEY;
init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
- generic_ref->tree_ref.owning_root, action,
+ generic_ref->tree_ref.ref_root, action,
ref_type);
- ref->root = generic_ref->tree_ref.owning_root;
+ ref->root = generic_ref->tree_ref.ref_root;
ref->parent = parent;
ref->level = level;
init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
- generic_ref->tree_ref.owning_root, 0, action,
- false, is_system);
+ generic_ref->tree_ref.ref_root, 0, action,
+ false, is_system, generic_ref->owning_root);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
@@ -959,7 +1077,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
- merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
+ merged = insert_delayed_ref(trans, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
/*
@@ -998,7 +1116,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
u64 bytenr = generic_ref->bytenr;
u64 num_bytes = generic_ref->len;
u64 parent = generic_ref->parent;
- u64 ref_root = generic_ref->data_ref.owning_root;
+ u64 ref_root = generic_ref->data_ref.ref_root;
u64 owner = generic_ref->data_ref.ino;
u64 offset = generic_ref->data_ref.offset;
u8 ref_type;
@@ -1026,8 +1144,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- !generic_ref->skip_qgroup) {
+ if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
@@ -1038,7 +1155,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
}
init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
- reserved, action, true, false);
+ reserved, action, true, false, generic_ref->owning_root);
head_ref->extent_op = NULL;
delayed_refs = &trans->transaction->delayed_refs;
@@ -1051,7 +1168,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
- merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
+ merged = insert_delayed_ref(trans, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
/*
@@ -1084,7 +1201,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
return -ENOMEM;
init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
- BTRFS_UPDATE_DELAYED_HEAD, false, false);
+ BTRFS_UPDATE_DELAYED_HEAD, false, false, 0);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index fd9bf2b709c0..62d679d40f4f 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -9,10 +9,16 @@
#include <linux/refcount.h>
/* these are the possible values of struct btrfs_delayed_ref_node->action */
-#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
-#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
-#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
-#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
+enum btrfs_delayed_ref_action {
+ /* Add one backref to the tree */
+ BTRFS_ADD_DELAYED_REF = 1,
+ /* Delete one backref from the tree */
+ BTRFS_DROP_DELAYED_REF,
+ /* Record a full extent allocation */
+ BTRFS_ADD_DELAYED_EXTENT,
+ /* Not changing ref count on head ref */
+ BTRFS_UPDATE_DELAYED_HEAD,
+} __packed;
struct btrfs_delayed_ref_node {
struct rb_node ref_node;
@@ -105,6 +111,18 @@ struct btrfs_delayed_ref_head {
int ref_mod;
/*
+ * The root that triggered the allocation when must_insert_reserved is
+ * set to true.
+ */
+ u64 owning_root;
+
+ /*
+ * Track reserved bytes when setting must_insert_reserved. On success
+ * or cleanup, we will need to free the reservation.
+ */
+ u64 reserved_bytes;
+
+ /*
* when a new extent is allocated, it is just reserved in memory
* The actual extent isn't inserted into the extent allocation tree
* until the delayed ref is processed. must_insert_reserved is
@@ -117,6 +135,7 @@ struct btrfs_delayed_ref_head {
* the free has happened.
*/
bool must_insert_reserved;
+
bool is_data;
bool is_system;
bool processing;
@@ -183,13 +202,13 @@ enum btrfs_ref_type {
BTRFS_REF_DATA,
BTRFS_REF_METADATA,
BTRFS_REF_LAST,
-};
+} __packed;
struct btrfs_data_ref {
/* For EXTENT_DATA_REF */
- /* Original root this data extent belongs to */
- u64 owning_root;
+ /* Root which owns this data reference. */
+ u64 ref_root;
/* Inode which refers to this data extent */
u64 ino;
@@ -212,18 +231,18 @@ struct btrfs_tree_ref {
int level;
/*
- * Root which owns this tree block.
+ * Root which owns this tree block reference.
*
* For TREE_BLOCK_REF (skinny metadata, either inline or keyed)
*/
- u64 owning_root;
+ u64 ref_root;
/* For non-skinny metadata, no special member needed */
};
struct btrfs_ref {
enum btrfs_ref_type type;
- int action;
+ enum btrfs_delayed_ref_action action;
/*
* Whether this extent should go through qgroup record.
@@ -239,6 +258,7 @@ struct btrfs_ref {
#endif
u64 bytenr;
u64 len;
+ u64 owning_root;
/* Bytenr of the parent tree block */
u64 parent;
@@ -277,24 +297,37 @@ static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_in
return num_bytes;
}
+static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info *fs_info,
+ int num_csum_items)
+{
+ /*
+ * Deleting csum items does not result in new nodes/leaves and does not
+ * require changing the free space tree, only the csum tree, so this is
+ * all we need.
+ */
+ return btrfs_calc_metadata_size(fs_info, num_csum_items);
+}
+
static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
- int action, u64 bytenr, u64 len, u64 parent)
+ int action, u64 bytenr, u64 len,
+ u64 parent, u64 owning_root)
{
generic_ref->action = action;
generic_ref->bytenr = bytenr;
generic_ref->len = len;
generic_ref->parent = parent;
+ generic_ref->owning_root = owning_root;
}
-static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref,
- int level, u64 root, u64 mod_root, bool skip_qgroup)
+static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level,
+ u64 root, u64 mod_root, bool skip_qgroup)
{
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
/* If @real_root not set, use @root as fallback */
generic_ref->real_root = mod_root ?: root;
#endif
generic_ref->tree_ref.level = level;
- generic_ref->tree_ref.owning_root = root;
+ generic_ref->tree_ref.ref_root = root;
generic_ref->type = BTRFS_REF_METADATA;
if (skip_qgroup || !(is_fstree(root) &&
(!mod_root || is_fstree(mod_root))))
@@ -312,7 +345,7 @@ static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
/* If @real_root not set, use @root as fallback */
generic_ref->real_root = mod_root ?: ref_root;
#endif
- generic_ref->data_ref.owning_root = ref_root;
+ generic_ref->data_ref.ref_root = ref_root;
generic_ref->data_ref.ino = ino;
generic_ref->data_ref.offset = offset;
generic_ref->type = BTRFS_REF_DATA;
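A hedged sketch of building a metadata drop ref with the renamed fields: ref_root now lives in the tree-specific part while owning_root travels in the generic ref. All variables are placeholders, and btrfs_free_extent() is assumed to still take the transaction plus the ref:

        struct btrfs_ref ref = { 0 };
        int ret;

        /* Generic part: action, extent location/size, parent and owning root. */
        btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
                               fs_info->nodesize, parent, owning_root);
        /* Tree-specific part: level plus the root holding this reference. */
        btrfs_init_tree_ref(&ref, level, ref_root, mod_root, false);

        ret = btrfs_free_extent(trans, &ref);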
@@ -338,7 +371,6 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
{
- WARN_ON(refcount_read(&ref->refs) == 0);
if (refcount_dec_and_test(&ref->refs)) {
WARN_ON(!RB_EMPTY_NODE(&ref->ref_node));
switch (ref->type) {
@@ -402,8 +434,12 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
-void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums);
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
+void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info);
+void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info);
+void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
+void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush);
void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index fff22ed55c42..f9544fda38e9 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -17,7 +17,6 @@
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
-#include "check-integrity.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "zoned.h"
@@ -247,6 +246,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
+ struct bdev_handle *bdev_handle;
struct block_device *bdev;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
@@ -257,12 +257,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return -EINVAL;
}
- bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE,
- fs_info->bdev_holder, NULL);
- if (IS_ERR(bdev)) {
+ bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
+ fs_info->bdev_holder, NULL);
+ if (IS_ERR(bdev_handle)) {
btrfs_err(fs_info, "target device %s is invalid!", device_path);
- return PTR_ERR(bdev);
+ return PTR_ERR(bdev_handle);
}
+ bdev = bdev_handle->bdev;
if (!btrfs_check_device_zone_type(fs_info, bdev)) {
btrfs_err(fs_info,
@@ -313,9 +314,9 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->commit_bytes_used = device->bytes_used;
device->fs_info = fs_info;
device->bdev = bdev;
+ device->bdev_handle = bdev_handle;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- device->holder = fs_info->bdev_holder;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
device->fs_devices = fs_devices;
@@ -334,7 +335,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return 0;
error:
- blkdev_put(bdev, fs_info->bdev_holder);
+ bdev_release(bdev_handle);
return ret;
}
@@ -442,7 +443,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
dev_replace->item_needs_writeback = 0;
up_write(&dev_replace->rwsem);
- btrfs_mark_buffer_dirty(eb);
+ btrfs_mark_buffer_dirty(trans, eb);
out:
btrfs_free_path(path);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 082eb0e19598..9c07d5c3e5ad 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -38,7 +38,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
if (di)
return ERR_PTR(-EEXIST);
- btrfs_extend_item(path, data_size);
+ btrfs_extend_item(trans, path, data_size);
} else if (ret < 0)
return ERR_PTR(ret);
WARN_ON(ret > 0);
@@ -93,7 +93,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, name, name_ptr, name_len);
write_extent_buffer(leaf, data, data_ptr, data_len);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
return ret;
}
@@ -153,7 +153,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
name_ptr = (unsigned long)(dir_item + 1);
write_extent_buffer(leaf, name->name, name_ptr, name->len);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
second_insert:
/* FIXME, use some real flag for selecting the extra index */
@@ -439,7 +439,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_len - (ptr + sub_item_len - start));
- btrfs_truncate_item(path, item_len - sub_item_len, 1);
+ btrfs_truncate_item(trans, path, item_len - sub_item_len, 1);
}
return ret;
}
diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h
index aab4b7cc7fa0..e40a226373d7 100644
--- a/fs/btrfs/dir-item.h
+++ b/fs/btrfs/dir-item.h
@@ -3,6 +3,10 @@
#ifndef BTRFS_DIR_ITEM_H
#define BTRFS_DIR_ITEM_H
+#include <linux/crc32c.h>
+
+struct fscrypt_str;
+
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
const struct fscrypt_str *name);
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
@@ -39,4 +43,9 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
const char *name,
int name_len);
+static inline u64 btrfs_name_hash(const char *name, int len)
+{
+ return crc32c((u32)~1, name, len);
+}
+
#endif
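For illustration, a stand-alone user-space sketch of the same name hash; the bit-by-bit CRC32C below is only meant to mirror the kernel's crc32c() seeded with (u32)~1, and the sample name is arbitrary:

        #include <stdint.h>
        #include <stdio.h>
        #include <string.h>

        /* Reflected CRC32C (Castagnoli polynomial), no pre/post inversion,
         * matching how the kernel's crc32c() consumes the seed handed in by
         * btrfs_name_hash(). */
        static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
        {
                const uint8_t *p = buf;

                while (len--) {
                        crc ^= *p++;
                        for (int i = 0; i < 8; i++)
                                crc = (crc >> 1) ^ (0x82F63B78U & -(crc & 1));
                }
                return crc;
        }

        static uint64_t btrfs_name_hash(const char *name, int len)
        {
                return crc32c((uint32_t)~1, name, len);
        }

        int main(void)
        {
                const char *name = "example";

                /* The hash becomes the offset of the DIR_ITEM key for this name. */
                printf("%llu\n", (unsigned long long)btrfs_name_hash(name, strlen(name)));
                return 0;
        }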
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 68f60d50e1fd..62cb97f7c94f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -29,7 +29,6 @@
#include "tree-log.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
-#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "raid56.h"
@@ -245,6 +244,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
struct extent_buffer *eb = bbio->private;
struct btrfs_fs_info *fs_info = eb->fs_info;
u64 found_start = btrfs_header_bytenr(eb);
+ u64 last_trans;
u8 result[BTRFS_CSUM_SIZE];
int ret;
@@ -282,12 +282,12 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
* Also check the generation, the eb reached here must be newer than
* last committed. Or something seriously wrong happened.
*/
- if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) {
+ last_trans = btrfs_get_last_trans_committed(fs_info);
+ if (unlikely(btrfs_header_generation(eb) <= last_trans)) {
ret = -EUCLEAN;
btrfs_err(fs_info,
"block=%llu bad generation, have %llu expect > %llu",
- eb->start, btrfs_header_generation(eb),
- fs_info->last_trans_committed);
+ eb->start, btrfs_header_generation(eb), last_trans);
goto error;
}
write_extent_buffer(eb, result, 0, fs_info->csum_size);
@@ -318,9 +318,10 @@ static bool check_tree_block_fsid(struct extent_buffer *eb)
BTRFS_FSID_SIZE);
/*
- * alloc_fs_devices() copies the fsid into metadata_uuid if the
- * metadata_uuid is unset in the superblock, including for a seed device.
- * So, we can use fs_devices->metadata_uuid.
+ * alloc_fs_devices() copies the fsid into fs_devices::metadata_uuid.
+ * This is then overwritten by metadata_uuid if it is present in the
+ * device_list_add(). The same is true for a seed device as well. So use of
+ * fs_devices::metadata_uuid is appropriate here.
*/
if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0)
return false;
@@ -675,9 +676,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
refcount_set(&root->refs, 1);
atomic_set(&root->snapshot_force_cow, 0);
atomic_set(&root->nr_swapfiles, 0);
- root->log_transid = 0;
+ btrfs_set_root_log_transid(root, 0);
root->log_transid_committed = -1;
- root->last_log_commit = 0;
+ btrfs_set_root_last_log_commit(root, 0);
root->anon_dev = 0;
if (!dummy) {
extent_io_tree_init(fs_info, &root->dirty_log_pages,
@@ -859,7 +860,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
root->root_key.offset = 0;
leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
- BTRFS_NESTING_NORMAL);
+ 0, BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
@@ -867,7 +868,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
}
root->node = leaf;
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
root->commit_root = btrfs_root_node(root);
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
@@ -936,13 +937,13 @@ int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
*/
leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
- NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
+ NULL, 0, 0, 0, 0, BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf))
return PTR_ERR(leaf);
root->node = leaf;
- btrfs_mark_buffer_dirty(root->node);
+ btrfs_mark_buffer_dirty(trans, root->node);
btrfs_tree_unlock(root->node);
return 0;
@@ -1004,9 +1005,9 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(root->log_root);
root->log_root = log_root;
- root->log_transid = 0;
+ btrfs_set_root_log_transid(root, 0);
root->log_transid_committed = -1;
- root->last_log_commit = 0;
+ btrfs_set_root_last_log_commit(root, 0);
return 0;
}
@@ -1179,6 +1180,8 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
return btrfs_grab_root(fs_info->block_group_root);
case BTRFS_FREE_SPACE_TREE_OBJECTID:
return btrfs_grab_root(btrfs_global_root(fs_info, &key));
+ case BTRFS_RAID_STRIPE_TREE_OBJECTID:
+ return btrfs_grab_root(fs_info->stripe_root);
default:
return NULL;
}
@@ -1259,6 +1262,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
btrfs_put_root(fs_info->fs_root);
btrfs_put_root(fs_info->data_reloc_root);
btrfs_put_root(fs_info->block_group_root);
+ btrfs_put_root(fs_info->stripe_root);
btrfs_check_leaked_roots(fs_info);
btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->super_copy);
@@ -1402,7 +1406,8 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
}
/*
- * btrfs_get_fs_root_commit_root - return a root for the given objectid
+ * Return a root for the given objectid.
+ *
* @fs_info: the fs_info
* @objectid: the objectid we need to lookup
*
@@ -1699,11 +1704,11 @@ static void backup_super_roots(struct btrfs_fs_info *info)
}
/*
- * read_backup_root - Reads a backup root based on the passed priority. Prio 0
- * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
+ * Reads a backup root based on the passed priority. Prio 0 is the newest, prio
+ * 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots.
*
- * fs_info - filesystem whose backup roots need to be read
- * priority - priority of backup root required
+ * @fs_info: filesystem whose backup roots need to be read
+ * @priority: priority of backup root required
*
* Returns backup root index on success and -EINVAL otherwise.
*/
@@ -1803,6 +1808,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
free_root_extent_buffers(info->fs_root);
free_root_extent_buffers(info->data_reloc_root);
free_root_extent_buffers(info->block_group_root);
+ free_root_extent_buffers(info->stripe_root);
if (free_chunk_root)
free_root_extent_buffers(info->chunk_root);
}
@@ -2262,7 +2268,6 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
root = btrfs_read_tree_root(tree_root, &location);
if (!IS_ERR(root)) {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
fs_info->quota_root = root;
}
@@ -2279,6 +2284,20 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
fs_info->uuid_root = root;
}
+ if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) {
+ location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->stripe_root = root;
+ }
+ }
+
return 0;
out:
btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
@@ -2381,7 +2400,8 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
- if (memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) {
+ if (!fs_info->fs_devices->temp_fsid &&
+ memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
"superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
sb->fsid, fs_info->fs_devices->fsid);
@@ -2634,7 +2654,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
/* All successful */
fs_info->generation = btrfs_header_generation(tree_root->node);
- fs_info->last_trans_committed = fs_info->generation;
+ btrfs_set_last_trans_committed(fs_info, fs_info->generation);
fs_info->last_reloc_trans = 0;
/* Always begin writing backup roots after the one being used */
@@ -2735,9 +2755,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->ordered_root_lock);
btrfs_init_scrub(fs_info);
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- fs_info->check_integrity_print_mask = 0;
-#endif
btrfs_init_balance(fs_info);
btrfs_init_async_reclaim_work(fs_info);
@@ -3157,7 +3174,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
u32 nodesize;
u32 stripesize;
u64 generation;
- u64 features;
u16 csum_type;
struct btrfs_super_block *disk_super;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -3197,6 +3213,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
+ btrfs_info(fs_info, "first mount of filesystem %pU", disk_super->fsid);
/*
* Verify the type first, if that or the checksum value are
* corrupted, we'll find out
@@ -3239,15 +3256,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
disk_super = fs_info->super_copy;
-
- features = btrfs_super_flags(disk_super);
- if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
- features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
- btrfs_set_super_flags(disk_super, features);
- btrfs_info(fs_info,
- "found metadata UUID change in progress flag, clearing");
- }
-
memcpy(fs_info->super_for_commit, fs_info->super_copy,
sizeof(*fs_info->super_for_commit));
@@ -3509,18 +3517,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
"auto enabling async discard");
}
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
- ret = btrfsic_mount(fs_info, fs_devices,
- btrfs_test_opt(fs_info,
- CHECK_INTEGRITY_DATA) ? 1 : 0,
- fs_info->check_integrity_print_mask);
- if (ret)
- btrfs_warn(fs_info,
- "failed to initialize integrity check module: %d",
- ret);
- }
-#endif
ret = btrfs_read_qgroup_config(fs_info);
if (ret)
goto fail_trans_kthread;
@@ -3820,8 +3816,6 @@ static int write_dev_supers(struct btrfs_device *device,
*/
if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
bio->bi_opf |= REQ_FUA;
-
- btrfsic_check_bio(bio);
submit_bio(bio);
if (btrfs_advance_sb_log(device, i))
@@ -3917,28 +3911,11 @@ static void write_dev_flush(struct btrfs_device *device)
device->last_flush_error = BLK_STS_OK;
-#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- /*
- * When a disk has write caching disabled, we skip submission of a bio
- * with flush and sync requests before writing the superblock, since
- * it's not needed. However when the integrity checker is enabled, this
- * results in reports that there are metadata blocks referred by a
- * superblock that were not properly flushed. So don't skip the bio
- * submission only when the integrity checker is enabled for the sake
- * of simplicity, since this is a debug tool and not meant for use in
- * non-debug builds.
- */
- if (!bdev_write_cache(device->bdev))
- return;
-#endif
-
bio_init(bio, device->bdev, NULL, 0,
REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
bio->bi_end_io = btrfs_end_empty_barrier;
init_completion(&device->flush_wait);
bio->bi_private = &device->flush_wait;
-
- btrfsic_check_bio(bio);
submit_bio(bio);
set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
}
@@ -4414,16 +4391,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
iput(fs_info->btree_inode);
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
- btrfsic_unmount(fs_info->fs_devices);
-#endif
-
btrfs_mapping_tree_free(&fs_info->mapping_tree);
btrfs_close_devices(fs_info->fs_devices);
}
-void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
+void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
+ struct extent_buffer *buf)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
u64 transid = btrfs_header_generation(buf);
@@ -4437,21 +4410,16 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
return;
#endif
+ /* This is an active transaction (its state < TRANS_STATE_UNBLOCKED). */
+ ASSERT(trans->transid == fs_info->generation);
btrfs_assert_tree_write_locked(buf);
- if (transid != fs_info->generation)
- WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
- buf->start, transid, fs_info->generation);
- set_extent_buffer_dirty(buf);
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- /*
- * btrfs_check_leaf() won't check item data if we don't have WRITTEN
- * set, so this will only validate the basic structure of the items.
- */
- if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(buf)) {
- btrfs_print_leaf(buf);
- ASSERT(0);
+ if (unlikely(transid != fs_info->generation)) {
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ btrfs_crit(fs_info,
+"dirty buffer transid mismatch, logical %llu found transid %llu running transid %llu",
+ buf->start, transid, fs_info->generation);
}
-#endif
+ set_extent_buffer_dirty(buf);
}
static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
@@ -4611,6 +4579,7 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
list_del(&ref->add_list);
atomic_dec(&delayed_refs->num_entries);
btrfs_put_delayed_ref(ref);
+ btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
}
if (head->must_insert_reserved)
pin_bytes = true;
@@ -4808,7 +4777,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
spin_unlock(&cur_trans->dirty_bgs_lock);
btrfs_put_block_group(cache);
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
spin_lock(&cur_trans->dirty_bgs_lock);
}
spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -4830,6 +4799,32 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
}
}
+static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *gang[8];
+ int i;
+ int ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ while (1) {
+ ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
+ (void **)gang, 0,
+ ARRAY_SIZE(gang),
+ BTRFS_ROOT_TRANS_TAG);
+ if (ret == 0)
+ break;
+ for (i = 0; i < ret; i++) {
+ struct btrfs_root *root = gang[i];
+
+ btrfs_qgroup_free_meta_all_pertrans(root);
+ radix_tree_tag_clear(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ }
+ }
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+}
+
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
struct btrfs_fs_info *fs_info)
{
@@ -4858,6 +4853,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
EXTENT_DIRTY);
btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
+ btrfs_free_all_qgroup_pertrans(fs_info);
+
cur_trans->state = TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 02b645744a82..50dab8f639dc 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -104,7 +104,8 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
}
void btrfs_put_root(struct btrfs_root *root);
-void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
+ struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
int btrfs_read_extent_buffer(struct extent_buffer *buf,
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index ff8e117a1ace..ea149be28dff 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -105,32 +105,40 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info,
lockdep_set_class(&tree->lock, &file_extent_tree_class);
}
+/*
+ * Empty an io tree, removing and freeing every extent state record from the
+ * tree. This should be called once we are sure no other task can access the
+ * tree anymore, so no tree updates happen after we empty the tree and there
+ * aren't any waiters on any extent state record (EXTENT_LOCKED bit is never
+ * set on any extent state when calling this function).
+ */
void extent_io_tree_release(struct extent_io_tree *tree)
{
+ struct rb_root root;
+ struct extent_state *state;
+ struct extent_state *tmp;
+
spin_lock(&tree->lock);
- /*
- * Do a single barrier for the waitqueue_active check here, the state
- * of the waitqueue should not change once extent_io_tree_release is
- * called.
- */
- smp_mb();
- while (!RB_EMPTY_ROOT(&tree->state)) {
- struct rb_node *node;
- struct extent_state *state;
-
- node = rb_first(&tree->state);
- state = rb_entry(node, struct extent_state, rb_node);
- rb_erase(&state->rb_node, &tree->state);
+ root = tree->state;
+ tree->state = RB_ROOT;
+ rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) {
+ /* Clear node to keep free_extent_state() happy. */
RB_CLEAR_NODE(&state->rb_node);
+ ASSERT(!(state->state & EXTENT_LOCKED));
/*
- * btree io trees aren't supposed to have tasks waiting for
- * changes in the flags of extent states ever.
+ * No need for a memory barrier here, as we are holding the tree
+ * lock and we only change the waitqueue while holding that lock
+ * (see wait_extent_bit()).
*/
ASSERT(!waitqueue_active(&state->wq));
free_extent_state(state);
-
cond_resched_lock(&tree->lock);
}
+ /*
+ * Should still be empty even after a reschedule, no other task should
+ * be accessing the tree anymore.
+ */
+ ASSERT(RB_EMPTY_ROOT(&tree->state));
spin_unlock(&tree->lock);
}
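The detach-then-iterate pattern above is generic; a hedged kernel-style sketch for any rb-tree backed cache (the cache structure and entry type here are made up):

        struct rb_root root;
        struct cache_entry *entry, *tmp;        /* hypothetical entry type */

        /* Detach the whole tree under the lock, then free the entries with a
         * postorder walk, avoiding per-node rebalancing via rb_erase(). */
        spin_lock(&cache->lock);
        root = cache->root;
        cache->root = RB_ROOT;
        spin_unlock(&cache->lock);

        rbtree_postorder_for_each_entry_safe(entry, tmp, &root, rb_node) {
                RB_CLEAR_NODE(&entry->rb_node);
                kfree(entry);
        }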
@@ -327,6 +335,36 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
"locking error: extent tree was modified by another thread while locked");
}
+static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state)
+{
+ struct extent_state *prev;
+
+ prev = prev_state(state);
+ if (prev && prev->end == state->start - 1 && prev->state == state->state) {
+ if (tree->inode)
+ btrfs_merge_delalloc_extent(tree->inode, state, prev);
+ state->start = prev->start;
+ rb_erase(&prev->rb_node, &tree->state);
+ RB_CLEAR_NODE(&prev->rb_node);
+ free_extent_state(prev);
+ }
+}
+
+static void merge_next_state(struct extent_io_tree *tree, struct extent_state *state)
+{
+ struct extent_state *next;
+
+ next = next_state(state);
+ if (next && next->start == state->end + 1 && next->state == state->state) {
+ if (tree->inode)
+ btrfs_merge_delalloc_extent(tree->inode, state, next);
+ state->end = next->end;
+ rb_erase(&next->rb_node, &tree->state);
+ RB_CLEAR_NODE(&next->rb_node);
+ free_extent_state(next);
+ }
+}
+
/*
* Utility function to look for merge candidates inside a given range. Any
* extents with matching state are merged together into a single extent in the
@@ -338,31 +376,11 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
*/
static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
{
- struct extent_state *other;
-
if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
return;
- other = prev_state(state);
- if (other && other->end == state->start - 1 &&
- other->state == state->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode, state, other);
- state->start = other->start;
- rb_erase(&other->rb_node, &tree->state);
- RB_CLEAR_NODE(&other->rb_node);
- free_extent_state(other);
- }
- other = next_state(state);
- if (other && other->start == state->end + 1 &&
- other->state == state->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode, state, other);
- state->end = other->end;
- rb_erase(&other->rb_node, &tree->state);
- RB_CLEAR_NODE(&other->rb_node);
- free_extent_state(other);
- }
+ merge_prev_state(tree, state);
+ merge_next_state(tree, state);
}
static void set_state_bits(struct extent_io_tree *tree,
@@ -384,19 +402,27 @@ static void set_state_bits(struct extent_io_tree *tree,
* Insert an extent_state struct into the tree. 'bits' are set on the
* struct before it is inserted.
*
- * This may return -EEXIST if the extent is already there, in which case the
- * state struct is freed.
+ * Returns a pointer to the struct extent_state record containing the range
+ * requested for insertion, which may be the same as the given struct or it
+ * may be an existing record in the tree that was expanded to accommodate the
+ * requested range. In case of an extent_state different from the one that was
+ * given, the latter can be freed or reused by the caller.
+ *
+ * On error it returns an error pointer.
*
* The tree lock is not taken internally. This is a utility function and
* probably isn't what you want to call (see set/clear_extent_bit).
*/
-static int insert_state(struct extent_io_tree *tree,
- struct extent_state *state,
- u32 bits, struct extent_changeset *changeset)
+static struct extent_state *insert_state(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits,
+ struct extent_changeset *changeset)
{
struct rb_node **node;
struct rb_node *parent = NULL;
- const u64 end = state->end;
+ const u64 start = state->start - 1;
+ const u64 end = state->end + 1;
+ const bool try_merge = !(bits & (EXTENT_LOCKED | EXTENT_BOUNDARY));
set_state_bits(tree, state, bits, changeset);
@@ -407,23 +433,42 @@ static int insert_state(struct extent_io_tree *tree,
parent = *node;
entry = rb_entry(parent, struct extent_state, rb_node);
- if (end < entry->start) {
+ if (state->end < entry->start) {
+ if (try_merge && end == entry->start &&
+ state->state == entry->state) {
+ if (tree->inode)
+ btrfs_merge_delalloc_extent(tree->inode,
+ state, entry);
+ entry->start = state->start;
+ merge_prev_state(tree, entry);
+ state->state = 0;
+ return entry;
+ }
node = &(*node)->rb_left;
- } else if (end > entry->end) {
+ } else if (state->end > entry->end) {
+ if (try_merge && entry->end == start &&
+ state->state == entry->state) {
+ if (tree->inode)
+ btrfs_merge_delalloc_extent(tree->inode,
+ state, entry);
+ entry->end = state->end;
+ merge_next_state(tree, entry);
+ state->state = 0;
+ return entry;
+ }
node = &(*node)->rb_right;
} else {
btrfs_err(tree->fs_info,
"found node %llu %llu on insert of %llu %llu",
- entry->start, entry->end, state->start, end);
- return -EEXIST;
+ entry->start, entry->end, state->start, state->end);
+ return ERR_PTR(-EEXIST);
}
}
rb_link_node(&state->rb_node, parent, node);
rb_insert_color(&state->rb_node, &tree->state);
- merge_state(tree, state);
- return 0;
+ return state;
}
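An illustrative caller-side sketch of this return contract. The wrapper name insert_prealloc_range() is hypothetical; insert_state(), cache_state() and free_extent_state() are the helpers from this file, and the tree lock is assumed to be held as usual:

/*
 * Sketch only: how a caller consumes the insert_state() return value.
 * 'prealloc' is a preallocated extent_state owned by the caller.
 */
static int insert_prealloc_range(struct extent_io_tree *tree,
				 struct extent_state *prealloc,
				 u64 start, u64 end, u32 bits,
				 struct extent_state **cached)
{
	struct extent_state *inserted;

	prealloc->start = start;
	prealloc->end = end;
	inserted = insert_state(tree, prealloc, bits, NULL);
	if (IS_ERR(inserted))
		return PTR_ERR(inserted);

	/* Cache whichever record ended up covering the range. */
	cache_state(inserted, cached);

	/*
	 * If insert_state() merged into an existing record, the prealloc
	 * was never linked into the tree and may be freed or reused.
	 */
	if (inserted != prealloc)
		free_extent_state(prealloc);
	return 0;
}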
/*
@@ -708,26 +753,13 @@ out:
}
-static void wait_on_state(struct extent_io_tree *tree,
- struct extent_state *state)
- __releases(tree->lock)
- __acquires(tree->lock)
-{
- DEFINE_WAIT(wait);
- prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock(&tree->lock);
- schedule();
- spin_lock(&tree->lock);
- finish_wait(&state->wq, &wait);
-}
-
/*
* Wait for one or more bits to clear on a range in the state tree.
* The range [start, end] is inclusive.
* The tree lock is taken by this function
*/
-void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- struct extent_state **cached_state)
+static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state)
{
struct extent_state *state;
@@ -758,9 +790,15 @@ process_node:
goto out;
if (state->state & bits) {
+ DEFINE_WAIT(wait);
+
start = state->start;
refcount_inc(&state->refs);
- wait_on_state(tree, state);
+ prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&tree->lock);
+ schedule();
+ spin_lock(&tree->lock);
+ finish_wait(&state->wq, &wait);
free_extent_state(state);
goto again;
}
@@ -847,10 +885,19 @@ bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
if (state->end == start - 1 && extent_state_in_tree(state)) {
while ((state = next_state(state)) != NULL) {
if (state->state & bits)
- goto got_it;
+ break;
}
+ /*
+ * If we found the next extent state, clear cached_state
+ * so that we can cache it below and avoid future calls
+ * going over the same extent state again. If we haven't
+ * found any, clear it as well since it's now useless.
+ */
free_extent_state(*cached_state);
*cached_state = NULL;
+ if (state)
+ goto got_it;
goto out;
}
free_extent_state(*cached_state);
@@ -1133,6 +1180,8 @@ hit_next:
*/
if (state->start > start) {
u64 this_end;
+ struct extent_state *inserted_state;
+
if (end < last_start)
this_end = end;
else
@@ -1148,12 +1197,15 @@ hit_next:
*/
prealloc->start = start;
prealloc->end = this_end;
- err = insert_state(tree, prealloc, bits, changeset);
- if (err)
+ inserted_state = insert_state(tree, prealloc, bits, changeset);
+ if (IS_ERR(inserted_state)) {
+ err = PTR_ERR(inserted_state);
extent_io_tree_panic(tree, err);
+ }
- cache_state(prealloc, cached_state);
- prealloc = NULL;
+ cache_state(inserted_state, cached_state);
+ if (inserted_state == prealloc)
+ prealloc = NULL;
start = this_end + 1;
goto search_again;
}
@@ -1356,6 +1408,8 @@ hit_next:
*/
if (state->start > start) {
u64 this_end;
+ struct extent_state *inserted_state;
+
if (end < last_start)
this_end = end;
else
@@ -1373,11 +1427,14 @@ hit_next:
*/
prealloc->start = start;
prealloc->end = this_end;
- err = insert_state(tree, prealloc, bits, NULL);
- if (err)
+ inserted_state = insert_state(tree, prealloc, bits, NULL);
+ if (IS_ERR(inserted_state)) {
+ err = PTR_ERR(inserted_state);
extent_io_tree_panic(tree, err);
- cache_state(prealloc, cached_state);
- prealloc = NULL;
+ }
+ cache_state(inserted_state, cached_state);
+ if (inserted_state == prealloc)
+ prealloc = NULL;
start = this_end + 1;
goto search_again;
}
@@ -1640,15 +1697,46 @@ search:
}
/*
- * Search a range in the state tree for a given mask. If 'filled' == 1, this
- * returns 1 only if every extent in the tree has the bits set. Otherwise, 1
- * is returned if any bit in the range is found set.
+ * Check if the single @bit exists in the given range.
+ */
+bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit)
+{
+ struct extent_state *state = NULL;
+ bool bitset = false;
+
+ ASSERT(is_power_of_2(bit));
+
+ spin_lock(&tree->lock);
+ state = tree_search(tree, start);
+ while (state && start <= end) {
+ if (state->start > end)
+ break;
+
+ if (state->state & bit) {
+ bitset = true;
+ break;
+ }
+
+ /* If state->end is (u64)-1, start will overflow to 0 */
+ start = state->end + 1;
+ if (start > end || start == 0)
+ break;
+ state = next_state(state);
+ }
+ spin_unlock(&tree->lock);
+ return bitset;
+}
+
+/*
+ * Check if the single @bit is set across the whole range [@start, @end].
*/
-int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int filled, struct extent_state *cached)
+bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
+ struct extent_state *cached)
{
struct extent_state *state = NULL;
- int bitset = 0;
+ bool bitset = true;
+
+ ASSERT(is_power_of_2(bit));
spin_lock(&tree->lock);
if (cached && extent_state_in_tree(cached) && cached->start <= start &&
@@ -1657,35 +1745,35 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
else
state = tree_search(tree, start);
while (state && start <= end) {
- if (filled && state->start > start) {
- bitset = 0;
+ if (state->start > start) {
+ bitset = false;
break;
}
if (state->start > end)
break;
- if (state->state & bits) {
- bitset = 1;
- if (!filled)
- break;
- } else if (filled) {
- bitset = 0;
+ if ((state->state & bit) == 0) {
+ bitset = false;
break;
}
if (state->end == (u64)-1)
break;
+ /*
+ * Stop if this was the last entry (state->end is (u64)-1, so start
+ * overflows to 0) or the next entry starts after the range.
+ */
start = state->end + 1;
- if (start > end)
+ if (start > end || start == 0)
break;
state = next_state(state);
}
/* We ran out of states and were still inside of our range. */
- if (filled && !state)
- bitset = 0;
+ if (!state)
+ bitset = false;
spin_unlock(&tree->lock);
return bitset;
}
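A brief usage sketch contrasting the two predicates; the wrapper names are hypothetical, while the bits match the extent_io.c call sites updated further below:

/* Returns true only if every byte of [start, end] has EXTENT_DELALLOC set. */
static bool range_fully_delalloc(struct extent_io_tree *tree, u64 start,
				 u64 end, struct extent_state *cached)
{
	return test_range_bit(tree, start, end, EXTENT_DELALLOC, cached);
}

/* Returns true if any part of [start, end] has EXTENT_LOCKED set. */
static bool range_has_locked(struct extent_io_tree *tree, u64 start, u64 end)
{
	return test_range_bit_exists(tree, start, end, EXTENT_LOCKED);
}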
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 28c23a23d121..5602b0137fcd 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -131,8 +131,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
struct extent_state **cached_state);
void free_extent_state(struct extent_state *state);
-int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int filled, struct extent_state *cached_state);
+bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
+ struct extent_state *cached_state);
+bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit);
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset);
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -192,7 +193,5 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
-void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- struct extent_state **cached_state);
#endif /* BTRFS_EXTENT_IO_TREE_H */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fc313fce5bbd..01423670bc8a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -42,14 +42,16 @@
#include "file-item.h"
#include "orphan.h"
#include "tree-checker.h"
+#include "raid-stripe-tree.h"
#undef SCRAMBLE_DELAYED_REFS
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
- u64 owner_offset, int refs_to_drop,
+ u64 owner_offset,
struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
@@ -57,7 +59,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 flags, u64 owner, u64 offset,
- struct btrfs_key *ins, int ref_mod);
+ struct btrfs_key *ins, int ref_mod, u64 oref_root);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op);
@@ -100,7 +102,8 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
*/
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 offset, int metadata, u64 *refs, u64 *flags)
+ u64 offset, int metadata, u64 *refs, u64 *flags,
+ u64 *owning_root)
{
struct btrfs_root *extent_root;
struct btrfs_delayed_ref_head *head;
@@ -112,6 +115,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
u32 item_size;
u64 num_refs;
u64 extent_flags;
+ u64 owner = 0;
int ret;
/*
@@ -165,6 +169,8 @@ search_again:
struct btrfs_extent_item);
num_refs = btrfs_extent_refs(leaf, ei);
extent_flags = btrfs_extent_flags(leaf, ei);
+ owner = btrfs_get_extent_owner_root(fs_info, leaf,
+ path->slots[0]);
} else {
ret = -EUCLEAN;
btrfs_err(fs_info,
@@ -224,6 +230,8 @@ out:
*refs = num_refs;
if (flags)
*flags = extent_flags;
+ if (owning_root)
+ *owning_root = owner;
out_free:
btrfs_free_path(path);
return ret;
@@ -344,9 +352,15 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
struct btrfs_extent_inline_ref *iref,
enum btrfs_inline_ref_type is_data)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
int type = btrfs_extent_inline_ref_type(eb, iref);
u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
+ if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
+ ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ return type;
+ }
+
if (type == BTRFS_TREE_BLOCK_REF_KEY ||
type == BTRFS_SHARED_BLOCK_REF_KEY ||
type == BTRFS_SHARED_DATA_REF_KEY ||
@@ -355,26 +369,25 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
if (type == BTRFS_TREE_BLOCK_REF_KEY)
return type;
if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
- ASSERT(eb->fs_info);
+ ASSERT(fs_info);
/*
* Every shared one has parent tree block,
* which must be aligned to sector size.
*/
- if (offset &&
- IS_ALIGNED(offset, eb->fs_info->sectorsize))
+ if (offset && IS_ALIGNED(offset, fs_info->sectorsize))
return type;
}
} else if (is_data == BTRFS_REF_TYPE_DATA) {
if (type == BTRFS_EXTENT_DATA_REF_KEY)
return type;
if (type == BTRFS_SHARED_DATA_REF_KEY) {
- ASSERT(eb->fs_info);
+ ASSERT(fs_info);
/*
* Every shared one has parent tree block,
* which must be aligned to sector size.
*/
if (offset &&
- IS_ALIGNED(offset, eb->fs_info->sectorsize))
+ IS_ALIGNED(offset, fs_info->sectorsize))
return type;
}
} else {
@@ -385,7 +398,7 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
WARN_ON(1);
btrfs_print_leaf(eb);
- btrfs_err(eb->fs_info,
+ btrfs_err(fs_info,
"eb %llu iref 0x%lx invalid extent inline ref type %d",
eb->start, (unsigned long)iref, type);
@@ -399,11 +412,11 @@ u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
__le64 lenum;
lenum = cpu_to_le64(root_objectid);
- high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
+ high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(owner);
- low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(offset);
- low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
return ((u64)high_crc << 31) ^ (u64)low_crc;
}
@@ -575,7 +588,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
}
}
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
ret = 0;
fail:
btrfs_release_path(path);
@@ -623,7 +636,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
}
return ret;
}
@@ -789,7 +802,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
int type;
int want;
int ret;
- int err = 0;
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
int needed;
@@ -816,10 +828,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
again:
ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
/*
* We may be a newly converted file system which still has the old fat
@@ -846,7 +856,7 @@ again:
}
if (ret && !insert) {
- err = -ENOENT;
+ ret = -ENOENT;
goto out;
} else if (WARN_ON(ret)) {
btrfs_print_leaf(path->nodes[0]);
@@ -854,18 +864,18 @@ again:
"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
bytenr, num_bytes, parent, root_objectid, owner,
offset);
- err = -EIO;
+ ret = -EUCLEAN;
goto out;
}
leaf = path->nodes[0];
item_size = btrfs_item_size(leaf, path->slots[0]);
if (unlikely(item_size < sizeof(*ei))) {
- err = -EUCLEAN;
+ ret = -EUCLEAN;
btrfs_err(fs_info,
"unexpected extent item size, has %llu expect >= %zu",
item_size, sizeof(*ei));
- btrfs_abort_transaction(trans, err);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -885,22 +895,17 @@ again:
else
needed = BTRFS_REF_TYPE_BLOCK;
- err = -ENOENT;
- while (1) {
- if (ptr >= end) {
- if (ptr > end) {
- err = -EUCLEAN;
- btrfs_print_leaf(path->nodes[0]);
- btrfs_crit(fs_info,
-"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
- path->slots[0], root_objectid, owner, offset, parent);
- }
- break;
- }
+ ret = -ENOENT;
+ while (ptr < end) {
iref = (struct btrfs_extent_inline_ref *)ptr;
type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
+ if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
+ ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ ptr += btrfs_extent_inline_ref_size(type);
+ continue;
+ }
if (type == BTRFS_REF_TYPE_INVALID) {
- err = -EUCLEAN;
+ ret = -EUCLEAN;
goto out;
}
@@ -916,7 +921,7 @@ again:
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
if (match_extent_data_ref(leaf, dref, root_objectid,
owner, offset)) {
- err = 0;
+ ret = 0;
break;
}
if (hash_extent_data_ref_item(leaf, dref) <
@@ -927,14 +932,14 @@ again:
ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
if (parent > 0) {
if (parent == ref_offset) {
- err = 0;
+ ret = 0;
break;
}
if (ref_offset < parent)
break;
} else {
if (root_objectid == ref_offset) {
- err = 0;
+ ret = 0;
break;
}
if (ref_offset < root_objectid)
@@ -943,10 +948,20 @@ again:
}
ptr += btrfs_extent_inline_ref_size(type);
}
- if (err == -ENOENT && insert) {
+
+ if (unlikely(ptr > end)) {
+ ret = -EUCLEAN;
+ btrfs_print_leaf(path->nodes[0]);
+ btrfs_crit(fs_info,
+"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
+ path->slots[0], root_objectid, owner, offset, parent);
+ goto out;
+ }
+
+ if (ret == -ENOENT && insert) {
if (item_size + extra_size >=
BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
/*
@@ -958,7 +973,7 @@ again:
if (find_next_key(path, 0, &key) == 0 &&
key.objectid == bytenr &&
key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
}
@@ -969,14 +984,14 @@ out:
path->search_for_extension = 0;
btrfs_unlock_up_safe(path, 1);
}
- return err;
+ return ret;
}
/*
* helper to add new inline back ref
*/
static noinline_for_stack
-void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
+void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
u64 parent, u64 root_objectid,
@@ -999,7 +1014,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
type = extent_ref_type(parent, owner);
size = btrfs_extent_inline_ref_size(type);
- btrfs_extend_item(path, size);
+ btrfs_extend_item(trans, path, size);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, ei);
@@ -1033,7 +1048,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
} else {
btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
}
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
}
static int lookup_extent_backref(struct btrfs_trans_handle *trans,
@@ -1066,7 +1081,9 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans,
/*
* helper to update/remove inline back ref
*/
-static noinline_for_stack int update_inline_extent_backref(struct btrfs_path *path,
+static noinline_for_stack int update_inline_extent_backref(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
int refs_to_mod,
struct btrfs_delayed_extent_op *extent_op)
@@ -1174,9 +1191,9 @@ static noinline_for_stack int update_inline_extent_backref(struct btrfs_path *pa
memmove_extent_buffer(leaf, ptr, ptr + size,
end - ptr - size);
item_size -= size;
- btrfs_truncate_item(path, item_size, 1);
+ btrfs_truncate_item(trans, path, item_size, 1);
}
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
return 0;
}
@@ -1206,9 +1223,10 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
bytenr, num_bytes, root_objectid, path->slots[0]);
return -EUCLEAN;
}
- ret = update_inline_extent_backref(path, iref, refs_to_add, extent_op);
+ ret = update_inline_extent_backref(trans, path, iref,
+ refs_to_add, extent_op);
} else if (ret == -ENOENT) {
- setup_inline_extent_backref(trans->fs_info, path, iref, parent,
+ setup_inline_extent_backref(trans, path, iref, parent,
root_objectid, owner, offset,
refs_to_add, extent_op);
ret = 0;
@@ -1226,7 +1244,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
BUG_ON(!is_data && refs_to_drop != 1);
if (iref)
- ret = update_inline_extent_backref(path, iref, -refs_to_drop, NULL);
+ ret = update_inline_extent_backref(trans, path, iref,
+ -refs_to_drop, NULL);
else if (is_data)
ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
else
@@ -1422,7 +1441,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
generic_ref->action);
BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
- generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);
+ generic_ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID);
if (generic_ref->type == BTRFS_REF_METADATA)
ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
@@ -1435,7 +1454,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
}
/*
- * __btrfs_inc_extent_ref - insert backreference for a given extent
+ * Insert backreference for a given extent.
*
* The counterpart is in __btrfs_free_extent(), with examples and more details
* how it works.
@@ -1465,8 +1484,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
* always passed as 0. For data extents it is the fileoffset
* this extent belongs to.
*
- * @refs_to_add Number of references to add
- *
* @extent_op Pointer to a structure, holding information necessary when
* updating a tree block's flags
*
@@ -1474,7 +1491,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int refs_to_add,
+ u64 owner, u64 offset,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_path *path;
@@ -1484,6 +1501,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
u64 bytenr = node->bytenr;
u64 num_bytes = node->num_bytes;
u64 refs;
+ int refs_to_add = node->ref_mod;
int ret;
path = btrfs_alloc_path();
@@ -1510,7 +1528,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, item);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
/* now insert the actual backref */
@@ -1529,45 +1547,72 @@ out:
return ret;
}
+static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_head *href)
+{
+ u64 root = href->owning_root;
+
+ /*
+ * Don't check must_insert_reserved, as this is called from contexts
+ * where it has already been unset.
+ */
+ if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE ||
+ !href->is_data || !is_fstree(root))
+ return;
+
+ btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes,
+ BTRFS_QGROUP_RSV_DATA);
+}
+
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
int ret = 0;
struct btrfs_delayed_data_ref *ref;
- struct btrfs_key ins;
u64 parent = 0;
- u64 ref_root = 0;
u64 flags = 0;
- ins.objectid = node->bytenr;
- ins.offset = node->num_bytes;
- ins.type = BTRFS_EXTENT_ITEM_KEY;
-
ref = btrfs_delayed_node_to_data_ref(node);
trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
if (node->type == BTRFS_SHARED_DATA_REF_KEY)
parent = ref->parent;
- ref_root = ref->root;
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+ struct btrfs_key key;
+ struct btrfs_squota_delta delta = {
+ .root = href->owning_root,
+ .num_bytes = node->num_bytes,
+ .is_data = true,
+ .is_inc = true,
+ .generation = trans->transid,
+ };
+
if (extent_op)
flags |= extent_op->flags_to_set;
- ret = alloc_reserved_file_extent(trans, parent, ref_root,
+
+ key.objectid = node->bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = node->num_bytes;
+
+ ret = alloc_reserved_file_extent(trans, parent, ref->root,
flags, ref->objectid,
- ref->offset, &ins,
- node->ref_mod);
+ ref->offset, &key,
+ node->ref_mod, href->owning_root);
+ free_head_ref_squota_rsv(trans->fs_info, href);
+ if (!ret)
+ ret = btrfs_record_squota_delta(trans->fs_info, &delta);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
- ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
+ ret = __btrfs_inc_extent_ref(trans, node, parent, ref->root,
ref->objectid, ref->offset,
- node->ref_mod, extent_op);
+ extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, node, parent,
- ref_root, ref->objectid,
- ref->offset, node->ref_mod,
- extent_op);
+ ret = __btrfs_free_extent(trans, href, node, parent,
+ ref->root, ref->objectid,
+ ref->offset, extent_op);
} else {
BUG();
}
@@ -1604,7 +1649,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
u32 item_size;
int ret;
- int err = 0;
int metadata = 1;
if (TRANS_ABORTED(trans))
@@ -1631,10 +1675,8 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
again:
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
- err = ret;
goto out;
- }
- if (ret > 0) {
+ } else if (ret > 0) {
if (metadata) {
if (path->slots[0] > 0) {
path->slots[0]--;
@@ -1655,7 +1697,7 @@ again:
goto again;
}
} else {
- err = -EUCLEAN;
+ ret = -EUCLEAN;
btrfs_err(fs_info,
"missing extent item for extent %llu num_bytes %llu level %d",
head->bytenr, head->num_bytes, extent_op->level);
@@ -1667,29 +1709,31 @@ again:
item_size = btrfs_item_size(leaf, path->slots[0]);
if (unlikely(item_size < sizeof(*ei))) {
- err = -EUCLEAN;
+ ret = -EUCLEAN;
btrfs_err(fs_info,
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
- btrfs_abort_transaction(trans, err);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
__run_delayed_extent_op(extent_op, leaf, ei);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
- return err;
+ return ret;
}
static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
int ret = 0;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_tree_ref *ref;
u64 parent = 0;
u64 ref_root = 0;
@@ -1709,14 +1753,24 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
return -EUCLEAN;
}
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+ struct btrfs_squota_delta delta = {
+ .root = href->owning_root,
+ .num_bytes = fs_info->nodesize,
+ .is_data = false,
+ .is_inc = true,
+ .generation = trans->transid,
+ };
+
BUG_ON(!extent_op || !extent_op->update_flags);
ret = alloc_reserved_tree_block(trans, node, extent_op);
+ if (!ret)
+ btrfs_record_squota_delta(fs_info, &delta);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
- ref->level, 0, 1, extent_op);
+ ref->level, 0, extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, node, parent, ref_root,
- ref->level, 0, 1, extent_op);
+ ret = __btrfs_free_extent(trans, href, node, parent, ref_root,
+ ref->level, 0, extent_op);
} else {
BUG();
}
@@ -1725,6 +1779,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
@@ -1732,19 +1787,23 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
int ret = 0;
if (TRANS_ABORTED(trans)) {
- if (insert_reserved)
+ if (insert_reserved) {
btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
+ free_head_ref_squota_rsv(trans->fs_info, href);
+ }
return 0;
}
if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
node->type == BTRFS_SHARED_BLOCK_REF_KEY)
- ret = run_delayed_tree_ref(trans, node, extent_op,
+ ret = run_delayed_tree_ref(trans, href, node, extent_op,
insert_reserved);
else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
node->type == BTRFS_SHARED_DATA_REF_KEY)
- ret = run_delayed_data_ref(trans, node, extent_op,
+ ret = run_delayed_data_ref(trans, href, node, extent_op,
insert_reserved);
+ else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY)
+ ret = 0;
else
BUG();
if (ret && insert_reserved)
@@ -1823,28 +1882,38 @@ static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
return ret ? ret : 1;
}
-void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
- int nr_items = 1; /* Dropping this ref head update. */
+ u64 ret = 0;
/*
* We had csum deletions accounted for in our delayed refs rsv, we need
* to drop the csum leaves for this update from our delayed_refs_rsv.
*/
if (head->total_ref_mod < 0 && head->is_data) {
+ int nr_csums;
+
spin_lock(&delayed_refs->lock);
delayed_refs->pending_csums -= head->num_bytes;
spin_unlock(&delayed_refs->lock);
- nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
+ nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
+
+ btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums);
+
+ ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
}
+ /* must_insert_reserved can be set only if we didn't run the head ref. */
+ if (head->must_insert_reserved)
+ free_head_ref_squota_rsv(fs_info, head);
- btrfs_delayed_refs_rsv_release(fs_info, nr_items);
+ return ret;
}
static int cleanup_ref_head(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *head)
+ struct btrfs_delayed_ref_head *head,
+ u64 *bytes_released)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1889,7 +1958,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
}
}
- btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
+ *bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
trace_run_delayed_ref_head(fs_info, head, 0);
btrfs_delayed_ref_unlock(head);
@@ -1931,7 +2000,8 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
}
static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *locked_ref)
+ struct btrfs_delayed_ref_head *locked_ref,
+ u64 *bytes_released)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -1979,14 +2049,22 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
* spin lock.
*/
must_insert_reserved = locked_ref->must_insert_reserved;
+ /*
+ * Unsetting this on the head ref relinquishes ownership of
+ * the reserved bytes, so it is critical that every possible code
+ * path from here forward frees all reserves, including the qgroup
+ * reserve.
+ */
locked_ref->must_insert_reserved = false;
extent_op = locked_ref->extent_op;
locked_ref->extent_op = NULL;
spin_unlock(&locked_ref->lock);
- ret = run_one_delayed_ref(trans, ref, extent_op,
+ ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op,
must_insert_reserved);
+ btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
+ *bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1);
btrfs_free_delayed_extent_op(extent_op);
if (ret) {
@@ -2010,15 +2088,22 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
* Returns -ENOMEM or -EIO on failure and will abort the transaction.
*/
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- unsigned long nr)
+ u64 min_bytes)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *locked_ref = NULL;
int ret;
unsigned long count = 0;
+ unsigned long max_count = 0;
+ u64 bytes_processed = 0;
delayed_refs = &trans->transaction->delayed_refs;
+ if (min_bytes == 0) {
+ max_count = delayed_refs->num_heads_ready;
+ min_bytes = U64_MAX;
+ }
+
do {
if (!locked_ref) {
locked_ref = btrfs_obtain_ref_head(trans);
@@ -2046,7 +2131,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
spin_lock(&locked_ref->lock);
btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
- ret = btrfs_run_delayed_refs_for_head(trans, locked_ref);
+ ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed);
if (ret < 0 && ret != -EAGAIN) {
/*
* Error, btrfs_run_delayed_refs_for_head already
@@ -2058,7 +2143,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
* Success, perform the usual cleanup of a processed
* head
*/
- ret = cleanup_ref_head(trans, locked_ref);
+ ret = cleanup_ref_head(trans, locked_ref, &bytes_processed);
if (ret > 0) {
/* We dropped our lock, we need to loop. */
ret = 0;
@@ -2075,7 +2160,9 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
locked_ref = NULL;
cond_resched();
- } while ((nr != -1 && count < nr) || locked_ref);
+ } while ((min_bytes != U64_MAX && bytes_processed < min_bytes) ||
+ (max_count > 0 && count < max_count) ||
+ locked_ref);
return 0;
}
@@ -2124,24 +2211,25 @@ static u64 find_middle(struct rb_root *root)
#endif
/*
- * this starts processing the delayed reference count updates and
- * extent insertions we have queued up so far. count can be
- * 0, which means to process everything in the tree at the start
- * of the run (but not newly added entries), or it can be some target
- * number you'd like to process.
+ * Start processing the delayed reference count updates and extent insertions
+ * we have queued up so far.
+ *
+ * @trans: Transaction handle.
+ * @min_bytes: How many bytes of delayed references to process. After this
+ * many bytes we stop processing delayed references if there are
+ * any more. If 0, run all delayed references that already exist,
+ * but not new ones added while running them.
+ * Use (u64)-1 (U64_MAX) to run all existing delayed references
+ * plus any new ones that are added.
*
* Returns 0 on success or if called with an aborted transaction
* Returns <0 on error and aborts the transaction
*/
-int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- unsigned long count)
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_delayed_ref_head *head;
int ret;
- int run_all = count == (unsigned long)-1;
/* We'll clean this up in btrfs_cleanup_transaction */
if (TRANS_ABORTED(trans))
@@ -2151,42 +2239,30 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
return 0;
delayed_refs = &trans->transaction->delayed_refs;
- if (count == 0)
- count = delayed_refs->num_heads_ready;
-
again:
#ifdef SCRAMBLE_DELAYED_REFS
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
- ret = __btrfs_run_delayed_refs(trans, count);
+ ret = __btrfs_run_delayed_refs(trans, min_bytes);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
return ret;
}
- if (run_all) {
+ if (min_bytes == U64_MAX) {
btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
- node = rb_first_cached(&delayed_refs->href_root);
- if (!node) {
+ if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) {
spin_unlock(&delayed_refs->lock);
- goto out;
+ return 0;
}
- head = rb_entry(node, struct btrfs_delayed_ref_head,
- href_node);
- refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
- /* Mutex was contended, block until it's released and retry. */
- mutex_lock(&head->mutex);
- mutex_unlock(&head->mutex);
-
- btrfs_put_delayed_ref_head(head);
cond_resched();
goto again;
}
-out:
+
return 0;
}
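A hedged sketch of how callers can size a run under the byte-based interface; flush_some_delayed_refs() is a hypothetical wrapper, while btrfs_run_delayed_refs() and btrfs_calc_delayed_ref_bytes() are used as in the hunks above:

/* Hypothetical wrapper: flush roughly nr_refs worth of delayed refs. */
static int flush_some_delayed_refs(struct btrfs_trans_handle *trans,
				   int nr_refs)
{
	u64 min_bytes = btrfs_calc_delayed_ref_bytes(trans->fs_info, nr_refs);

	return btrfs_run_delayed_refs(trans, min_bytes);
}

/*
 * Other choices for min_bytes:
 *   btrfs_run_delayed_refs(trans, 0);        run all refs queued so far
 *   btrfs_run_delayed_refs(trans, U64_MAX);  also run refs added meanwhile
 */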
@@ -2311,6 +2387,7 @@ static noinline int check_committed_ref(struct btrfs_root *root,
struct btrfs_extent_item *ei;
struct btrfs_key key;
u32 item_size;
+ u32 expected_size;
int type;
int ret;
@@ -2337,10 +2414,22 @@ static noinline int check_committed_ref(struct btrfs_root *root,
ret = 1;
item_size = btrfs_item_size(leaf, path->slots[0]);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY);
+
+ /* No inline refs; we need to bail before checking for owner ref. */
+ if (item_size == sizeof(*ei))
+ goto out;
+
+ /* Check for an owner ref; skip over it to the real inline refs. */
+ iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
+ if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) {
+ expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
+ iref = (struct btrfs_extent_inline_ref *)(iref + 1);
+ }
/* If extent item has more than 1 inline ref then it's shared */
- if (item_size != sizeof(*ei) +
- btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
+ if (item_size != expected_size)
goto out;
/*
@@ -2352,8 +2441,6 @@ static noinline int check_committed_ref(struct btrfs_root *root,
btrfs_root_last_snapshot(&root->root_item)))
goto out;
- iref = (struct btrfs_extent_inline_ref *)(ei + 1);
-
/* If this extent has SHARED_DATA_REF then it's shared */
type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
if (type != BTRFS_EXTENT_DATA_REF_KEY)
@@ -2450,7 +2537,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
key.offset -= btrfs_file_extent_offset(buf, fi);
btrfs_init_generic_ref(&generic_ref, action, bytenr,
- num_bytes, parent);
+ num_bytes, parent, ref_root);
btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
key.offset, root->root_key.objectid,
for_reloc);
@@ -2463,8 +2550,9 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
} else {
bytenr = btrfs_node_blockptr(buf, i);
num_bytes = fs_info->nodesize;
+ /* We don't know the owning_root, use 0. */
btrfs_init_generic_ref(&generic_ref, action, bytenr,
- num_bytes, parent);
+ num_bytes, parent, 0);
btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
root->root_key.objectid, for_reloc);
if (inc)
@@ -2565,16 +2653,13 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans,
return 0;
}
-/*
- * this function must be called within transaction
- */
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes)
+ const struct extent_buffer *eb)
{
struct btrfs_block_group *cache;
int ret;
- cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
+ cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
if (!cache)
return -EINVAL;
@@ -2586,10 +2671,10 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- pin_down_extent(trans, cache, bytenr, num_bytes, 0);
+ pin_down_extent(trans, cache, eb->start, eb->len, 0);
/* remove us from the free space cache (if we're there at all) */
- ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
+ ret = btrfs_remove_free_space(cache, eb->start, eb->len);
out:
btrfs_put_block_group(cache);
return ret;
@@ -2844,12 +2929,61 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
return 0;
}
+/*
+ * Parse an extent item's inline extents looking for a simple quotas owner ref.
+ *
+ * @fs_info: the btrfs_fs_info for this mount
+ * @leaf: a leaf in the extent tree containing the extent item
+ * @slot: the slot in the leaf where the extent item is found
+ *
+ * Returns the objectid of the root that originally allocated the extent item
+ * if the inline owner ref is expected and present, otherwise 0.
+ *
+ * If an extent item has an owner ref item, it will be the first inline ref
+ * item. Therefore the logic is to check whether there are any inline ref
+ * items, then check the type of the first one.
+ */
+u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf, int slot)
+{
+ struct btrfs_extent_item *ei;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_extent_owner_ref *oref;
+ unsigned long ptr;
+ unsigned long end;
+ int type;
+
+ if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA))
+ return 0;
+
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+ ptr = (unsigned long)(ei + 1);
+ end = (unsigned long)ei + btrfs_item_size(leaf, slot);
+
+ /* No inline ref items of any kind, can't check type. */
+ if (ptr == end)
+ return 0;
+
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
+
+ /* We found an owner ref, get the root out of it. */
+ if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
+ oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
+ return btrfs_extent_owner_ref_root_id(leaf, oref);
+ }
+
+ /* We have inline refs, but not an owner ref. */
+ return 0;
+}
+
static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, bool is_data)
+ u64 bytenr, struct btrfs_squota_delta *delta)
{
int ret;
+ u64 num_bytes = delta->num_bytes;
- if (is_data) {
+ if (delta->is_data) {
struct btrfs_root *csum_root;
csum_root = btrfs_csum_root(trans->fs_info, bytenr);
@@ -2858,6 +2992,18 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, ret);
return ret;
}
+
+ ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ }
+
+ ret = btrfs_record_squota_delta(trans->fs_info, delta);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
}
ret = add_to_free_space_tree(trans, bytenr, num_bytes);
@@ -2940,9 +3086,10 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
* And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
*/
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
- u64 owner_offset, int refs_to_drop,
+ u64 owner_offset,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *info = trans->fs_info;
@@ -2957,11 +3104,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
int extent_slot = 0;
int found_extent = 0;
int num_to_del = 1;
+ int refs_to_drop = node->ref_mod;
u32 item_size;
u64 refs;
u64 bytenr = node->bytenr;
u64 num_bytes = node->num_bytes;
bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
+ u64 delayed_ref_root = href->owning_root;
extent_root = btrfs_extent_root(info, bytenr);
ASSERT(extent_root);
@@ -3151,7 +3300,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
} else {
btrfs_set_extent_refs(leaf, ei, refs);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
}
if (found_extent) {
ret = remove_extent_backref(trans, extent_root, path,
@@ -3162,6 +3311,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
} else {
+ struct btrfs_squota_delta delta = {
+ .root = delayed_ref_root,
+ .num_bytes = num_bytes,
+ .is_data = is_data,
+ .is_inc = false,
+ .generation = btrfs_extent_generation(leaf, ei),
+ };
+
/* In this branch refs == 1 */
if (found_extent) {
if (is_data && refs_to_drop !=
@@ -3200,6 +3357,16 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
num_to_del = 2;
}
}
+ /*
+ * We can't infer the data owner from the delayed ref, so we need
+ * to try to get it from the owning ref item.
+ *
+ * If it is not present, then that extent was not written under
+ * simple quotas mode, so we don't need to account for its deletion.
+ */
+ if (is_data)
+ delta.root = btrfs_get_extent_owner_root(trans->fs_info,
+ leaf, extent_slot);
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
@@ -3209,7 +3376,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data);
+ ret = do_free_extent_accounting(trans, bytenr, &delta);
}
btrfs_release_path(path);
@@ -3283,7 +3450,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
int ret;
btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
- buf->start, buf->len, parent);
+ buf->start, buf->len, parent, btrfs_header_owner(buf));
btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
root_id, 0, false);
@@ -3370,10 +3537,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
* tree, just update pinning info and exit early.
*/
if ((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
+ ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) ||
(ref->type == BTRFS_REF_DATA &&
- ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
- /* unlocks the pinned mutex */
+ ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
ret = 0;
} else if (ref->type == BTRFS_REF_METADATA) {
@@ -3383,9 +3549,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
}
if (!((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
+ ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) ||
(ref->type == BTRFS_REF_DATA &&
- ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
+ ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
btrfs_ref_tree_mod(fs_info, ref);
return ret;
@@ -4442,8 +4608,8 @@ loop:
}
/*
- * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
- * hole that is at least as big as @num_bytes.
+ * Entry point to the extent allocator. Tries to find a hole that is at least
+ * as big as @num_bytes.
*
* @root - The root that will contain this extent
*
@@ -4562,20 +4728,20 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
- u64 len)
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
+ const struct extent_buffer *eb)
{
struct btrfs_block_group *cache;
int ret = 0;
- cache = btrfs_lookup_block_group(trans->fs_info, start);
+ cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
if (!cache) {
btrfs_err(trans->fs_info, "unable to find block group for %llu",
- start);
+ eb->start);
return -ENOSPC;
}
- ret = pin_down_extent(trans, cache, start, len, 1);
+ ret = pin_down_extent(trans, cache, eb->start, eb->len, 1);
btrfs_put_block_group(cache);
return ret;
}
@@ -4605,24 +4771,29 @@ static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 flags, u64 owner, u64 offset,
- struct btrfs_key *ins, int ref_mod)
+ struct btrfs_key *ins, int ref_mod, u64 oref_root)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *extent_root;
int ret;
struct btrfs_extent_item *extent_item;
+ struct btrfs_extent_owner_ref *oref;
struct btrfs_extent_inline_ref *iref;
struct btrfs_path *path;
struct extent_buffer *leaf;
int type;
u32 size;
+ const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE);
if (parent > 0)
type = BTRFS_SHARED_DATA_REF_KEY;
else
type = BTRFS_EXTENT_DATA_REF_KEY;
- size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
+ size = sizeof(*extent_item);
+ if (simple_quota)
+ size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
+ size += btrfs_extent_inline_ref_size(type);
path = btrfs_alloc_path();
if (!path)
@@ -4644,7 +4815,14 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
flags | BTRFS_EXTENT_FLAG_DATA);
iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+ if (simple_quota) {
+ btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY);
+ oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
+ btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root);
+ iref = (struct btrfs_extent_inline_ref *)(oref + 1);
+ }
btrfs_set_extent_inline_ref_type(leaf, iref, type);
+
if (parent > 0) {
struct btrfs_shared_data_ref *ref;
ref = (struct btrfs_shared_data_ref *)(iref + 1);
@@ -4659,7 +4837,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
}
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_free_path(path);
return alloc_reserved_extent(trans, ins->objectid, ins->offset);
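For orientation, a sketch of the inline ref layout this writes when simple quotas are enabled (an illustrative summary, not a format definition):

/*
 * struct btrfs_extent_item                  refs / generation / flags
 * inline ref 0: BTRFS_EXTENT_OWNER_REF_KEY
 *   struct btrfs_extent_owner_ref           root_id = oref_root
 * inline ref 1: BTRFS_EXTENT_DATA_REF_KEY   (or BTRFS_SHARED_DATA_REF_KEY
 *   struct btrfs_extent_data_ref             when parent > 0)
 *
 * The owner ref is always the first inline ref, which is what
 * btrfs_get_extent_owner_root() and the updated check_committed_ref()
 * rely on when parsing the item.
 */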
@@ -4734,7 +4912,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
}
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_free_path(path);
return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
@@ -4746,12 +4924,17 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key *ins)
{
struct btrfs_ref generic_ref = { 0 };
+ u64 root_objectid = root->root_key.objectid;
+ u64 owning_root = root_objectid;
+
+ BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
- BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+ if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root))
+ owning_root = root->relocation_src_root;
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
- ins->objectid, ins->offset, 0);
- btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
+ ins->objectid, ins->offset, 0, owning_root);
+ btrfs_init_data_ref(&generic_ref, root_objectid, owner,
offset, 0, false);
btrfs_ref_tree_mod(root->fs_info, &generic_ref);
@@ -4771,6 +4954,13 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
+ struct btrfs_squota_delta delta = {
+ .root = root_objectid,
+ .num_bytes = ins->offset,
+ .generation = trans->transid,
+ .is_data = true,
+ .is_inc = true,
+ };
/*
* Mixed block groups will exclude before processing the log so we only
@@ -4796,13 +4986,36 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
spin_unlock(&space_info->lock);
ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
- offset, ins, 1);
+ offset, ins, 1, root_objectid);
if (ret)
btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
+ ret = btrfs_record_squota_delta(fs_info, &delta);
btrfs_put_block_group(block_group);
return ret;
}
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Extra safety check in case the extent tree is corrupted and the extent
+ * allocator chooses to use a tree block which is already used and locked.
+ */
+static bool check_eb_lock_owner(const struct extent_buffer *eb)
+{
+ if (eb->lock_owner == current->pid) {
+ btrfs_err_rl(eb->fs_info,
+"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
+ eb->start, btrfs_header_owner(eb), current->pid);
+ return true;
+ }
+ return false;
+}
+#else
+static bool check_eb_lock_owner(struct extent_buffer *eb)
+{
+ return false;
+}
+#endif
+
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u64 bytenr, int level, u64 owner,
@@ -4816,15 +5029,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (IS_ERR(buf))
return buf;
- /*
- * Extra safety check in case the extent tree is corrupted and extent
- * allocator chooses to use a tree block which is already used and
- * locked.
- */
- if (buf->lock_owner == current->pid) {
- btrfs_err_rl(fs_info,
-"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
- buf->start, btrfs_header_owner(buf), current->pid);
+ if (check_eb_lock_owner(buf)) {
free_extent_buffer(buf);
return ERR_PTR(-EUCLEAN);
}
@@ -4901,6 +5106,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
const struct btrfs_disk_key *key,
int level, u64 hint,
u64 empty_size,
+ u64 reloc_src_root,
enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4913,6 +5119,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
int ret;
u32 blocksize = fs_info->nodesize;
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
+ u64 owning_root;
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (btrfs_is_testing(fs_info)) {
@@ -4939,11 +5146,13 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
ret = PTR_ERR(buf);
goto out_free_reserved;
}
+ owning_root = btrfs_header_owner(buf);
if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
if (parent == 0)
parent = ins.objectid;
flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ owning_root = reloc_src_root;
} else
BUG_ON(parent > 0);
@@ -4963,7 +5172,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
extent_op->level = level;
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
- ins.objectid, ins.offset, parent);
+ ins.objectid, ins.offset, parent, owning_root);
btrfs_init_tree_ref(&generic_ref, level, root_objectid,
root->root_key.objectid, false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
@@ -5051,7 +5260,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
/* We don't lock the tree block, it's OK to be racy here */
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
wc->level - 1, 1, &refs,
- &flags);
+ &flags, NULL);
/* We don't care about errors in readahead. */
if (ret < 0)
continue;
@@ -5118,7 +5327,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
&wc->refs[level],
- &wc->flags[level]);
+ &wc->flags[level],
+ NULL);
BUG_ON(ret == -ENOMEM);
if (ret)
return ret;
@@ -5208,6 +5418,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
u64 bytenr;
u64 generation;
u64 parent;
+ u64 owner_root = 0;
struct btrfs_tree_parent_check check = { 0 };
struct btrfs_key key;
struct btrfs_ref ref = { 0 };
@@ -5251,7 +5462,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
&wc->refs[level - 1],
- &wc->flags[level - 1]);
+ &wc->flags[level - 1],
+ &owner_root);
if (ret < 0)
goto out_unlock;
@@ -5384,7 +5596,7 @@ skip:
find_next_key(path, level, &wc->drop_progress);
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
- fs_info->nodesize, parent);
+ fs_info->nodesize, parent, owner_root);
btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
0, false);
ret = btrfs_free_extent(trans, &ref);
@@ -5451,7 +5663,8 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
&wc->refs[level],
- &wc->flags[level]);
+ &wc->flags[level],
+ NULL);
if (ret < 0) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
@@ -5696,7 +5909,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
ret = btrfs_lookup_extent_info(trans, fs_info,
path->nodes[level]->start,
level, 1, &wc->refs[level],
- &wc->flags[level]);
+ &wc->flags[level], NULL);
if (ret < 0) {
err = ret;
goto out_end_trans;
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index 88c249c37516..2e066035ccee 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -7,6 +7,7 @@
#include "block-group.h"
struct btrfs_free_cluster;
+struct btrfs_delayed_ref_head;
enum btrfs_extent_allocation_policy {
BTRFS_EXTENT_ALLOC_CLUSTERED,
@@ -91,18 +92,19 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
enum btrfs_inline_ref_type is_data);
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
-int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count);
-void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes);
+u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 offset, int metadata, u64 *refs, u64 *flags);
+ u64 offset, int metadata, u64 *refs, u64 *flags,
+ u64 *owner_root);
int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
int reserved);
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes);
+ const struct extent_buffer *eb);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
u64 objectid, u64 offset, u64 bytenr, bool strict,
@@ -113,6 +115,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
const struct btrfs_disk_key *key,
int level, u64 hint,
u64 empty_size,
+ u64 reloc_src_root,
enum btrfs_lock_nesting nest);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
u64 root_id,
@@ -136,12 +139,15 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
+u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf, int slot);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc);
-int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len);
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
+ const struct extent_buffer *eb);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref);
-int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
int for_reloc);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
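For context, a minimal sketch of calling btrfs_lookup_extent_info() with its new owner_root out-parameter: callers that do not need the owning root pass NULL, exactly as the walk_up_proc() and btrfs_drop_snapshot() hunks above do, while walk_down_tree() captures it to seed the delayed-ref owner. The function name below is hypothetical.

/* Hypothetical caller, shown only to illustrate the extended prototype. */
static int example_lookup_refs(struct btrfs_trans_handle *trans,
			       struct btrfs_fs_info *fs_info,
			       u64 bytenr, int level)
{
	u64 refs;
	u64 flags;

	/* Metadata lookup: the tree level is passed in the offset slot. */
	return btrfs_lookup_extent_info(trans, fs_info, bytenr, level, 1,
					&refs, &flags,
					NULL /* owner_root not needed here */);
}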
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index caccd0376342..8f724c54fc8e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -21,7 +21,6 @@
#include "ctree.h"
#include "btrfs_inode.h"
#include "bio.h"
-#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
@@ -395,7 +394,7 @@ again:
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
- EXTENT_DELALLOC, 1, cached_state);
+ EXTENT_DELALLOC, cached_state);
if (!ret) {
unlock_extent(tree, delalloc_start, delalloc_end,
&cached_state);
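As the conversions in this file suggest, the old integer 'filled' argument of test_range_bit() is split into two helpers: test_range_bit() now always asks whether the whole range has the bit set (with an optional cached state), while the new test_range_bit_exists(), used in later hunks, asks whether the bit is set anywhere in the range. A small sketch with a hypothetical wrapper:

static bool example_range_checks(struct extent_io_tree *tree, u64 start, u64 end,
				 struct extent_state *cached)
{
	/* True only if every byte in [start, end] has EXTENT_DELALLOC set. */
	if (!test_range_bit(tree, start, end, EXTENT_DELALLOC, cached))
		return false;

	/* True if no part of [start, end] is still locked. */
	return !test_range_bit_exists(tree, start, end, EXTENT_LOCKED);
}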
@@ -675,8 +674,8 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio)
* the array will be skipped
*
* Return: 0 if all pages were able to be allocated;
- * -ENOMEM otherwise, and the caller is responsible for freeing all
- * non-null page pointers in the array.
+ * -ENOMEM otherwise; any partially allocated pages are freed and the
+ * array slots are reset to NULL
*/
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
{
@@ -695,8 +694,13 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
* though alloc_pages_bulk_array() falls back to alloc_page()
* if it could not bulk-allocate. So we must be out of memory.
*/
- if (allocated == last)
+ if (allocated == last) {
+ for (int i = 0; i < allocated; i++) {
+ __free_page(page_array[i]);
+ page_array[i] = NULL;
+ }
return -ENOMEM;
+ }
memalloc_retry_wait(GFP_NOFS);
}
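A hypothetical caller, illustrating the new contract documented above: on -ENOMEM the helper has already freed whatever it managed to allocate and reset the slots to NULL, so the error path needs no cleanup of its own.

static int example_alloc_pages(unsigned int nr_pages, struct page **pages)
{
	int ret = btrfs_alloc_page_array(nr_pages, pages);

	if (ret)
		return ret;	/* nothing to undo, all array slots are NULL again */

	/* ... use the pages, then release them with the usual helpers ... */
	return 0;
}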
@@ -2294,11 +2298,12 @@ static int try_release_extent_state(struct extent_io_tree *tree,
u64 end = start + PAGE_SIZE - 1;
int ret = 1;
- if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
+ if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
ret = 0;
} else {
u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
- EXTENT_DELALLOC_NEW | EXTENT_CTLBITS);
+ EXTENT_DELALLOC_NEW | EXTENT_CTLBITS |
+ EXTENT_QGROUP_RESERVED);
/*
* At this point we can safely clear everything except the
@@ -2353,9 +2358,9 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
free_extent_map(em);
break;
}
- if (test_range_bit(tree, em->start,
- extent_map_end(em) - 1,
- EXTENT_LOCKED, 0, NULL))
+ if (test_range_bit_exists(tree, em->start,
+ extent_map_end(em) - 1,
+ EXTENT_LOCKED))
goto next;
/*
* If it's not in the list of modified extents, used
@@ -3455,6 +3460,12 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
start, fs_info->nodesize);
return -EINVAL;
}
+ if (!IS_ALIGNED(start, fs_info->nodesize) &&
+ !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) {
+ btrfs_warn(fs_info,
+"tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance",
+ start, fs_info->nodesize);
+ }
return 0;
}
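A quick worked example of what the new warning catches (the numbers are illustrative, not taken from the patch): with sectorsize 4096 and nodesize 16384, a tree block starting at byte 20480 is sector aligned but not node aligned, so it passes the existing check yet trips the new one; test_and_set_bit() on BTRFS_FS_UNALIGNED_TREE_BLOCK keeps the message to a single print per mount.

static bool example_is_unaligned_tree_block(u64 start)
{
	/* Illustrative values: sectorsize 4096, nodesize 16384, start 20480. */
	return IS_ALIGNED(start, 4096) && !IS_ALIGNED(start, 16384);
}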
@@ -4248,14 +4259,14 @@ void copy_extent_buffer(const struct extent_buffer *dst,
}
/*
- * eb_bitmap_offset() - calculate the page and offset of the byte containing the
- * given bit number
- * @eb: the extent buffer
- * @start: offset of the bitmap item in the extent buffer
- * @nr: bit number
- * @page_index: return index of the page in the extent buffer that contains the
- * given bit number
- * @page_offset: return offset into the page given by page_index
+ * Calculate the page and offset of the byte containing the given bit number.
+ *
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @nr: bit number
+ * @page_index: return index of the page in the extent buffer that contains
+ * the given bit number
+ * @page_offset: return offset into the page given by page_index
*
* This helper hides the ugliness of finding the byte in an extent buffer which
* contains a given bit.
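A sketch of the arithmetic the (now plainly worded) comment describes, under the assumption that the extent buffer may start partway into its first page; the function name and exact variable names are illustrative, mirroring the documented parameters rather than the implementation.

static void example_eb_bitmap_offset(const struct extent_buffer *eb,
				     unsigned long start, unsigned long nr,
				     unsigned long *page_index, size_t *page_offset)
{
	/* Byte holding bit 'nr' inside the bitmap item. */
	size_t byte_offset = nr / BITS_PER_BYTE;
	/* Offset of that byte from the first page backing the extent buffer. */
	size_t offset = start + offset_in_page(eb->start) + byte_offset;

	*page_index = offset >> PAGE_SHIFT;	/* which page of the eb */
	*page_offset = offset_in_page(offset);	/* where in that page   */
}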
@@ -4614,7 +4625,8 @@ int try_release_extent_buffer(struct page *page)
}
/*
- * btrfs_readahead_tree_block - attempt to readahead a child block
+ * Attempt to readahead a child block.
+ *
* @fs_info: the fs_info
* @bytenr: bytenr to read
* @owner_root: objectid of the root that owns this eb
@@ -4653,7 +4665,8 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
}
/*
- * btrfs_readahead_node_child - readahead a node's child block
+ * Readahead a node's child block.
+ *
* @node: parent node we're reading from
* @slot: slot in the parent node for the child we want to read
*
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 68368ba99321..2171057a4477 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -80,16 +80,16 @@ struct extent_buffer {
spinlock_t refs_lock;
atomic_t refs;
int read_mirror;
- struct rcu_head rcu_head;
- pid_t lock_owner;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
s8 log_index;
+ struct rcu_head rcu_head;
struct rw_semaphore lock;
struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
+ pid_t lock_owner;
#endif
};
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 1ce5dd154499..45cae356e89b 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -194,7 +194,7 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_encryption(leaf, item, 0);
btrfs_set_file_extent_other_encoding(leaf, item, 0);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
return ret;
@@ -811,11 +811,12 @@ blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio)
* This calls btrfs_truncate_item with the correct args based on the overlap,
* and fixes up the key as required.
*/
-static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
+static noinline void truncate_one_csum(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_key *key,
u64 bytenr, u64 len)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct extent_buffer *leaf;
const u32 csum_size = fs_info->csum_size;
u64 csum_end;
@@ -836,7 +837,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
*/
u32 new_size = (bytenr - key->offset) >> blocksize_bits;
new_size *= csum_size;
- btrfs_truncate_item(path, new_size, 1);
+ btrfs_truncate_item(trans, path, new_size, 1);
} else if (key->offset >= bytenr && csum_end > end_byte &&
end_byte > key->offset) {
/*
@@ -848,10 +849,10 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
u32 new_size = (csum_end - end_byte) >> blocksize_bits;
new_size *= csum_size;
- btrfs_truncate_item(path, new_size, 0);
+ btrfs_truncate_item(trans, path, new_size, 0);
key->offset = end_byte;
- btrfs_set_item_key_safe(fs_info, path, key);
+ btrfs_set_item_key_safe(trans, path, key);
} else {
BUG();
}
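A worked example for the first branch above, with illustrative values (4 KiB sectors, so blocksize_bits == 12, and 4-byte crc32c checksums):

/*
 * A csum item keyed at offset X covering 64 KiB holds 16 checksums.
 * Truncating from bytenr = X + 16 KiB keeps the first four sectors:
 *
 *	new_size = ((bytenr - key->offset) >> 12) * 4
 *	         = (16384 >> 12) * 4 = 16 bytes
 *
 * and btrfs_truncate_item() trims the item from its tail.  The second
 * branch does the mirror-image calculation from csum_end and then moves
 * the key forward with btrfs_set_item_key_safe().
 */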
@@ -994,7 +995,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
key.offset = end_byte - 1;
} else {
- truncate_one_csum(fs_info, path, &key, bytenr, len);
+ truncate_one_csum(trans, path, &key, bytenr, len);
if (key.offset < bytenr)
break;
}
@@ -1202,7 +1203,7 @@ extend_csum:
diff /= csum_size;
diff *= csum_size;
- btrfs_extend_item(path, diff);
+ btrfs_extend_item(trans, path, diff);
ret = 0;
goto csum;
}
@@ -1249,7 +1250,7 @@ found:
ins_size /= csum_size;
total_bytes += ins_size * fs_info->sectorsize;
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
if (total_bytes < sums->len) {
btrfs_release_path(path);
cond_resched();
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 361535c71c0f..32611a4edd6b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -17,6 +17,7 @@
#include <linux/uio.h>
#include <linux/iversion.h>
#include <linux/fsverity.h>
+#include <linux/iomap.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -368,12 +369,13 @@ next_slot:
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - args->start);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (update_refs && disk_bytenr > 0) {
btrfs_init_generic_ref(&ref,
BTRFS_ADD_DELAYED_REF,
- disk_bytenr, num_bytes, 0);
+ disk_bytenr, num_bytes, 0,
+ root->root_key.objectid);
btrfs_init_data_ref(&ref,
root->root_key.objectid,
new_key.objectid,
@@ -405,13 +407,13 @@ next_slot:
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = args->end;
- btrfs_set_item_key_safe(fs_info, path, &new_key);
+ btrfs_set_item_key_safe(trans, path, &new_key);
extent_offset += args->end - key.offset;
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - args->end);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (update_refs && disk_bytenr > 0)
args->bytes_found += args->end - key.offset;
break;
@@ -431,7 +433,7 @@ next_slot:
btrfs_set_file_extent_num_bytes(leaf, fi,
args->start - key.offset);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (update_refs && disk_bytenr > 0)
args->bytes_found += extent_end - args->start;
if (args->end == extent_end)
@@ -463,7 +465,8 @@ delete_extent_item:
} else if (update_refs && disk_bytenr > 0) {
btrfs_init_generic_ref(&ref,
BTRFS_DROP_DELAYED_REF,
- disk_bytenr, num_bytes, 0);
+ disk_bytenr, num_bytes, 0,
+ root->root_key.objectid);
btrfs_init_data_ref(&ref,
root->root_key.objectid,
key.objectid,
@@ -536,7 +539,8 @@ delete_extent_item:
if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
path->slots[0]++;
}
- btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size);
+ btrfs_setup_item_for_insert(trans, root, path, &key,
+ args->extent_item_size);
args->extent_inserted = true;
}
@@ -593,7 +597,6 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot,
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 start, u64 end)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct btrfs_path *path;
@@ -664,7 +667,7 @@ again:
ino, bytenr, orig_offset,
&other_start, &other_end)) {
new_key.offset = end;
- btrfs_set_item_key_safe(fs_info, path, &new_key);
+ btrfs_set_item_key_safe(trans, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi,
@@ -679,7 +682,7 @@ again:
trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
end - other_start);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
goto out;
}
}
@@ -698,7 +701,7 @@ again:
trans->transid);
path->slots[0]++;
new_key.offset = start;
- btrfs_set_item_key_safe(fs_info, path, &new_key);
+ btrfs_set_item_key_safe(trans, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -708,7 +711,7 @@ again:
other_end - start);
btrfs_set_file_extent_offset(leaf, fi,
start - orig_offset);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
goto out;
}
}
@@ -742,10 +745,10 @@ again:
btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - split);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
- num_bytes, 0);
+ num_bytes, 0, root->root_key.objectid);
btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
orig_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
@@ -771,7 +774,7 @@ again:
other_start = end;
other_end = 0;
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
- num_bytes, 0);
+ num_bytes, 0, root->root_key.objectid);
btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
0, false);
if (extent_mergeable(leaf, path->slots[0] + 1,
@@ -814,7 +817,7 @@ again:
btrfs_set_file_extent_type(leaf, fi,
BTRFS_FILE_EXTENT_REG);
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
} else {
fi = btrfs_item_ptr(leaf, del_slot - 1,
struct btrfs_file_extent_item);
@@ -823,7 +826,7 @@ again:
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - key.offset);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
if (ret < 0) {
@@ -1108,17 +1111,18 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
static void update_time_for_write(struct inode *inode)
{
- struct timespec64 now, ctime;
+ struct timespec64 now, ts;
if (IS_NOCMTIME(inode))
return;
now = current_time(inode);
- if (!timespec64_equal(&inode->i_mtime, &now))
- inode->i_mtime = now;
+ ts = inode_get_mtime(inode);
+ if (!timespec64_equal(&ts, &now))
+ inode_set_mtime_to_ts(inode, now);
- ctime = inode_get_ctime(inode);
- if (!timespec64_equal(&ctime, &now))
+ ts = inode_get_ctime(inode);
+ if (!timespec64_equal(&ts, &now))
inode_set_ctime_to_ts(inode, now);
if (IS_I_VERSION(inode))
@@ -1746,7 +1750,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
struct btrfs_inode *inode = BTRFS_I(ctx->inode);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- if (btrfs_inode_in_log(inode, fs_info->generation) &&
+ if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
list_empty(&ctx->ordered_extents))
return true;
@@ -1757,7 +1761,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
* and for a fast fsync we don't wait for that, we only wait for the
* writeback to complete.
*/
- if (inode->last_trans <= fs_info->last_trans_committed &&
+ if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
(test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
list_empty(&ctx->ordered_extents)))
return true;
@@ -1886,7 +1890,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
- smp_mb();
if (skip_inode_logging(&ctx)) {
/*
* We've had everything committed since the last time we were
@@ -2104,7 +2107,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_offset(leaf, fi, 0);
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
goto out;
}
@@ -2112,7 +2115,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
u64 num_bytes;
key.offset = offset;
- btrfs_set_item_key_safe(fs_info, path, &key);
+ btrfs_set_item_key_safe(trans, path, &key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
@@ -2121,7 +2124,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_offset(leaf, fi, 0);
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
goto out;
}
btrfs_release_path(path);
@@ -2273,7 +2276,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
if (extent_info->is_new_extent)
btrfs_set_file_extent_generation(leaf, extent, trans->transid);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
@@ -2303,7 +2306,8 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
extent_info->disk_offset,
- extent_info->disk_len, 0);
+ extent_info->disk_len, 0,
+ root->root_key.objectid);
ref_offset = extent_info->file_offset - extent_info->data_offset;
btrfs_init_data_ref(&ref, root->root_key.objectid,
btrfs_ino(inode), ref_offset, 0, false);
@@ -2473,9 +2477,10 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
inode_inc_iversion(&inode->vfs_inode);
if (!extent_info || extent_info->update_times)
- inode->vfs_inode.i_mtime = inode_set_ctime_current(&inode->vfs_inode);
+ inode_set_mtime_to_ts(&inode->vfs_inode,
+ inode_set_ctime_current(&inode->vfs_inode));
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret)
break;
@@ -2714,8 +2719,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
ASSERT(trans != NULL);
inode_inc_iversion(inode);
- inode->i_mtime = inode_set_ctime_current(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
updated_inode = true;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
@@ -2734,14 +2739,14 @@ out_only_mutex:
struct timespec64 now = inode_set_ctime_current(inode);
inode_inc_iversion(inode);
- inode->i_mtime = now;
+ inode_set_mtime_to_ts(inode, now);
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
} else {
int ret2;
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
ret2 = btrfs_end_transaction(trans);
if (!ret)
ret = ret2;
@@ -2808,7 +2813,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
inode_set_ctime_current(inode);
i_size_write(inode, end);
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
ret2 = btrfs_end_transaction(trans);
return ret ? ret : ret2;
@@ -3187,7 +3192,7 @@ static long btrfs_fallocate(struct file *file, int mode,
qgroup_reserved -= range->len;
} else if (qgroup_reserved > 0) {
btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
- range->start, range->len);
+ range->start, range->len, NULL);
qgroup_reserved -= range->len;
}
list_del(&range->list);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 27fad70451aa..6f93c9a2c3e3 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -57,6 +57,11 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
u64 bytes, bool update_stats);
+static void btrfs_crc32c_final(u32 crc, u8 *result)
+{
+ put_unaligned_le32(~crc, result);
+}
+
static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
{
struct btrfs_free_space *info;
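A short sketch of what the new local helper produces, assuming the crc32c conventions already used by this file: the running CRC is bit-inverted and stored little-endian, which is the form io_ctl_check_crc() later compares against. The function and seed below are illustrative.

static void example_cache_csum(const void *buf, size_t len, u8 stored[4])
{
	/* The seed is illustrative; the io_ctl code keeps its own running CRC. */
	u32 crc = crc32c(~0U, buf, len);

	btrfs_crc32c_final(crc, stored);  /* stores ~crc as an unaligned LE u32 */
}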
@@ -195,7 +200,7 @@ static int __create_free_space_inode(struct btrfs_root *root,
btrfs_set_inode_nlink(leaf, inode_item, 1);
btrfs_set_inode_transid(leaf, inode_item, trans->transid);
btrfs_set_inode_block_group(leaf, inode_item, offset);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -213,7 +218,7 @@ static int __create_free_space_inode(struct btrfs_root *root,
struct btrfs_free_space_header);
memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
btrfs_set_free_space_key(leaf, header, &disk_key);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
return 0;
@@ -354,7 +359,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
if (ret)
goto fail;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
fail:
if (locked)
@@ -540,7 +545,7 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
if (index == 0)
offset = sizeof(u32) * io_ctl->num_pages;
- crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
+ crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
btrfs_crc32c_final(crc, (u8 *)&crc);
io_ctl_unmap_page(io_ctl);
tmp = page_address(io_ctl->pages[0]);
@@ -562,7 +567,7 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
val = *tmp;
io_ctl_map_page(io_ctl, 0);
- crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
+ crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
btrfs_crc32c_final(crc, (u8 *)&crc);
if (val != crc) {
btrfs_err_rl(io_ctl->fs_info,
@@ -1185,7 +1190,7 @@ update_cache_item(struct btrfs_trans_handle *trans,
btrfs_set_free_space_entries(leaf, header, entries);
btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
btrfs_set_free_space_generation(leaf, header, trans->transid);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
return 0;
@@ -1321,7 +1326,7 @@ out:
"failed to write free space cache for block group %llu error %d",
block_group->start, ret);
}
- btrfs_update_inode(trans, root, BTRFS_I(inode));
+ btrfs_update_inode(trans, BTRFS_I(inode));
if (block_group) {
/* the dirty list is protected by the dirty_bgs_lock */
@@ -1362,7 +1367,6 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
/*
* Write out cached info to an inode.
*
- * @root: root the inode belongs to
* @inode: freespace inode we are writing out
* @ctl: free space cache we are going to write out
* @block_group: block_group for this cache if it belongs to a block_group
@@ -1373,7 +1377,7 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
* on mount. This will return 0 if it was successful in writing the cache out,
* or an errno if it was not.
*/
-static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
+static int __btrfs_write_out_cache(struct inode *inode,
struct btrfs_free_space_ctl *ctl,
struct btrfs_block_group *block_group,
struct btrfs_io_ctl *io_ctl,
@@ -1506,7 +1510,7 @@ out:
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
}
- btrfs_update_inode(trans, root, BTRFS_I(inode));
+ btrfs_update_inode(trans, BTRFS_I(inode));
if (must_iput)
iput(inode);
return ret;
@@ -1532,8 +1536,8 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
if (IS_ERR(inode))
return 0;
- ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
- block_group, &block_group->io_ctl, trans);
+ ret = __btrfs_write_out_cache(inode, ctl, block_group,
+ &block_group->io_ctl, trans);
if (ret) {
btrfs_debug(fs_info,
"failed to write free space cache for block group %llu error %d",
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index c0e734082dcc..7b598b070700 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -89,7 +89,7 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans,
struct btrfs_free_space_info);
btrfs_set_free_space_extent_count(leaf, info, 0);
btrfs_set_free_space_flags(leaf, info, 0);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
ret = 0;
out:
@@ -287,7 +287,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
flags |= BTRFS_FREE_SPACE_USING_BITMAPS;
btrfs_set_free_space_flags(leaf, info, flags);
expected_extent_count = btrfs_free_space_extent_count(leaf, info);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
if (extent_count != expected_extent_count) {
@@ -324,7 +324,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
write_extent_buffer(leaf, bitmap_cursor, ptr,
data_size);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
i += extent_size;
@@ -430,7 +430,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS;
btrfs_set_free_space_flags(leaf, info, flags);
expected_extent_count = btrfs_free_space_extent_count(leaf, info);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
nrbits = block_group->length >> block_group->fs_info->sectorsize_bits;
@@ -495,7 +495,7 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
extent_count += new_extents;
btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_release_path(path);
if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
@@ -533,7 +533,8 @@ int free_space_test_bit(struct btrfs_block_group *block_group,
return !!extent_buffer_test_bit(leaf, ptr, i);
}
-static void free_space_set_bits(struct btrfs_block_group *block_group,
+static void free_space_set_bits(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 *start, u64 *size,
int bit)
{
@@ -563,7 +564,7 @@ static void free_space_set_bits(struct btrfs_block_group *block_group,
extent_buffer_bitmap_set(leaf, ptr, first, last - first);
else
extent_buffer_bitmap_clear(leaf, ptr, first, last - first);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
*size -= end - *start;
*start = end;
@@ -656,7 +657,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
cur_start = start;
cur_size = size;
while (1) {
- free_space_set_bits(block_group, path, &cur_start, &cur_size,
+ free_space_set_bits(trans, block_group, path, &cur_start, &cur_size,
!remove);
if (cur_size == 0)
break;
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index a523d64d5491..318df6f9d9cb 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -139,6 +139,12 @@ enum {
*/
BTRFS_FS_FEATURE_CHANGED,
+ /*
+ * Indicate that we have found a tree block which is only aligned to
+ * sectorsize, but not to nodesize. This should be rare nowadays.
+ */
+ BTRFS_FS_UNALIGNED_TREE_BLOCK,
+
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
@@ -171,19 +177,17 @@ enum {
BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16),
BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17),
BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18),
- BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19),
- BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20),
- BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21),
- BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22),
- BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23),
- BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24),
- BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25),
- BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26),
- BTRFS_MOUNT_REF_VERIFY = (1UL << 27),
- BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28),
- BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29),
- BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30),
- BTRFS_MOUNT_NODISCARD = (1UL << 31),
+ BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 19),
+ BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 20),
+ BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 21),
+ BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 22),
+ BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 23),
+ BTRFS_MOUNT_NOLOGREPLAY = (1UL << 24),
+ BTRFS_MOUNT_REF_VERIFY = (1UL << 25),
+ BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 26),
+ BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 27),
+ BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 28),
+ BTRFS_MOUNT_NODISCARD = (1UL << 29),
};
/*
@@ -216,7 +220,8 @@ enum {
BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
- BTRFS_FEATURE_INCOMPAT_ZONED)
+ BTRFS_FEATURE_INCOMPAT_ZONED | \
+ BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA)
#ifdef CONFIG_BTRFS_DEBUG
/*
@@ -225,6 +230,7 @@ enum {
*/
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \
+ BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \
BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
#else
@@ -369,6 +375,7 @@ struct btrfs_fs_info {
struct btrfs_root *uuid_root;
struct btrfs_root *data_reloc_root;
struct btrfs_root *block_group_root;
+ struct btrfs_root *stripe_root;
/* The log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
@@ -409,7 +416,17 @@ struct btrfs_fs_info {
struct btrfs_block_rsv empty_block_rsv;
+ /*
+ * Updated while holding the lock 'trans_lock'. Due to the life cycle of
+ * a transaction, it can be read directly while holding a transaction
+ * handle; everywhere else it must be read with btrfs_get_fs_generation().
+ * It should always be updated using btrfs_set_fs_generation().
+ */
u64 generation;
+ /*
+ * Always use btrfs_get_last_trans_committed() and
+ * btrfs_set_last_trans_committed() to read and update this field.
+ */
u64 last_trans_committed;
/*
* Generation of the last transaction used for block group relocation
@@ -645,9 +662,6 @@ struct btrfs_fs_info {
struct btrfs_discard_ctl discard_ctl;
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- u32 check_integrity_print_mask;
-#endif
/* Is qgroup tracking in a consistent state? */
u64 qgroup_flags;
@@ -683,6 +697,7 @@ struct btrfs_fs_info {
/* Protected by qgroup_rescan_lock */
bool qgroup_rescan_running;
u8 qgroup_drop_subtree_thres;
+ u64 qgroup_enable_gen;
/*
* If this is not 0, then it indicates a serious filesystem error has
@@ -812,6 +827,26 @@ struct btrfs_fs_info {
#endif
};
+static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info)
+{
+ return READ_ONCE(fs_info->generation);
+}
+
+static inline void btrfs_set_fs_generation(struct btrfs_fs_info *fs_info, u64 gen)
+{
+ WRITE_ONCE(fs_info->generation, gen);
+}
+
+static inline u64 btrfs_get_last_trans_committed(const struct btrfs_fs_info *fs_info)
+{
+ return READ_ONCE(fs_info->last_trans_committed);
+}
+
+static inline void btrfs_set_last_trans_committed(struct btrfs_fs_info *fs_info, u64 gen)
+{
+ WRITE_ONCE(fs_info->last_trans_committed, gen);
+}
+
static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info,
u64 gen)
{
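A hypothetical usage sketch for the new accessors: outside of a transaction the generation must go through the READ_ONCE-based helper, while code that holds a transaction handle may keep reading fs_info->generation (or trans->transid) directly, as the new comments in struct btrfs_fs_info spell out.

static bool example_is_current_generation(struct btrfs_fs_info *fs_info, u64 gen)
{
	/* Safe without a transaction handle: pairs with btrfs_set_fs_generation(). */
	return gen == btrfs_get_fs_generation(fs_info);
}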
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 4c322b720a80..7d734830e514 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -167,7 +167,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
memmove_extent_buffer(leaf, ptr, ptr + del_len,
item_size - (ptr + del_len - item_start));
- btrfs_truncate_item(path, item_size - del_len, 1);
+ btrfs_truncate_item(trans, path, item_size - del_len, 1);
out:
btrfs_free_path(path);
@@ -229,7 +229,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_size - (ptr + sub_item_len - item_start));
- btrfs_truncate_item(path, item_size - sub_item_len, 1);
+ btrfs_truncate_item(trans, path, item_size - sub_item_len, 1);
out:
btrfs_free_path(path);
@@ -247,7 +247,7 @@ out:
}
/*
- * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
+ * Insert an extended inode ref into a tree.
*
* The caller must have checked against BTRFS_LINK_MAX already.
*/
@@ -282,7 +282,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
name))
goto out;
- btrfs_extend_item(path, ins_len);
+ btrfs_extend_item(trans, path, ins_len);
ret = 0;
}
if (ret < 0)
@@ -299,7 +299,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)&extref->name;
write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
out:
btrfs_free_path(path);
@@ -338,7 +338,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
goto out;
old_size = btrfs_item_size(path->nodes[0], path->slots[0]);
- btrfs_extend_item(path, ins_len);
+ btrfs_extend_item(trans, path, ins_len);
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
@@ -364,7 +364,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)(ref + 1);
}
write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
out:
btrfs_free_path(path);
@@ -591,7 +591,7 @@ search_again:
num_dec = (orig_num_bytes - extent_num_bytes);
if (extent_start != 0)
control->sub_bytes += num_dec;
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
} else {
extent_num_bytes =
btrfs_file_extent_disk_num_bytes(leaf, fi);
@@ -617,7 +617,7 @@ search_again:
btrfs_set_file_extent_ram_bytes(leaf, fi, size);
size = btrfs_file_extent_calc_inline_size(size);
- btrfs_truncate_item(path, size, 1);
+ btrfs_truncate_item(trans, path, size, 1);
} else if (!del_item) {
/*
* We have to bail so the last_size is set to
@@ -676,7 +676,8 @@ delete:
bytes_deleted += extent_num_bytes;
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
- extent_start, extent_num_bytes, 0);
+ extent_start, extent_num_bytes, 0,
+ root->root_key.objectid);
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
control->ino, extent_offset,
root->root_key.objectid, false);
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index ede43b6c6559..4337bb26f419 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -4,6 +4,7 @@
#define BTRFS_INODE_ITEM_H
#include <linux/types.h>
+#include <linux/crc32c.h>
struct btrfs_trans_handle;
struct btrfs_root;
@@ -12,6 +13,7 @@ struct btrfs_key;
struct btrfs_inode_extref;
struct btrfs_inode;
struct extent_buffer;
+struct fscrypt_str;
/*
* Return this if we need to call truncate_block for the last bit of the
@@ -76,6 +78,12 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags,
*ro_flags = (u32)(inode_item_flags >> 32);
}
+/* Figure the key offset of an extended inode ref. */
+static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, int len)
+{
+ return (u64)crc32c(parent_objectid, name, len);
+}
+
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_truncate_control *control);
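An illustrative sketch of how the relocated btrfs_extref_hash() is typically consumed: the hash becomes the key offset used to locate an INODE_EXTREF item. The key layout is assumed from existing btrfs code, not introduced by this patch, and the function name is hypothetical.

static void example_extref_key(struct btrfs_key *key, u64 ino, u64 parent_objectid,
			       const struct fscrypt_str *name)
{
	key->objectid = ino;
	key->type = BTRFS_INODE_EXTREF_KEY;
	key->offset = btrfs_extref_hash(parent_objectid,
					(const char *)name->name, name->len);
}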
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7814b9d654ce..fb3c3f43c3fa 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -71,6 +71,7 @@
#include "super.h"
#include "orphan.h"
#include "backref.h"
+#include "raid-stripe-tree.h"
struct btrfs_iget_args {
u64 ino;
@@ -348,7 +349,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
}
/*
- * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
+ * Lock inode i_rwsem based on arguments passed.
*
* ilock_flags can have the following bit set:
*
@@ -382,7 +383,7 @@ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
}
/*
- * btrfs_inode_unlock - unock inode i_rwsem
+ * Unlock inode i_rwsem.
*
* ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
* to decide whether the lock acquired is shared or exclusive.
@@ -573,7 +574,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
kunmap_local(kaddr);
put_page(page);
}
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
/*
@@ -670,7 +671,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
}
btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -687,7 +688,7 @@ out:
* And at reserve time, it's always aligned to page size, so
* just free one page here.
*/
- btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
+ btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL);
btrfs_free_path(path);
btrfs_end_transaction(trans);
return ret;
@@ -1565,8 +1566,11 @@ out_unlock:
* Phase two of compressed writeback. This is the ordered portion of the code,
* which only gets called in the order the work was queued. We walk all the
* async extents created by compress_file_range and send them down to the disk.
+ *
+ * If called with @do_free == true, no extents are submitted; the function
+ * only drops the references held by the async chunk and frees the containing
+ * async_cow once its last chunk has been processed.
*/
-static noinline void submit_compressed_extents(struct btrfs_work *work)
+static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free)
{
struct async_chunk *async_chunk = container_of(work, struct async_chunk,
work);
@@ -1575,6 +1579,21 @@ static noinline void submit_compressed_extents(struct btrfs_work *work)
unsigned long nr_pages;
u64 alloc_hint = 0;
+ if (do_free) {
+ struct async_chunk *async_chunk;
+ struct async_cow *async_cow;
+
+ async_chunk = container_of(work, struct async_chunk, work);
+ btrfs_add_delayed_iput(async_chunk->inode);
+ if (async_chunk->blkcg_css)
+ css_put(async_chunk->blkcg_css);
+
+ async_cow = async_chunk->async_cow;
+ if (atomic_dec_and_test(&async_cow->num_chunks))
+ kvfree(async_cow);
+ return;
+ }
+
nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
PAGE_SHIFT;
@@ -1591,21 +1610,6 @@ static noinline void submit_compressed_extents(struct btrfs_work *work)
cond_wake_up_nomb(&fs_info->async_submit_wait);
}
-static noinline void async_cow_free(struct btrfs_work *work)
-{
- struct async_chunk *async_chunk;
- struct async_cow *async_cow;
-
- async_chunk = container_of(work, struct async_chunk, work);
- btrfs_add_delayed_iput(async_chunk->inode);
- if (async_chunk->blkcg_css)
- css_put(async_chunk->blkcg_css);
-
- async_cow = async_chunk->async_cow;
- if (atomic_dec_and_test(&async_cow->num_chunks))
- kvfree(async_cow);
-}
-
static bool run_delalloc_compressed(struct btrfs_inode *inode,
struct page *locked_page, u64 start,
u64 end, struct writeback_control *wbc)
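A sketch of the reworked callback wiring, with the ordered-callback signature inferred from the submit_compressed_extents() hunk above: the former dedicated free callback is folded into the ordered callback, which is invoked one last time with do_free == true. The struct and function names below are hypothetical.

struct example_job {
	struct btrfs_work work;
	/* ... caller data ... */
};

static void example_worker(struct btrfs_work *work)
{
	/* Normal (unordered) processing. */
}

static void example_ordered(struct btrfs_work *work, bool do_free)
{
	struct example_job *job = container_of(work, struct example_job, work);

	if (do_free) {
		/* Final invocation: only release resources, do no work. */
		kfree(job);
		return;
	}
	/* Ordered-completion processing would go here. */
}

/*
 * Wired up with the three-argument form used throughout this patch:
 *	btrfs_init_work(&job->work, example_worker, example_ordered);
 */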
@@ -1683,7 +1687,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
}
btrfs_init_work(&async_chunk[i].work, compress_file_range,
- submit_compressed_extents, async_cow_free);
+ submit_compressed_extents);
nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
atomic_add(nr_pages, &fs_info->async_delalloc_pages);
@@ -2235,8 +2239,7 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
{
if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
if (inode->defrag_bytes &&
- test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
- 0, NULL))
+ test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
return false;
return true;
}
@@ -2847,7 +2850,7 @@ int btrfs_writepage_cow_fixup(struct page *page)
ihold(inode);
btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
get_page(page);
- btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
+ btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
fixup->page = page;
fixup->inode = BTRFS_I(inode);
btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
@@ -2912,7 +2915,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(struct btrfs_file_extent_item));
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
/*
@@ -3070,7 +3073,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
trans->block_rsv = &inode->block_rsv;
- ret = btrfs_update_inode_fallback(trans, root, inode);
+ ret = btrfs_update_inode_fallback(trans, inode);
if (ret) /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3091,6 +3094,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
trans->block_rsv = &inode->block_rsv;
+ ret = btrfs_insert_raid_extent(trans, ordered_extent);
+ if (ret)
+ goto out;
+
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
@@ -3136,7 +3143,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
&cached_state);
btrfs_inode_safe_disk_i_size_write(inode, 0);
- ret = btrfs_update_inode_fallback(trans, root, inode);
+ ret = btrfs_update_inode_fallback(trans, inode);
if (ret) { /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3224,7 +3231,8 @@ out:
int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
{
if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) &&
- !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
+ !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
+ list_empty(&ordered->bioc_list))
btrfs_finish_ordered_zoned(ordered);
return btrfs_finish_one_ordered(ordered);
}
@@ -3282,7 +3290,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
if (btrfs_is_data_reloc_root(inode->root) &&
test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
- 1, NULL)) {
+ NULL)) {
/* Skip the range without csum for data reloc inode */
clear_extent_bits(&inode->io_tree, file_offset, end,
EXTENT_NODATASUM);
@@ -3306,7 +3314,7 @@ zeroit:
}
/*
- * btrfs_add_delayed_iput - perform a delayed iput on @inode
+ * Perform a delayed iput on @inode.
*
* @inode: The inode we want to perform iput on
*
@@ -3754,19 +3762,17 @@ static int btrfs_read_locked_inode(struct inode *inode,
btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
round_up(i_size_read(inode), fs_info->sectorsize));
- inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
- inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
+ inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime),
+ btrfs_timespec_nsec(leaf, &inode_item->atime));
- inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
- inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
+ inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
+ btrfs_timespec_nsec(leaf, &inode_item->mtime));
inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
btrfs_timespec_nsec(leaf, &inode_item->ctime));
- BTRFS_I(inode)->i_otime.tv_sec =
- btrfs_timespec_sec(leaf, &inode_item->otime);
- BTRFS_I(inode)->i_otime.tv_nsec =
- btrfs_timespec_nsec(leaf, &inode_item->otime);
+ BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
+ BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
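For readers following the timestamp conversion in this hunk, a compact sketch of the accessor mapping (the helpers are the generic VFS ones used by the + lines; i_otime_sec/i_otime_nsec are the split fields replacing the old struct timespec64 i_otime in btrfs_inode):

/*
 * Accessor mapping used in this conversion (sketch):
 *
 *	inode->i_atime = ts           ->  inode_set_atime(inode, ts.tv_sec, ts.tv_nsec)
 *	ts = inode->i_mtime           ->  ts = inode_get_mtime(inode)
 *	inode->i_mtime = ctime (now)  ->  inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode))
 *	BTRFS_I(inode)->i_otime       ->  BTRFS_I(inode)->i_otime_sec / i_otime_nsec
 */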
@@ -3792,7 +3798,7 @@ cache_index:
* This is required for both inode re-read from disk and delayed inode
* in delayed_nodes_tree.
*/
- if (BTRFS_I(inode)->last_trans == fs_info->generation)
+ if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
@@ -3922,24 +3928,22 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
btrfs_set_token_timespec_sec(&token, &item->atime,
- inode->i_atime.tv_sec);
+ inode_get_atime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->atime,
- inode->i_atime.tv_nsec);
+ inode_get_atime_nsec(inode));
btrfs_set_token_timespec_sec(&token, &item->mtime,
- inode->i_mtime.tv_sec);
+ inode_get_mtime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->mtime,
- inode->i_mtime.tv_nsec);
+ inode_get_mtime_nsec(inode));
btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode_get_ctime(inode).tv_sec);
+ inode_get_ctime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode_get_ctime(inode).tv_nsec);
+ inode_get_ctime_nsec(inode));
- btrfs_set_token_timespec_sec(&token, &item->otime,
- BTRFS_I(inode)->i_otime.tv_sec);
- btrfs_set_token_timespec_nsec(&token, &item->otime,
- BTRFS_I(inode)->i_otime.tv_nsec);
+ btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec);
btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
btrfs_set_token_inode_generation(&token, item,
@@ -3957,8 +3961,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
* copy everything in the in-memory inode into the btree.
*/
static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *inode)
+ struct btrfs_inode *inode)
{
struct btrfs_inode_item *inode_item;
struct btrfs_path *path;
@@ -3969,7 +3972,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
+ ret = btrfs_lookup_inode(trans, inode->root, path, &inode->location, 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -3981,7 +3984,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item);
fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_set_inode_last_trans(trans, inode);
ret = 0;
failed:
@@ -3992,10 +3995,10 @@ failed:
/*
* copy everything in the in-memory inode into the btree.
*/
-noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *inode)
+int btrfs_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
@@ -4011,23 +4014,23 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
&& !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
btrfs_update_root_times(trans, root);
- ret = btrfs_delayed_update_inode(trans, root, inode);
+ ret = btrfs_delayed_update_inode(trans, inode);
if (!ret)
btrfs_set_inode_last_trans(trans, inode);
return ret;
}
- return btrfs_update_inode_item(trans, root, inode);
+ return btrfs_update_inode_item(trans, inode);
}
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode)
+ struct btrfs_inode *inode)
{
int ret;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret == -ENOSPC)
- return btrfs_update_inode_item(trans, root, inode);
+ return btrfs_update_inode_item(trans, inode);
return ret;
}
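A brief note on the signature change that recurs through the rest of this patch: btrfs_update_inode() and its fallback no longer take a root because it can be derived from the inode itself, so a typical call site reduces to the sketch below.

	/* root is taken from inode->root inside the helper */
	ret = btrfs_update_inode(trans, BTRFS_I(inode));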
@@ -4132,9 +4135,8 @@ err:
btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
inode_inc_iversion(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
- inode_set_ctime_current(&inode->vfs_inode);
- dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
- ret = btrfs_update_inode(trans, root, dir);
+ inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
+ ret = btrfs_update_inode(trans, dir);
out:
return ret;
}
@@ -4148,7 +4150,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
if (!ret) {
drop_nlink(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, inode->root, inode);
+ ret = btrfs_update_inode(trans, inode);
}
return ret;
}
@@ -4306,8 +4308,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
inode_inc_iversion(&dir->vfs_inode);
- dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
- ret = btrfs_update_inode_fallback(trans, root, dir);
+ inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
+ ret = btrfs_update_inode_fallback(trans, dir);
if (ret)
btrfs_abort_transaction(trans, ret);
out:
@@ -4641,7 +4643,8 @@ out_notrans:
}
/*
- * btrfs_truncate_block - read, zero a chunk and write a block
+ * Read, zero a chunk and write a block.
+ *
* @inode - inode that we're zeroing
* @from - the offset to start zeroing
* @len - the length to zero, 0 to zero the entire range respective to the
@@ -4791,9 +4794,9 @@ out:
return ret;
}
-static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
- u64 offset, u64 len)
+static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
struct btrfs_drop_extents_args drop_args = { 0 };
@@ -4833,7 +4836,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
btrfs_abort_transaction(trans, ret);
} else {
btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
- btrfs_update_inode(trans, root, inode);
+ btrfs_update_inode(trans, inode);
}
btrfs_end_transaction(trans);
return ret;
@@ -4889,8 +4892,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
struct extent_map *hole_em;
- err = maybe_insert_hole(root, inode, cur_offset,
- hole_size);
+ err = maybe_insert_hole(inode, cur_offset, hole_size);
if (err)
break;
@@ -4916,7 +4918,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
hole_em->orig_block_len = 0;
hole_em->ram_bytes = hole_size;
hole_em->compress_type = BTRFS_COMPRESS_NONE;
- hole_em->generation = fs_info->generation;
+ hole_em->generation = btrfs_get_fs_generation(fs_info);
err = btrfs_replace_extent_map_range(inode, hole_em, true);
free_extent_map(hole_em);
@@ -4956,7 +4958,8 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
if (newsize != oldsize) {
inode_inc_iversion(inode);
if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_current(inode));
}
}
@@ -4984,7 +4987,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
i_size_write(inode, newsize);
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
pagecache_isize_extended(inode, oldsize, newsize);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_end_transaction(trans);
} else {
@@ -5129,7 +5132,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
*/
if (state_flags & EXTENT_DELALLOC)
btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
- end - start + 1);
+ end - start + 1, NULL);
clear_extent_bit(io_tree, start, end,
EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
@@ -5582,6 +5585,7 @@ static struct inode *new_simple_dir(struct inode *dir,
struct btrfs_key *key,
struct btrfs_root *root)
{
+ struct timespec64 ts;
struct inode *inode = new_inode(dir->i_sb);
if (!inode)
@@ -5600,9 +5604,13 @@ static struct inode *new_simple_dir(struct inode *dir,
inode->i_opflags &= ~IOP_XATTR;
inode->i_fop = &simple_dir_operations;
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
- inode->i_mtime = inode_set_ctime_current(inode);
- inode->i_atime = dir->i_atime;
- BTRFS_I(inode)->i_otime = inode->i_mtime;
+
+ ts = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, ts);
+ inode_set_atime_to_ts(inode, inode_get_atime(dir));
+ BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
+ BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
+
inode->i_uid = dir->i_uid;
inode->i_gid = dir->i_gid;
@@ -6000,15 +6008,15 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode)
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_update_inode(trans, root, inode);
- if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
+ ret = btrfs_update_inode(trans, inode);
+ if (ret == -ENOSPC || ret == -EDQUOT) {
/* whoops, lets try again with the full transaction */
btrfs_end_transaction(trans);
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
}
btrfs_end_transaction(trans);
if (inode->delayed_node)
@@ -6024,7 +6032,7 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode)
static int btrfs_update_time(struct inode *inode, int flags)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- bool dirty = flags & ~S_VERSION;
+ bool dirty;
if (btrfs_root_readonly(root))
return -EROFS;
@@ -6160,6 +6168,7 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_new_inode_args *args)
{
+ struct timespec64 ts;
struct inode *dir = args->dir;
struct inode *inode = args->inode;
const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
@@ -6277,9 +6286,9 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
goto discard;
}
- inode->i_mtime = inode_set_ctime_current(inode);
- inode->i_atime = inode->i_mtime;
- BTRFS_I(inode)->i_otime = inode->i_mtime;
+ ts = simple_inode_init_ts(inode);
+ BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
+ BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
/*
* We're going to fill the inode item now, so at this point the inode
@@ -6310,7 +6319,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
}
}
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
/*
* We don't need the path anymore, plus inheriting properties, adding
* ACLs, security xattrs, orphan item or adding the link, will result in
@@ -6444,10 +6453,10 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
* values (the ones it had when the fsync was done).
*/
if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
- parent_inode->vfs_inode.i_mtime =
- inode_set_ctime_current(&parent_inode->vfs_inode);
+ inode_set_mtime_to_ts(&parent_inode->vfs_inode,
+ inode_set_ctime_current(&parent_inode->vfs_inode));
- ret = btrfs_update_inode(trans, root, parent_inode);
+ ret = btrfs_update_inode(trans, parent_inode);
if (ret)
btrfs_abort_transaction(trans, ret);
return ret;
@@ -6598,7 +6607,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
} else {
struct dentry *parent = dentry->d_parent;
- err = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ err = btrfs_update_inode(trans, BTRFS_I(inode));
if (err)
goto fail;
if (inode->i_nlink == 1) {
@@ -6974,8 +6983,15 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
int ret;
alloc_hint = get_extent_allocation_hint(inode, start, len);
+again:
ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
0, alloc_hint, &ins, 1, 1);
+ if (ret == -EAGAIN) {
+ ASSERT(btrfs_is_zoned(fs_info));
+ wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
+ TASK_UNINTERRUPTIBLE);
+ goto again;
+ }
if (ret)
return ERR_PTR(ret);
@@ -7103,8 +7119,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
range_end = round_up(offset + nocow_args.num_bytes,
root->fs_info->sectorsize) - 1;
- ret = test_range_bit(io_tree, offset, range_end,
- EXTENT_DELALLOC, 0, NULL);
+ ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC);
if (ret) {
ret = -EAGAIN;
goto out;
@@ -8005,11 +8020,11 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, &cached_state);
- spin_lock_irq(&inode->ordered_tree.lock);
+ spin_lock_irq(&inode->ordered_tree_lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
ordered->truncated_len = min(ordered->truncated_len,
cur - ordered->file_offset);
- spin_unlock_irq(&inode->ordered_tree.lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
/*
* If the ordered extent has finished, we're safe to delete all
@@ -8044,7 +8059,7 @@ next:
* reserved data space.
* Since the IO will never happen for this page.
*/
- btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur);
+ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
if (!inode_evicting) {
clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_UPTODATE |
@@ -8339,7 +8354,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
if (ret != -ENOSPC && ret != -EAGAIN)
break;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret)
break;
@@ -8392,7 +8407,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
int ret2;
trans->block_rsv = &fs_info->trans_block_rsv;
- ret2 = btrfs_update_inode(trans, root, inode);
+ ret2 = btrfs_update_inode(trans, inode);
if (ret2 && !ret)
ret = ret2;
@@ -8481,8 +8496,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->delayed_node = NULL;
- ei->i_otime.tv_sec = 0;
- ei->i_otime.tv_nsec = 0;
+ ei->i_otime_sec = 0;
+ ei->i_otime_nsec = 0;
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
@@ -8491,7 +8506,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
extent_io_tree_init(fs_info, &ei->file_extent_tree,
IO_TREE_INODE_FILE_EXTENT);
mutex_init(&ei->log_mutex);
- btrfs_ordered_inode_tree_init(&ei->ordered_tree);
+ spin_lock_init(&ei->ordered_tree_lock);
+ ei->ordered_tree = RB_ROOT;
+ ei->ordered_tree_last = NULL;
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
RB_CLEAR_NODE(&ei->rb_node);
@@ -8634,8 +8651,8 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
stat->result_mask |= STATX_BTIME;
- stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
- stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
+ stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec;
+ stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec;
if (bi_flags & BTRFS_INODE_APPEND)
stat->attributes |= STATX_ATTR_APPEND;
if (bi_flags & BTRFS_INODE_COMPRESS)
@@ -8823,7 +8840,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(old_dentry->d_inode),
old_name, &old_rename_ctx);
if (!ret)
- ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -8838,7 +8855,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_dentry->d_inode),
new_name, &new_rename_ctx);
if (!ret)
- ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9083,7 +9100,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
BTRFS_I(d_inode(old_dentry)),
&old_fname.disk_name, &rename_ctx);
if (!ret)
- ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9208,7 +9225,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
- btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
+ btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL);
return work;
}
@@ -9446,7 +9463,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
ptr = btrfs_file_extent_inline_start(ei);
write_extent_buffer(leaf, symname, ptr, name_len);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_free_path(path);
d_instantiate_new(dentry, inode);
@@ -9474,7 +9491,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
struct btrfs_path *path;
u64 start = ins->objectid;
u64 len = ins->offset;
- int qgroup_released;
+ u64 qgroup_released = 0;
int ret;
memset(&stack_fi, 0, sizeof(stack_fi));
@@ -9487,9 +9504,9 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
/* Encryption and other encoding is reserved and all 0 */
- qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
- if (qgroup_released < 0)
- return ERR_PTR(qgroup_released);
+ ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released);
+ if (ret < 0)
+ return ERR_PTR(ret);
if (trans) {
ret = insert_reserved_file_extent(trans, inode,
@@ -9639,7 +9656,7 @@ next:
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
}
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -10384,7 +10401,7 @@ out_delalloc_release:
btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
out_qgroup_free_data:
if (ret < 0)
- btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes);
+ btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
out_free_data_space:
/*
* If btrfs_reserve_extent() succeeded, then we already decremented
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8e7d03bc1b56..a1743904202b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -385,7 +385,7 @@ update_flags:
btrfs_sync_inode_flags_to_i_flags(inode);
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
out_end_trans:
btrfs_end_transaction(trans);
@@ -652,18 +652,18 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
/* Tree log can't currently deal with an inode which is a new root. */
btrfs_set_log_full_commit(trans);
- ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit);
+ ret = btrfs_qgroup_inherit(trans, 0, objectid, root->root_key.objectid, inherit);
if (ret)
goto out;
leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
- BTRFS_NESTING_NORMAL);
+ 0, BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
goto out;
}
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
inode_item = &root_item->inode;
btrfs_set_stack_inode_generation(inode_item, 1);
@@ -1290,6 +1290,15 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
* are limited to own subvolumes only
*/
ret = -EPERM;
+ } else if (btrfs_ino(BTRFS_I(src_inode)) != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * Snapshots must be made with the src_inode referring
+ * to the subvolume inode, otherwise the permission
+ * checking above is useless because we may have
+ * permission on a lower directory but not the subvol
+ * itself.
+ */
+ ret = -EINVAL;
} else {
ret = btrfs_mksnapshot(&file->f_path, idmap,
name, namelen,
@@ -1528,7 +1537,7 @@ static noinline int key_in_sk(struct btrfs_key *key,
static noinline int copy_to_sk(struct btrfs_path *path,
struct btrfs_key *key,
struct btrfs_ioctl_search_key *sk,
- size_t *buf_size,
+ u64 *buf_size,
char __user *ubuf,
unsigned long *sk_offset,
int *num_found)
@@ -1660,7 +1669,7 @@ out:
static noinline int search_ioctl(struct inode *inode,
struct btrfs_ioctl_search_key *sk,
- size_t *buf_size,
+ u64 *buf_size,
char __user *ubuf)
{
struct btrfs_fs_info *info = btrfs_sb(inode->i_sb);
@@ -1733,7 +1742,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
struct btrfs_ioctl_search_args __user *uargs = argp;
struct btrfs_ioctl_search_key sk;
int ret;
- size_t buf_size;
+ u64 buf_size;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1763,8 +1772,8 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
struct btrfs_ioctl_search_args_v2 args;
int ret;
- size_t buf_size;
- const size_t buf_limit = SZ_16M;
+ u64 buf_size;
+ const u64 buf_limit = SZ_16M;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -2635,6 +2644,12 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
return -EINVAL;
}
+ if (fs_info->fs_devices->temp_fsid) {
+ btrfs_err(fs_info,
+ "device add not supported on cloned temp-fsid mount");
+ return -EINVAL;
+ }
+
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) {
if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD))
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
@@ -2676,8 +2691,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args_v2 *vol_args;
- struct block_device *bdev = NULL;
- void *holder;
+ struct bdev_handle *bdev_handle = NULL;
int ret;
bool cancel = false;
@@ -2714,7 +2728,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
goto err_drop;
/* Exclusive operation is now claimed */
- ret = btrfs_rm_device(fs_info, &args, &bdev, &holder);
+ ret = btrfs_rm_device(fs_info, &args, &bdev_handle);
btrfs_exclop_finish(fs_info);
@@ -2728,8 +2742,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
}
err_drop:
mnt_drop_write_file(file);
- if (bdev)
- blkdev_put(bdev, holder);
+ if (bdev_handle)
+ bdev_release(bdev_handle);
out:
btrfs_put_dev_args_from_path(&args);
kfree(vol_args);
@@ -2742,8 +2756,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args *vol_args;
- struct block_device *bdev = NULL;
- void *holder;
+ struct bdev_handle *bdev_handle = NULL;
int ret;
bool cancel = false;
@@ -2770,15 +2783,15 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret == 0) {
- ret = btrfs_rm_device(fs_info, &args, &bdev, &holder);
+ ret = btrfs_rm_device(fs_info, &args, &bdev_handle);
if (!ret)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
btrfs_exclop_finish(fs_info);
}
mnt_drop_write_file(file);
- if (bdev)
- blkdev_put(bdev, holder);
+ if (bdev_handle)
+ bdev_release(bdev_handle);
out:
btrfs_put_dev_args_from_path(&args);
kfree(vol_args);
@@ -2822,7 +2835,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
}
if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) {
- fi_args->generation = fs_info->generation;
+ fi_args->generation = btrfs_get_fs_generation(fs_info);
fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION;
}
@@ -2947,7 +2960,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_release_path(path);
btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
@@ -3131,7 +3144,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
return PTR_ERR(trans);
/* No running transaction, don't bother */
- transid = root->fs_info->last_trans_committed;
+ transid = btrfs_get_last_trans_committed(root->fs_info);
goto out;
}
transid = trans->transid;
@@ -3697,7 +3710,8 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
switch (sa->cmd) {
case BTRFS_QUOTA_CTL_ENABLE:
- ret = btrfs_quota_enable(fs_info);
+ case BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA:
+ ret = btrfs_quota_enable(fs_info, sa);
break;
case BTRFS_QUOTA_CTL_DISABLE:
ret = btrfs_quota_disable(fs_info);
@@ -4351,6 +4365,7 @@ static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat
arg->clone_sources = compat_ptr(args32.clone_sources);
arg->parent_root = args32.parent_root;
arg->flags = args32.flags;
+ arg->version = args32.version;
memcpy(arg->reserved, args32.reserved,
sizeof(args32.reserved));
#else
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 7979449a58d6..74d8e2003f58 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -8,6 +8,7 @@
#include <linux/spinlock.h>
#include <linux/page-flags.h>
#include <asm/bug.h>
+#include <trace/events/btrfs.h>
#include "misc.h"
#include "ctree.h"
#include "extent_io.h"
@@ -73,6 +74,7 @@ static struct btrfs_lockdep_keyset {
{ .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") },
{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
{ .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
+ { .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") },
{ .id = 0, DEFINE_NAME("tree") },
};
@@ -102,6 +104,15 @@ void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buff
#endif
+#ifdef CONFIG_BTRFS_DEBUG
+static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner)
+{
+ eb->lock_owner = owner;
+}
+#else
+static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { }
+#endif
+
/*
* Extent buffer locking
* =====================
@@ -164,7 +175,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
if (down_write_trylock(&eb->lock)) {
- eb->lock_owner = current->pid;
+ btrfs_set_eb_lock_owner(eb, current->pid);
trace_btrfs_try_tree_write_lock(eb);
return 1;
}
@@ -181,7 +192,8 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
}
/*
- * __btrfs_tree_lock - lock eb for write
+ * Lock eb for write.
+ *
* @eb: the eb to lock
* @nest: the nesting to use for the lock
*
@@ -196,7 +208,7 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
start_ns = ktime_get_ns();
down_write_nested(&eb->lock, nest);
- eb->lock_owner = current->pid;
+ btrfs_set_eb_lock_owner(eb, current->pid);
trace_btrfs_tree_lock(eb, start_ns);
}
@@ -211,7 +223,7 @@ void btrfs_tree_lock(struct extent_buffer *eb)
void btrfs_tree_unlock(struct extent_buffer *eb)
{
trace_btrfs_tree_unlock(eb);
- eb->lock_owner = 0;
+ btrfs_set_eb_lock_owner(eb, 0);
up_write(&eb->lock);
}
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index 7695decc7243..b8f9c9e56c8c 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -72,11 +72,11 @@ static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf)
* over the error. Each subsequent error that doesn't have any context
* of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR.
*/
-const char * __attribute_const__ btrfs_decode_error(int errno)
+const char * __attribute_const__ btrfs_decode_error(int error)
{
char *errstr = "unknown";
- switch (errno) {
+ switch (error) {
case -ENOENT: /* -2 */
errstr = "No such entry";
break;
@@ -110,12 +110,12 @@ const char * __attribute_const__ btrfs_decode_error(int errno)
}
/*
- * __btrfs_handle_fs_error decodes expected errors from the caller and
- * invokes the appropriate error response.
+ * Decodes expected errors from the caller and invokes the appropriate error
+ * response.
*/
__cold
void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...)
+ unsigned int line, int error, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
#ifdef CONFIG_PRINTK
@@ -132,11 +132,11 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
* Special case: if the error is EROFS, and we're already under
* SB_RDONLY, then it is safe here.
*/
- if (errno == -EROFS && sb_rdonly(sb))
+ if (error == -EROFS && sb_rdonly(sb))
return;
#ifdef CONFIG_PRINTK
- errstr = btrfs_decode_error(errno);
+ errstr = btrfs_decode_error(error);
btrfs_state_to_string(fs_info, statestr);
if (fmt) {
struct va_format vaf;
@@ -147,11 +147,11 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
vaf.va = &args;
pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n",
- sb->s_id, statestr, function, line, errno, errstr, &vaf);
+ sb->s_id, statestr, function, line, error, errstr, &vaf);
va_end(args);
} else {
pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n",
- sb->s_id, statestr, function, line, errno, errstr);
+ sb->s_id, statestr, function, line, error, errstr);
}
#endif
@@ -159,7 +159,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
* Today we only save the error info to memory. Long term we'll also
* send it down to the disk.
*/
- WRITE_ONCE(fs_info->fs_error, errno);
+ WRITE_ONCE(fs_info->fs_error, error);
/* Don't go through full error handling during mount. */
if (!(sb->s_flags & SB_BORN))
@@ -283,12 +283,12 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
#endif
/*
- * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an
- * alert, and either panics or BUGs, depending on mount options.
+ * Decode unexpected, fatal errors from the caller, issue an alert, and either
+ * panic or BUG, depending on mount options.
*/
__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...)
+ unsigned int line, int error, const char *fmt, ...)
{
char *s_id = "<unknown>";
const char *errstr;
@@ -301,13 +301,13 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
va_start(args, fmt);
vaf.va = &args;
- errstr = btrfs_decode_error(errno);
+ errstr = btrfs_decode_error(error);
if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR)))
panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
- s_id, function, line, &vaf, errno, errstr);
+ s_id, function, line, &vaf, error, errstr);
btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
- function, line, &vaf, errno, errstr);
+ function, line, &vaf, error, errstr);
va_end(args);
/* Caller calls BUG() */
}
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index 1ae6f8e23e07..4d04c1fa5899 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -184,25 +184,25 @@ do { \
__printf(5, 6)
__cold
void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...);
+ unsigned int line, int error, const char *fmt, ...);
-const char * __attribute_const__ btrfs_decode_error(int errno);
+const char * __attribute_const__ btrfs_decode_error(int error);
-#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
+#define btrfs_handle_fs_error(fs_info, error, fmt, args...) \
__btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
- (errno), fmt, ##args)
+ (error), fmt, ##args)
__printf(5, 6)
__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...);
+ unsigned int line, int error, const char *fmt, ...);
/*
* If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
* will panic(). Otherwise we BUG() here.
*/
-#define btrfs_panic(fs_info, errno, fmt, args...) \
+#define btrfs_panic(fs_info, error, fmt, args...) \
do { \
- __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
+ __btrfs_panic(fs_info, __func__, __LINE__, error, fmt, ##args); \
BUG(); \
} while (0)
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 345c449d588c..a82e1417c4d2 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -124,25 +124,24 @@ static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
* Look for the first ordered struct that has this offset, otherwise
* the first one less than this offset.
*/
-static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
- u64 file_offset)
+static inline struct rb_node *ordered_tree_search(struct btrfs_inode *inode,
+ u64 file_offset)
{
- struct rb_root *root = &tree->tree;
struct rb_node *prev = NULL;
struct rb_node *ret;
struct btrfs_ordered_extent *entry;
- if (tree->last) {
- entry = rb_entry(tree->last, struct btrfs_ordered_extent,
+ if (inode->ordered_tree_last) {
+ entry = rb_entry(inode->ordered_tree_last, struct btrfs_ordered_extent,
rb_node);
if (in_range(file_offset, entry->file_offset, entry->num_bytes))
- return tree->last;
+ return inode->ordered_tree_last;
}
- ret = __tree_search(root, file_offset, &prev);
+ ret = __tree_search(&inode->ordered_tree, file_offset, &prev);
if (!ret)
ret = prev;
if (ret)
- tree->last = ret;
+ inode->ordered_tree_last = ret;
return ret;
}
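
The renamed ordered_tree_search() keeps the old lookup strategy: try the cached last-hit node (inode->ordered_tree_last) first, otherwise do the full search and remember the result. Below is a rough userspace sketch of that cached-lookup pattern using a sorted array instead of an rbtree; struct range, contains() and the data in main() are invented for illustration and are not btrfs code.

#include <stdio.h>

struct range { unsigned long start, len; };

/* Sorted, non-overlapping ranges standing in for the per-inode ordered tree. */
static struct range ranges[] = {
	{ 0, 4096 }, { 8192, 4096 }, { 16384, 8192 },
};
static int cached_last = -1;	/* plays the role of inode->ordered_tree_last */

static int contains(const struct range *r, unsigned long off)
{
	return off >= r->start && off < r->start + r->len;
}

/* Return the index of the range holding @off, else the last one below it. */
static int range_search(unsigned long off)
{
	int best = -1;

	/* Fast path: repeated lookups in the same range hit the cache. */
	if (cached_last >= 0 && contains(&ranges[cached_last], off))
		return cached_last;

	for (int i = 0; i < (int)(sizeof(ranges) / sizeof(ranges[0])); i++) {
		if (contains(&ranges[i], off)) {
			best = i;
			break;
		}
		if (ranges[i].start < off)
			best = i;
	}
	if (best >= 0)
		cached_last = best;	/* remember the hit, like ordered_tree_last */
	return best;
}

int main(void)
{
	printf("offset 9000 -> range %d\n", range_search(9000));	/* 1 */
	printf("offset 9001 -> range %d (cache hit)\n", range_search(9001));
	printf("offset 5000 -> range %d (preceding)\n", range_search(5000));
	return 0;
}
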
@@ -153,11 +152,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
{
struct btrfs_ordered_extent *entry;
int ret;
+ u64 qgroup_rsv = 0;
if (flags &
((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
/* For nocow write, we can release the qgroup rsv right now */
- ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
+ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv);
if (ret < 0)
return ERR_PTR(ret);
} else {
@@ -165,7 +165,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
* The ordered extent has reserved qgroup space, release now
* and pass the reserved number for qgroup_record to free.
*/
- ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
+ ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv);
if (ret < 0)
return ERR_PTR(ret);
}
@@ -183,7 +183,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
entry->inode = igrab(&inode->vfs_inode);
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
- entry->qgroup_rsv = ret;
+ entry->qgroup_rsv = qgroup_rsv;
entry->flags = flags;
refcount_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
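
btrfs_qgroup_release_data() and btrfs_qgroup_free_data() now return only an error code and hand the released byte count back through a u64 out-parameter, so alloc_ordered_extent() no longer has to carry a 64-bit count in its int return value. A minimal userspace sketch of that calling convention follows; release_bytes() and the numbers in main() are invented, not btrfs APIs.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Return 0 or a negative error; the released byte count comes back through
 * @released, so it is never truncated by the int return value.
 */
static int release_bytes(uint64_t reserved, uint64_t *released)
{
	if (reserved == 0)
		return -EINVAL;
	*released = reserved;		/* may exceed INT_MAX safely */
	return 0;
}

int main(void)
{
	uint64_t qgroup_rsv = 0;
	int ret;

	ret = release_bytes(3ULL << 31, &qgroup_rsv);	/* ~6 GiB, > INT_MAX */
	if (ret < 0) {
		fprintf(stderr, "release failed: %d\n", ret);
		return 1;
	}
	printf("released %llu bytes\n", (unsigned long long)qgroup_rsv);
	return 0;
}
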
@@ -191,6 +191,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
INIT_LIST_HEAD(&entry->log_list);
INIT_LIST_HEAD(&entry->root_extent_list);
INIT_LIST_HEAD(&entry->work_list);
+ INIT_LIST_HEAD(&entry->bioc_list);
init_completion(&entry->completion);
/*
@@ -208,7 +209,6 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
{
struct btrfs_inode *inode = BTRFS_I(entry->inode);
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
@@ -221,13 +221,14 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
/* One ref for the tree. */
refcount_inc(&entry->refs);
- spin_lock_irq(&tree->lock);
- node = tree_insert(&tree->tree, entry->file_offset, &entry->rb_node);
+ spin_lock_irq(&inode->ordered_tree_lock);
+ node = tree_insert(&inode->ordered_tree, entry->file_offset,
+ &entry->rb_node);
if (node)
btrfs_panic(fs_info, -EEXIST,
"inconsistency in ordered tree at offset %llu",
entry->file_offset);
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
spin_lock(&root->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
@@ -287,12 +288,11 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum)
{
- struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_inode *inode = BTRFS_I(entry->inode);
- tree = &BTRFS_I(entry->inode)->ordered_tree;
- spin_lock_irq(&tree->lock);
+ spin_lock_irq(&inode->ordered_tree_lock);
list_add_tail(&sum->list, &entry->list);
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
}
static void finish_ordered_fn(struct btrfs_work *work)
@@ -310,7 +310,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
struct btrfs_inode *inode = BTRFS_I(ordered->inode);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- lockdep_assert_held(&inode->ordered_tree.lock);
+ lockdep_assert_held(&inode->ordered_tree_lock);
if (page) {
ASSERT(page->mapping);
@@ -364,7 +364,7 @@ static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ?
fs_info->endio_freespace_worker : fs_info->endio_write_workers;
- btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
+ btrfs_init_work(&ordered->work, finish_ordered_fn, NULL);
btrfs_queue_work(wq, &ordered->work);
}
@@ -378,9 +378,9 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);
- spin_lock_irqsave(&inode->ordered_tree.lock, flags);
+ spin_lock_irqsave(&inode->ordered_tree_lock, flags);
ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
- spin_unlock_irqrestore(&inode->ordered_tree.lock, flags);
+ spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
if (ret)
btrfs_queue_ordered_fn(ordered);
@@ -404,7 +404,6 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
struct page *page, u64 file_offset,
u64 num_bytes, bool uptodate)
{
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
@@ -414,13 +413,13 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
file_offset + num_bytes - 1,
uptodate);
- spin_lock_irqsave(&tree->lock, flags);
+ spin_lock_irqsave(&inode->ordered_tree_lock, flags);
while (cur < file_offset + num_bytes) {
u64 entry_end;
u64 end;
u32 len;
- node = tree_search(tree, cur);
+ node = ordered_tree_search(inode, cur);
/* No ordered extents at all */
if (!node)
break;
@@ -467,13 +466,13 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
len = end + 1 - cur;
if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) {
- spin_unlock_irqrestore(&tree->lock, flags);
+ spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
btrfs_queue_ordered_fn(entry);
- spin_lock_irqsave(&tree->lock, flags);
+ spin_lock_irqsave(&inode->ordered_tree_lock, flags);
}
cur += len;
}
- spin_unlock_irqrestore(&tree->lock, flags);
+ spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
}
/*
@@ -497,19 +496,18 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size)
{
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
bool finished = false;
- spin_lock_irqsave(&tree->lock, flags);
+ spin_lock_irqsave(&inode->ordered_tree_lock, flags);
if (cached && *cached) {
entry = *cached;
goto have_entry;
}
- node = tree_search(tree, file_offset);
+ node = ordered_tree_search(inode, file_offset);
if (!node)
goto out;
@@ -540,7 +538,7 @@ out:
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
}
- spin_unlock_irqrestore(&tree->lock, flags);
+ spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
return finished;
}
@@ -578,7 +576,6 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry)
{
- struct btrfs_ordered_inode_tree *tree;
struct btrfs_root *root = btrfs_inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
@@ -603,22 +600,23 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
release = entry->disk_num_bytes;
else
release = entry->num_bytes;
- btrfs_delalloc_release_metadata(btrfs_inode, release, false);
+ btrfs_delalloc_release_metadata(btrfs_inode, release,
+ test_bit(BTRFS_ORDERED_IOERR,
+ &entry->flags));
}
percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
fs_info->delalloc_batch);
- tree = &btrfs_inode->ordered_tree;
- spin_lock_irq(&tree->lock);
+ spin_lock_irq(&btrfs_inode->ordered_tree_lock);
node = &entry->rb_node;
- rb_erase(node, &tree->tree);
+ rb_erase(node, &btrfs_inode->ordered_tree);
RB_CLEAR_NODE(node);
- if (tree->last == node)
- tree->last = NULL;
+ if (btrfs_inode->ordered_tree_last == node)
+ btrfs_inode->ordered_tree_last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&btrfs_inode->ordered_tree_lock);
/*
* The current running transaction is waiting on us, we need to let it
@@ -711,7 +709,7 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
spin_unlock(&root->ordered_extent_lock);
btrfs_init_work(&ordered->flush_work,
- btrfs_run_ordered_extent_work, NULL, NULL);
+ btrfs_run_ordered_extent_work, NULL);
list_add_tail(&ordered->work_list, &works);
btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
@@ -875,14 +873,12 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset)
{
- struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
- tree = &inode->ordered_tree;
- spin_lock_irqsave(&tree->lock, flags);
- node = tree_search(tree, file_offset);
+ spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+ node = ordered_tree_search(inode, file_offset);
if (!node)
goto out;
@@ -894,7 +890,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
trace_btrfs_ordered_extent_lookup(inode, entry);
}
out:
- spin_unlock_irqrestore(&tree->lock, flags);
+ spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
return entry;
}
@@ -904,15 +900,13 @@ out:
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
struct btrfs_inode *inode, u64 file_offset, u64 len)
{
- struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- tree = &inode->ordered_tree;
- spin_lock_irq(&tree->lock);
- node = tree_search(tree, file_offset);
+ spin_lock_irq(&inode->ordered_tree_lock);
+ node = ordered_tree_search(inode, file_offset);
if (!node) {
- node = tree_search(tree, file_offset + len);
+ node = ordered_tree_search(inode, file_offset + len);
if (!node)
goto out;
}
@@ -936,7 +930,7 @@ out:
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_lookup_range(inode, entry);
}
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
return entry;
}
@@ -947,13 +941,12 @@ out:
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
struct list_head *list)
{
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *n;
ASSERT(inode_is_locked(&inode->vfs_inode));
- spin_lock_irq(&tree->lock);
- for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+ spin_lock_irq(&inode->ordered_tree_lock);
+ for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) {
struct btrfs_ordered_extent *ordered;
ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
@@ -966,7 +959,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
refcount_inc(&ordered->refs);
trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
}
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
}
/*
@@ -976,13 +969,11 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
{
- struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- tree = &inode->ordered_tree;
- spin_lock_irq(&tree->lock);
- node = tree_search(tree, file_offset);
+ spin_lock_irq(&inode->ordered_tree_lock);
+ node = ordered_tree_search(inode, file_offset);
if (!node)
goto out;
@@ -990,7 +981,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_lookup_first(inode, entry);
out:
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
return entry;
}
@@ -1006,15 +997,14 @@ out:
struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
struct btrfs_inode *inode, u64 file_offset, u64 len)
{
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct rb_node *cur;
struct rb_node *prev;
struct rb_node *next;
struct btrfs_ordered_extent *entry = NULL;
- spin_lock_irq(&tree->lock);
- node = tree->tree.rb_node;
+ spin_lock_irq(&inode->ordered_tree_lock);
+ node = inode->ordered_tree.rb_node;
/*
* Here we don't want to use tree_search() which will use tree->last
* and screw up the search order.
@@ -1068,7 +1058,7 @@ out:
trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
}
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
return entry;
}
@@ -1147,7 +1137,6 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
struct btrfs_ordered_extent *ordered, u64 len)
{
struct btrfs_inode *inode = BTRFS_I(ordered->inode);
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 file_offset = ordered->file_offset;
@@ -1187,13 +1176,13 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
refcount_inc(&new->refs);
spin_lock_irq(&root->ordered_extent_lock);
- spin_lock(&tree->lock);
+ spin_lock(&inode->ordered_tree_lock);
/* Remove from tree once */
node = &ordered->rb_node;
- rb_erase(node, &tree->tree);
+ rb_erase(node, &inode->ordered_tree);
RB_CLEAR_NODE(node);
- if (tree->last == node)
- tree->last = NULL;
+ if (inode->ordered_tree_last == node)
+ inode->ordered_tree_last = NULL;
ordered->file_offset += len;
ordered->disk_bytenr += len;
@@ -1224,18 +1213,19 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
}
/* Re-insert the node */
- node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
+ node = tree_insert(&inode->ordered_tree, ordered->file_offset,
+ &ordered->rb_node);
if (node)
btrfs_panic(fs_info, -EEXIST,
"zoned: inconsistency in ordered tree at offset %llu",
ordered->file_offset);
- node = tree_insert(&tree->tree, new->file_offset, &new->rb_node);
+ node = tree_insert(&inode->ordered_tree, new->file_offset, &new->rb_node);
if (node)
btrfs_panic(fs_info, -EEXIST,
"zoned: inconsistency in ordered tree at offset %llu",
new->file_offset);
- spin_unlock(&tree->lock);
+ spin_unlock(&inode->ordered_tree_lock);
list_add_tail(&new->root_extent_list, &root->ordered_extents);
root->nr_ordered_extents++;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 173bd5c5df26..567a6d3d4712 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -6,13 +6,6 @@
#ifndef BTRFS_ORDERED_DATA_H
#define BTRFS_ORDERED_DATA_H
-/* one of these per inode */
-struct btrfs_ordered_inode_tree {
- spinlock_t lock;
- struct rb_root tree;
- struct rb_node *last;
-};
-
struct btrfs_ordered_sum {
/*
* Logical start address and length of the blocks covered by
@@ -151,15 +144,9 @@ struct btrfs_ordered_extent {
struct completion completion;
struct btrfs_work flush_work;
struct list_head work_list;
-};
-static inline void
-btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
-{
- spin_lock_init(&t->lock);
- t->tree = RB_ROOT;
- t->last = NULL;
-}
+ struct list_head bioc_list;
+};
int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent);
int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0c93439e929f..7e46aa8a0444 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -9,6 +9,8 @@
#include "print-tree.h"
#include "accessors.h"
#include "tree-checker.h"
+#include "volumes.h"
+#include "raid-stripe-tree.h"
struct root_name_map {
u64 id;
@@ -28,6 +30,7 @@ static const struct root_name_map root_map[] = {
{ BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" },
{ BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" },
{ BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" },
+ { BTRFS_RAID_STRIPE_TREE_OBJECTID, "RAID_STRIPE_TREE" },
};
const char *btrfs_root_name(const struct btrfs_key *key, char *buf)
@@ -80,12 +83,20 @@ static void print_extent_data_ref(const struct extent_buffer *eb,
btrfs_extent_data_ref_count(eb, ref));
}
+static void print_extent_owner_ref(const struct extent_buffer *eb,
+ const struct btrfs_extent_owner_ref *ref)
+{
+ ASSERT(btrfs_fs_incompat(eb->fs_info, SIMPLE_QUOTA));
+ pr_cont("extent data owner root %llu\n", btrfs_extent_owner_ref_root_id(eb, ref));
+}
+
static void print_extent_item(const struct extent_buffer *eb, int slot, int type)
{
struct btrfs_extent_item *ei;
struct btrfs_extent_inline_ref *iref;
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
+ struct btrfs_extent_owner_ref *oref;
struct btrfs_disk_key key;
unsigned long end;
unsigned long ptr;
@@ -161,6 +172,10 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type
"\t\t\t(parent %llu not aligned to sectorsize %u)\n",
offset, eb->fs_info->sectorsize);
break;
+ case BTRFS_EXTENT_OWNER_REF_KEY:
+ oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
+ print_extent_owner_ref(eb, oref);
+ break;
default:
pr_cont("(extent %llu has INVALID ref type %d)\n",
eb->start, type);
@@ -189,6 +204,22 @@ static void print_uuid_item(const struct extent_buffer *l, unsigned long offset,
}
}
+static void print_raid_stripe_key(const struct extent_buffer *eb, u32 item_size,
+ struct btrfs_stripe_extent *stripe)
+{
+ const int num_stripes = btrfs_num_raid_stripes(item_size);
+ const u8 encoding = btrfs_stripe_extent_encoding(eb, stripe);
+
+ pr_info("\t\t\tencoding: %s\n",
+ (encoding && encoding < BTRFS_NR_RAID_TYPES) ?
+ btrfs_raid_array[encoding].raid_name : "unknown");
+
+ for (int i = 0; i < num_stripes; i++)
+ pr_info("\t\t\tstride %d devid %llu physical %llu\n",
+ i, btrfs_raid_stride_devid(eb, &stripe->strides[i]),
+ btrfs_raid_stride_physical(eb, &stripe->strides[i]));
+}
+
/*
* Helper to output refs and locking status of extent buffer. Useful to debug
* race condition related problems.
@@ -349,6 +380,10 @@ void btrfs_print_leaf(const struct extent_buffer *l)
print_uuid_item(l, btrfs_item_ptr_offset(l, i),
btrfs_item_size(l, i));
break;
+ case BTRFS_RAID_STRIPE_KEY:
+ print_raid_stripe_key(l, btrfs_item_size(l, i),
+ btrfs_item_ptr(l, i, struct btrfs_stripe_extent));
+ break;
}
}
}
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 0755af0e53e3..f9bf591a0718 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -15,6 +15,7 @@
#include "fs.h"
#include "accessors.h"
#include "super.h"
+#include "dir-item.h"
#define BTRFS_PROP_HANDLERS_HT_BITS 8
static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index b99230db3c82..e46774e8f49f 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -30,6 +30,25 @@
#include "root-tree.h"
#include "tree-checker.h"
+enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info)
+{
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return BTRFS_QGROUP_MODE_DISABLED;
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE)
+ return BTRFS_QGROUP_MODE_SIMPLE;
+ return BTRFS_QGROUP_MODE_FULL;
+}
+
+bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info)
+{
+ return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED;
+}
+
+bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info)
+{
+ return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL;
+}
+
/*
* Helpers to access qgroup reservation
*
@@ -146,16 +165,6 @@ struct btrfs_qgroup_list {
struct btrfs_qgroup *member;
};
-static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
-{
- return (u64)(uintptr_t)qg;
-}
-
-static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n)
-{
- return (struct btrfs_qgroup *)(uintptr_t)n->aux;
-}
-
static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
int init_flags);
@@ -180,34 +189,46 @@ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
return NULL;
}
-/* must be called with qgroup_lock held */
+/*
+ * Add qgroup to the filesystem's qgroup tree.
+ *
+ * Must be called with qgroup_lock held and @prealloc preallocated.
+ *
+ * Ownership of @prealloc is transferred to this function, so the caller
+ * must not touch @prealloc afterwards.
+ */
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *prealloc,
u64 qgroupid)
{
struct rb_node **p = &fs_info->qgroup_tree.rb_node;
struct rb_node *parent = NULL;
struct btrfs_qgroup *qgroup;
+ /* Caller must have pre-allocated @prealloc. */
+ ASSERT(prealloc);
+
while (*p) {
parent = *p;
qgroup = rb_entry(parent, struct btrfs_qgroup, node);
- if (qgroup->qgroupid < qgroupid)
+ if (qgroup->qgroupid < qgroupid) {
p = &(*p)->rb_left;
- else if (qgroup->qgroupid > qgroupid)
+ } else if (qgroup->qgroupid > qgroupid) {
p = &(*p)->rb_right;
- else
+ } else {
+ kfree(prealloc);
return qgroup;
+ }
}
- qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
- if (!qgroup)
- return ERR_PTR(-ENOMEM);
-
+ qgroup = prealloc;
qgroup->qgroupid = qgroupid;
INIT_LIST_HEAD(&qgroup->groups);
INIT_LIST_HEAD(&qgroup->members);
INIT_LIST_HEAD(&qgroup->dirty);
+ INIT_LIST_HEAD(&qgroup->iterator);
+ INIT_LIST_HEAD(&qgroup->nested_iterator);
rb_link_node(&qgroup->node, parent, p);
rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
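
With the GFP_ATOMIC allocation removed, callers preallocate the qgroup outside the spinlock and hand ownership to add_qgroup_rb(), which frees the preallocation when the id already exists. The following is a small userspace model of that ownership-transfer insert; the sorted list, insert_prealloc() and the ids in main() are invented for illustration, only the ownership rule mirrors the patch.

#include <stdio.h>
#include <stdlib.h>

struct node {
	unsigned long long id;
	struct node *next;
};

static struct node *head;

/*
 * Insert @prealloc into the sorted list keyed by @id. Ownership of @prealloc
 * transfers to this function: on a duplicate it is freed and the existing
 * node is returned, so the caller must never touch @prealloc afterwards.
 */
static struct node *insert_prealloc(struct node *prealloc, unsigned long long id)
{
	struct node **p = &head;

	while (*p && (*p)->id < id)
		p = &(*p)->next;
	if (*p && (*p)->id == id) {
		free(prealloc);		/* duplicate: drop the preallocation */
		return *p;
	}
	prealloc->id = id;
	prealloc->next = *p;
	*p = prealloc;
	return prealloc;
}

int main(void)
{
	unsigned long long ids[] = { 5, 1, 5 };	/* the second 5 is a duplicate */

	for (int i = 0; i < 3; i++) {
		/* Allocate outside any lock, as the patch does with GFP_KERNEL/NOFS. */
		struct node *prealloc = calloc(1, sizeof(*prealloc));

		if (!prealloc)
			return 1;
		/* In the kernel this call would run under qgroup_lock. */
		struct node *n = insert_prealloc(prealloc, ids[i]);

		prealloc = NULL;	/* ownership is gone, never reuse it */
		printf("id %llu -> node %p\n", n->id, (void *)n);
	}

	while (head) {			/* tear down */
		struct node *n = head;

		head = head->next;
		free(n);
	}
	return 0;
}
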
@@ -254,27 +275,26 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
/*
* Add relation specified by two qgroups.
*
- * Must be called with qgroup_lock held.
+ * Must be called with qgroup_lock held; ownership of @prealloc is
+ * transferred to this function and the caller must not touch it again.
*
* Return: 0 on success
* -ENOENT if one of the qgroups is NULL
* <0 other errors
*/
-static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent)
+static int __add_relation_rb(struct btrfs_qgroup_list *prealloc,
+ struct btrfs_qgroup *member,
+ struct btrfs_qgroup *parent)
{
- struct btrfs_qgroup_list *list;
-
- if (!member || !parent)
+ if (!member || !parent) {
+ kfree(prealloc);
return -ENOENT;
+ }
- list = kzalloc(sizeof(*list), GFP_ATOMIC);
- if (!list)
- return -ENOMEM;
-
- list->group = parent;
- list->member = member;
- list_add_tail(&list->next_group, &member->groups);
- list_add_tail(&list->next_member, &parent->members);
+ prealloc->group = parent;
+ prealloc->member = member;
+ list_add_tail(&prealloc->next_group, &member->groups);
+ list_add_tail(&prealloc->next_member, &parent->members);
return 0;
}
@@ -288,7 +308,9 @@ static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *p
* -ENOENT if one of the ids does not exist
* <0 other errors
*/
-static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid)
+static int add_relation_rb(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_list *prealloc,
+ u64 memberid, u64 parentid)
{
struct btrfs_qgroup *member;
struct btrfs_qgroup *parent;
@@ -296,7 +318,7 @@ static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 pare
member = find_qgroup_rb(fs_info, memberid);
parent = find_qgroup_rb(fs_info, parentid);
- return __add_relation_rb(member, parent);
+ return __add_relation_rb(prealloc, member, parent);
}
/* Must be called with qgroup_lock held */
@@ -340,11 +362,22 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info)
{
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ return;
fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT |
BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
}
+static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf, int slot,
+ struct btrfs_qgroup_status_item *ptr)
+{
+ ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ ASSERT(btrfs_item_size(leaf, slot) >= sizeof(*ptr));
+ fs_info->qgroup_enable_gen = btrfs_qgroup_status_enable_gen(leaf, ptr);
+}
+
/*
* The full config is read in one go, only called from open_ctree()
* It doesn't use any locking, as at this point we're still single-threaded
@@ -361,7 +394,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
u64 flags = 0;
u64 rescan_progress = 0;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!fs_info->quota_root)
return 0;
fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
@@ -411,14 +444,14 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
"old qgroup version, quota disabled");
goto out;
}
- if (btrfs_qgroup_status_generation(l, ptr) !=
- fs_info->generation) {
+ fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr);
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) {
+ qgroup_read_enable_gen(fs_info, l, slot, ptr);
+ } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) {
qgroup_mark_inconsistent(fs_info);
btrfs_err(fs_info,
"qgroup generation mismatch, marked as inconsistent");
}
- fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
- ptr);
rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
goto next1;
}
@@ -434,11 +467,14 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
qgroup_mark_inconsistent(fs_info);
}
if (!qgroup) {
- qgroup = add_qgroup_rb(fs_info, found_key.offset);
- if (IS_ERR(qgroup)) {
- ret = PTR_ERR(qgroup);
+ struct btrfs_qgroup *prealloc;
+
+ prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
+ if (!prealloc) {
+ ret = -ENOMEM;
goto out;
}
+ qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
}
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
if (ret < 0)
@@ -489,6 +525,8 @@ next1:
if (ret)
goto out;
while (1) {
+ struct btrfs_qgroup_list *list = NULL;
+
slot = path->slots[0];
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
@@ -502,8 +540,14 @@ next1:
goto next2;
}
- ret = add_relation_rb(fs_info, found_key.objectid,
+ list = kzalloc(sizeof(*list), GFP_KERNEL);
+ if (!list) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = add_relation_rb(fs_info, list, found_key.objectid,
found_key.offset);
+ list = NULL;
if (ret == -ENOENT) {
btrfs_warn(fs_info,
"orphan qgroup relation 0x%llx->0x%llx",
@@ -522,13 +566,12 @@ next2:
out:
btrfs_free_path(path);
fs_info->qgroup_flags |= flags;
- if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
- clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
- else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
- ret >= 0)
- ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
-
- if (ret < 0) {
+ if (ret >= 0) {
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+ ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
+ } else {
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
@@ -550,7 +593,7 @@ bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
struct rb_node *node;
bool ret = false;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
return ret;
/*
* Since we're unmounting, there is no race and no need to grab qgroup
@@ -622,7 +665,7 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_free_path(path);
return ret;
@@ -700,7 +743,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
@@ -719,7 +762,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
ret = 0;
out:
@@ -808,7 +851,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
- btrfs_mark_buffer_dirty(l);
+ btrfs_mark_buffer_dirty(trans, l);
out:
btrfs_free_path(path);
@@ -854,7 +897,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
- btrfs_mark_buffer_dirty(l);
+ btrfs_mark_buffer_dirty(trans, l);
out:
btrfs_free_path(path);
@@ -896,7 +939,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
btrfs_set_qgroup_status_rescan(l, ptr,
fs_info->qgroup_rescan_progress.objectid);
- btrfs_mark_buffer_dirty(l);
+ btrfs_mark_buffer_dirty(trans, l);
out:
btrfs_free_path(path);
@@ -949,7 +992,8 @@ out:
return ret;
}
-int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
+int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_quota_ctl_args *quota_ctl_args)
{
struct btrfs_root *quota_root;
struct btrfs_root *tree_root = fs_info->tree_root;
@@ -959,8 +1003,10 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_qgroup *qgroup = NULL;
+ struct btrfs_qgroup *prealloc = NULL;
struct btrfs_trans_handle *trans = NULL;
struct ulist *ulist = NULL;
+ const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA);
int ret = 0;
int slot;
@@ -1063,13 +1109,18 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
struct btrfs_qgroup_status_item);
btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
- fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
- BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON;
+ if (simple) {
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
+ btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid);
+ } else {
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ }
btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAGS_MASK);
btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
key.objectid = 0;
key.type = BTRFS_ROOT_REF_KEY;
@@ -1094,6 +1145,15 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
/* Release locks on tree_root before we access quota_root */
btrfs_release_path(path);
+ /* We should not have a stray @prealloc pointer. */
+ ASSERT(prealloc == NULL);
+ prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+ if (!prealloc) {
+ ret = -ENOMEM;
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+
ret = add_qgroup_item(trans, quota_root,
found_key.offset);
if (ret) {
@@ -1101,7 +1161,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
goto out_free_path;
}
- qgroup = add_qgroup_rb(fs_info, found_key.offset);
+ qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
+ prealloc = NULL;
if (IS_ERR(qgroup)) {
ret = PTR_ERR(qgroup);
btrfs_abort_transaction(trans, ret);
@@ -1144,18 +1205,22 @@ out_add_root:
goto out_free_path;
}
- qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
- if (IS_ERR(qgroup)) {
- ret = PTR_ERR(qgroup);
- btrfs_abort_transaction(trans, ret);
+ ASSERT(prealloc == NULL);
+ prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+ if (!prealloc) {
+ ret = -ENOMEM;
goto out_free_path;
}
+ qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID);
+ prealloc = NULL;
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
+ fs_info->qgroup_enable_gen = trans->transid;
+
mutex_unlock(&fs_info->qgroup_ioctl_lock);
/*
* Commit the transaction while not holding qgroup_ioctl_lock, to avoid
@@ -1180,8 +1245,14 @@ out_add_root:
spin_lock(&fs_info->qgroup_lock);
fs_info->quota_root = quota_root;
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ if (simple)
+ btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA);
spin_unlock(&fs_info->qgroup_lock);
+ /* Skip rescan for simple qgroups. */
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ goto out_free_path;
+
ret = qgroup_rescan_init(fs_info, 0, 1);
if (!ret) {
qgroup_rescan_zero_tracking(fs_info);
@@ -1222,6 +1293,39 @@ out:
else if (trans)
ret = btrfs_end_transaction(trans);
ulist_free(ulist);
+ kfree(prealloc);
+ return ret;
+}
+
+/*
+ * It is possible to have outstanding ordered extents which reserved bytes
+ * before quotas were disabled. We need to fully flush delalloc and ordered
+ * extents, and then commit, to ensure that we don't leak such reservations,
+ * only to have them come back if we re-enable quotas.
+ *
+ * - enable simple quotas
+ * - reserve space
+ * - release it, store rsv_bytes in OE
+ * - disable quotas
+ * - enable simple quotas (qgroup rsv are all 0)
+ * - OE finishes
+ * - run delayed refs
+ * - free rsv_bytes, resulting in miscounting or even underflow
+ */
+static int flush_reservations(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
+ if (ret)
+ return ret;
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ trans = btrfs_join_transaction(fs_info->tree_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ btrfs_commit_transaction(trans);
+
return ret;
}
@@ -1269,6 +1373,10 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
btrfs_qgroup_wait_for_completion(fs_info, false);
+ ret = flush_reservations(fs_info);
+ if (ret)
+ goto out_unlock_cleaner;
+
/*
* 1 For the root item
*
@@ -1295,6 +1403,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
spin_unlock(&fs_info->qgroup_lock);
@@ -1329,7 +1438,8 @@ out:
if (ret && trans)
btrfs_end_transaction(trans);
else if (trans)
- ret = btrfs_end_transaction(trans);
+ ret = btrfs_commit_transaction(trans);
+out_unlock_cleaner:
mutex_unlock(&fs_info->cleaner_mutex);
return ret;
@@ -1342,6 +1452,24 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
}
+static void qgroup_iterator_add(struct list_head *head, struct btrfs_qgroup *qgroup)
+{
+ if (!list_empty(&qgroup->iterator))
+ return;
+
+ list_add_tail(&qgroup->iterator, head);
+}
+
+static void qgroup_iterator_clean(struct list_head *head)
+{
+ while (!list_empty(head)) {
+ struct btrfs_qgroup *qgroup;
+
+ qgroup = list_first_entry(head, struct btrfs_qgroup, iterator);
+ list_del_init(&qgroup->iterator);
+ }
+}
+
/*
* The easy accounting, we're updating qgroup relationship whose child qgroup
* only has exclusive extents.
@@ -1356,14 +1484,12 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
*
* Caller should hold fs_info->qgroup_lock.
*/
-static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
- struct ulist *tmp, u64 ref_root,
+static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
struct btrfs_qgroup *src, int sign)
{
struct btrfs_qgroup *qgroup;
- struct btrfs_qgroup_list *glist;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
+ struct btrfs_qgroup *cur;
+ LIST_HEAD(qgroup_list);
u64 num_bytes = src->excl;
int ret = 0;
@@ -1371,53 +1497,30 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
if (!qgroup)
goto out;
- qgroup->rfer += sign * num_bytes;
- qgroup->rfer_cmpr += sign * num_bytes;
-
- WARN_ON(sign < 0 && qgroup->excl < num_bytes);
- qgroup->excl += sign * num_bytes;
- qgroup->excl_cmpr += sign * num_bytes;
-
- if (sign > 0)
- qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
- else
- qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
-
- qgroup_dirty(fs_info, qgroup);
-
- /* Get all of the parent groups that contain this qgroup */
- list_for_each_entry(glist, &qgroup->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- qgroup_to_aux(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- }
+ qgroup_iterator_add(&qgroup_list, qgroup);
+ list_for_each_entry(cur, &qgroup_list, iterator) {
+ struct btrfs_qgroup_list *glist;
- /* Iterate all of the parents and adjust their reference counts */
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(tmp, &uiter))) {
- qgroup = unode_aux_to_qgroup(unode);
qgroup->rfer += sign * num_bytes;
qgroup->rfer_cmpr += sign * num_bytes;
+
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
qgroup->excl += sign * num_bytes;
+ qgroup->excl_cmpr += sign * num_bytes;
+
if (sign > 0)
qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
else
qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
- qgroup->excl_cmpr += sign * num_bytes;
qgroup_dirty(fs_info, qgroup);
- /* Add any parents of the parents */
- list_for_each_entry(glist, &qgroup->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- qgroup_to_aux(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- }
+ /* Append parent qgroups to @qgroup_list. */
+ list_for_each_entry(glist, &qgroup->groups, next_group)
+ qgroup_iterator_add(&qgroup_list, glist->group);
}
ret = 0;
out:
+ qgroup_iterator_clean(&qgroup_list);
return ret;
}
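
The rewrite drops the temporary ulist: each qgroup embeds an iterator list head, qgroup_iterator_add() only queues a qgroup that is not already on the list, the loop appends parents while it walks, and qgroup_iterator_clean() resets the markers afterwards. Here is a self-contained userspace sketch of the same embedded-membership worklist; the fixed-size array, struct group and the hierarchy in main() are invented stand-ins for the kernel's list_head machinery.

#include <stdbool.h>
#include <stdio.h>

#define MAX_GROUPS 8

struct group {
	const char *name;
	struct group *parents[2];
	bool on_list;		/* mimics !list_empty(&qgroup->iterator) */
};

/* Visit @start and every ancestor exactly once, without any allocation. */
static void for_each_ancestor(struct group *start)
{
	struct group *work[MAX_GROUPS];
	int head = 0, tail = 0;

	start->on_list = true;
	work[tail++] = start;

	while (head < tail) {	/* the list grows while we walk it */
		struct group *g = work[head++];

		printf("account %s\n", g->name);
		for (int i = 0; i < 2; i++) {
			struct group *p = g->parents[i];

			/* qgroup_iterator_add(): skip anything already queued */
			if (p && !p->on_list && tail < MAX_GROUPS) {
				p->on_list = true;
				work[tail++] = p;
			}
		}
	}

	/* qgroup_iterator_clean(): reset the markers for the next walk */
	while (tail > 0)
		work[--tail]->on_list = false;
}

int main(void)
{
	struct group top  = { "2/300", { NULL, NULL }, false };
	struct group a    = { "1/100", { &top, NULL }, false };
	struct group b    = { "1/200", { &top, NULL }, false };
	struct group leaf = { "0/5",   { &a, &b },     false };

	for_each_ancestor(&leaf);	/* prints 0/5, 1/100, 1/200, 2/300 once each */
	return 0;
}
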
@@ -1434,8 +1537,7 @@ out:
* Return < 0 for other error.
*/
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
- struct ulist *tmp, u64 src, u64 dst,
- int sign)
+ u64 src, u64 dst, int sign)
{
struct btrfs_qgroup *qgroup;
int ret = 1;
@@ -1446,8 +1548,7 @@ static int quick_update_accounting(struct btrfs_fs_info *fs_info,
goto out;
if (qgroup->excl == qgroup->rfer) {
ret = 0;
- err = __qgroup_excl_accounting(fs_info, tmp, dst,
- qgroup, sign);
+ err = __qgroup_excl_accounting(fs_info, dst, qgroup, sign);
if (err < 0) {
ret = err;
goto out;
@@ -1459,28 +1560,19 @@ out:
return ret;
}
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
- u64 dst)
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
- struct ulist *tmp;
- unsigned int nofs_flag;
+ struct btrfs_qgroup_list *prealloc = NULL;
int ret = 0;
/* Check the level of src and dst first */
if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
return -EINVAL;
- /* We hold a transaction handle open, must do a NOFS allocation. */
- nofs_flag = memalloc_nofs_save();
- tmp = ulist_alloc(GFP_KERNEL);
- memalloc_nofs_restore(nofs_flag);
- if (!tmp)
- return -ENOMEM;
-
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root) {
ret = -ENOTCONN;
@@ -1501,6 +1593,11 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
}
}
+ prealloc = kzalloc(sizeof(*list), GFP_NOFS);
+ if (!prealloc) {
+ ret = -ENOMEM;
+ goto out;
+ }
ret = add_qgroup_relation_item(trans, src, dst);
if (ret)
goto out;
@@ -1512,16 +1609,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
}
spin_lock(&fs_info->qgroup_lock);
- ret = __add_relation_rb(member, parent);
+ ret = __add_relation_rb(prealloc, member, parent);
+ prealloc = NULL;
if (ret < 0) {
spin_unlock(&fs_info->qgroup_lock);
goto out;
}
- ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
+ ret = quick_update_accounting(fs_info, src, dst, 1);
spin_unlock(&fs_info->qgroup_lock);
out:
+ kfree(prealloc);
mutex_unlock(&fs_info->qgroup_ioctl_lock);
- ulist_free(tmp);
return ret;
}
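
Another pattern introduced here is the allocation hand-off in btrfs_add_qgroup_relation(): the btrfs_qgroup_list is kzalloc()ed before qgroup_lock is taken, ownership passes to __add_relation_rb() under the lock (signalled by setting prealloc to NULL), and the unconditional kfree(prealloc) on the way out only frees whatever was not consumed. A hedged, self-contained model of that idiom, with generic names and a pthread mutex standing in for the spinlock:

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

struct relation { int src, dst; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct relation *registry;  /* stand-in for the rb-tree entry */

/* Must not sleep: consumes @node on success, leaves it alone on failure. */
static int insert_locked(struct relation *node)
{
    if (registry)
        return -17;            /* -EEXIST */
    registry = node;           /* ownership transferred */
    return 0;
}

static int add_relation(int src, int dst)
{
    struct relation *prealloc;
    int ret;

    /* Allocate while sleeping is still allowed. */
    prealloc = calloc(1, sizeof(*prealloc));
    if (!prealloc)
        return -12;            /* -ENOMEM */
    prealloc->src = src;
    prealloc->dst = dst;

    pthread_mutex_lock(&lock);
    ret = insert_locked(prealloc);
    if (ret == 0)
        prealloc = NULL;       /* consumed, must not be freed below */
    pthread_mutex_unlock(&lock);

    free(prealloc);            /* no-op whenever ownership was handed over */
    return ret;
}

int main(void)
{
    printf("first insert:  %d\n", add_relation(256, 100));
    printf("second insert: %d\n", add_relation(256, 100));
    free(registry);
    return 0;
}
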
@@ -1532,19 +1630,10 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
- struct ulist *tmp;
bool found = false;
- unsigned int nofs_flag;
int ret = 0;
int ret2;
- /* We hold a transaction handle open, must do a NOFS allocation. */
- nofs_flag = memalloc_nofs_save();
- tmp = ulist_alloc(GFP_KERNEL);
- memalloc_nofs_restore(nofs_flag);
- if (!tmp)
- return -ENOMEM;
-
if (!fs_info->quota_root) {
ret = -ENOTCONN;
goto out;
@@ -1582,11 +1671,10 @@ delete_item:
if (found) {
spin_lock(&fs_info->qgroup_lock);
del_relation_rb(fs_info, src, dst);
- ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
+ ret = quick_update_accounting(fs_info, src, dst, -1);
spin_unlock(&fs_info->qgroup_lock);
}
out:
- ulist_free(tmp);
return ret;
}
@@ -1608,8 +1696,12 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup *prealloc = NULL;
int ret = 0;
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
+ return 0;
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root) {
ret = -ENOTCONN;
@@ -1622,21 +1714,25 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
goto out;
}
+ prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+ if (!prealloc) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
ret = add_qgroup_item(trans, quota_root, qgroupid);
if (ret)
goto out;
spin_lock(&fs_info->qgroup_lock);
- qgroup = add_qgroup_rb(fs_info, qgroupid);
+ qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
+ prealloc = NULL;
- if (IS_ERR(qgroup)) {
- ret = PTR_ERR(qgroup);
- goto out;
- }
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ kfree(prealloc);
return ret;
}
@@ -1771,6 +1867,17 @@ out:
return ret;
}
+/*
+ * Inform qgroup to trace one dirty extent, whose info is recorded in @record,
+ * so qgroup can account for it at transaction commit time.
+ *
+ * No-lock version: the caller must hold the delayed ref lock and have already
+ * allocated the memory, then call btrfs_qgroup_trace_extent_post() after
+ * exiting the lock context.
+ *
+ * Return 0 for a successful insertion.
+ * Return >0 for an existing record; the caller can free @record safely.
+ * Errors are not possible.
+ */
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_qgroup_extent_record *record)
@@ -1780,6 +1887,9 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup_extent_record *entry;
u64 bytenr = record->bytenr;
+ if (!btrfs_qgroup_full_accounting(fs_info))
+ return 1;
+
lockdep_assert_held(&delayed_refs->lock);
trace_btrfs_qgroup_trace_extent(fs_info, record);
@@ -1806,12 +1916,35 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
return 0;
}
+/*
+ * Post handler after qgroup_trace_extent_nolock().
+ *
+ * NOTE: Current qgroup code does the expensive backref walk at transaction
+ * commit time with TRANS_STATE_COMMIT_DOING, which blocks incoming new
+ * transactions.
+ * This is designed to allow btrfs_find_all_roots() to get a correct new_roots
+ * result.
+ *
+ * However for old_roots there is no need to do the backref walk at that time,
+ * since we search commit roots to walk backrefs and the result will always be
+ * correct.
+ *
+ * Due to the lockless nature of the insertion we can't do the backref walk
+ * there, so we must call btrfs_qgroup_trace_extent_post() after exiting the
+ * spinlock context.
+ *
+ * TODO: If we can fix and prove that btrfs_find_all_roots() can get a correct
+ * result using the current root, then we can move all the expensive backref
+ * walks out of transaction commit, but not now as qgroup accounting would be
+ * wrong again.
+ */
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord)
{
struct btrfs_backref_walk_ctx ctx = { 0 };
int ret;
+ if (!btrfs_qgroup_full_accounting(trans->fs_info))
+ return 0;
/*
* We are always called in a context where we are already holding a
* transaction handle. Often we are called when adding a data delayed
@@ -1859,6 +1992,19 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
return 0;
}
+/*
+ * Inform qgroup to trace one dirty extent, specified by @bytenr and
+ * @num_bytes, so qgroup can account for it at transaction commit time.
+ *
+ * Better encapsulated version, with memory allocation and the backref walk
+ * for commit roots, so this function can sleep.
+ *
+ * Return 0 if the operation is done.
+ * Return <0 for errors, like memory allocation failure or an invalid
+ * parameter (NULL trans).
+ */
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes)
{
@@ -1867,8 +2013,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
struct btrfs_delayed_ref_root *delayed_refs;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
- || bytenr == 0 || num_bytes == 0)
+ if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0)
return 0;
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record)
@@ -1889,6 +2034,12 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
return btrfs_qgroup_trace_extent_post(trans, record);
}
+/*
+ * Inform qgroup to trace all the data items in a leaf.
+ *
+ * Return 0 for success.
+ * Return <0 for errors (ENOMEM).
+ */
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
@@ -1900,7 +2051,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
u64 bytenr, num_bytes;
/* We can be called directly from walk_up_proc() */
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
for (i = 0; i < nr; i++) {
@@ -2276,7 +2427,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
int level;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
/* Wrong parameter order */
@@ -2319,6 +2470,16 @@ out:
return ret;
}
+/*
+ * Inform qgroup to trace a whole subtree, including all its child tree
+ * blocks and data.
+ * The root tree block is specified by @root_eb.
+ *
+ * Normally used by relocation (tree block swap) and subvolume deletion.
+ *
+ * Return 0 for success.
+ * Return <0 for errors (ENOMEM or tree search errors).
+ */
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level)
@@ -2333,7 +2494,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
BUG_ON(root_eb == NULL);
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
spin_lock(&fs_info->qgroup_lock);
@@ -2445,62 +2606,64 @@ out:
return ret;
}
+static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup)
+{
+ if (!list_empty(&qgroup->nested_iterator))
+ return;
+
+ list_add_tail(&qgroup->nested_iterator, head);
+}
+
+static void qgroup_iterator_nested_clean(struct list_head *head)
+{
+ while (!list_empty(head)) {
+ struct btrfs_qgroup *qgroup;
+
+ qgroup = list_first_entry(head, struct btrfs_qgroup, nested_iterator);
+ list_del_init(&qgroup->nested_iterator);
+ }
+}
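
These helpers give each qgroup a second, independent membership field (nested_iterator) so it can sit on a short-lived per-root walk list and on the accumulated per-extent list at the same time; the list_empty() check makes repeated additions idempotent, which is what lets both the old_roots and the new_roots walks feed one accumulated list. A small hedged sketch of that idempotent accumulation, with a plain flag standing in for the embedded list head:

#include <stdio.h>

struct item {
    const char *name;
    int on_outer;  /* marker for the accumulated "nested" list */
};

/* Idempotent add, mirroring the list_empty() check in the helpers above. */
static void outer_add(struct item **set, int *nr, struct item *it)
{
    if (it->on_outer)
        return;
    it->on_outer = 1;
    set[(*nr)++] = it;
}

int main(void)
{
    struct item a = { "qg A" }, b = { "qg B" };
    struct item *outer[4];
    int nr = 0;

    /* Two inner walks (say old_roots then new_roots) both reach qg A... */
    outer_add(outer, &nr, &a);
    outer_add(outer, &nr, &b);
    outer_add(outer, &nr, &a);

    /* ...yet the accumulated list holds each qgroup exactly once. */
    printf("outer list has %d entries (%s, %s)\n", nr,
           outer[0]->name, outer[1]->name);
    return 0;
}
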
+
#define UPDATE_NEW 0
#define UPDATE_OLD 1
/*
* Walk all of the roots that points to the bytenr and adjust their refcnts.
*/
-static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
- struct ulist *roots, struct ulist *tmp,
- struct ulist *qgroups, u64 seq, int update_old)
+static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
+ struct ulist *roots, struct list_head *qgroups,
+ u64 seq, int update_old)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
- struct ulist_node *tmp_unode;
- struct ulist_iterator tmp_uiter;
struct btrfs_qgroup *qg;
- int ret = 0;
if (!roots)
- return 0;
+ return;
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(roots, &uiter))) {
+ LIST_HEAD(tmp);
+
qg = find_qgroup_rb(fs_info, unode->val);
if (!qg)
continue;
- ulist_reinit(tmp);
- ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ULIST_ITER_INIT(&tmp_uiter);
- while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+ qgroup_iterator_nested_add(qgroups, qg);
+ qgroup_iterator_add(&tmp, qg);
+ list_for_each_entry(qg, &tmp, iterator) {
struct btrfs_qgroup_list *glist;
- qg = unode_aux_to_qgroup(tmp_unode);
if (update_old)
btrfs_qgroup_update_old_refcnt(qg, seq, 1);
else
btrfs_qgroup_update_new_refcnt(qg, seq, 1);
+
list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(qgroups, glist->group->qgroupid,
- qgroup_to_aux(glist->group),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ret = ulist_add(tmp, glist->group->qgroupid,
- qgroup_to_aux(glist->group),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
+ qgroup_iterator_nested_add(qgroups, glist->group);
+ qgroup_iterator_add(&tmp, glist->group);
}
}
+ qgroup_iterator_clean(&tmp);
}
- return 0;
}
/*
@@ -2539,22 +2702,16 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
* But this time we don't need to consider other things, the codes and logic
* is easy to understand now.
*/
-static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
- struct ulist *qgroups,
- u64 nr_old_roots,
- u64 nr_new_roots,
- u64 num_bytes, u64 seq)
+static void qgroup_update_counters(struct btrfs_fs_info *fs_info,
+ struct list_head *qgroups, u64 nr_old_roots,
+ u64 nr_new_roots, u64 num_bytes, u64 seq)
{
- struct ulist_node *unode;
- struct ulist_iterator uiter;
struct btrfs_qgroup *qg;
- u64 cur_new_count, cur_old_count;
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(qgroups, &uiter))) {
+ list_for_each_entry(qg, qgroups, nested_iterator) {
+ u64 cur_new_count, cur_old_count;
bool dirty = false;
- qg = unode_aux_to_qgroup(unode);
cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
@@ -2625,7 +2782,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
if (dirty)
qgroup_dirty(fs_info, qg);
}
- return 0;
}
/*
@@ -2662,8 +2818,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
struct ulist *new_roots)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct ulist *qgroups = NULL;
- struct ulist *tmp = NULL;
+ LIST_HEAD(qgroups);
u64 seq;
u64 nr_new_roots = 0;
u64 nr_old_roots = 0;
@@ -2673,7 +2828,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
* If quotas get disabled meanwhile, the resources need to be freed and
* we can't just exit here.
*/
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ if (!btrfs_qgroup_full_accounting(fs_info) ||
fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
goto out_free;
@@ -2697,17 +2852,6 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
num_bytes, nr_old_roots, nr_new_roots);
- qgroups = ulist_alloc(GFP_NOFS);
- if (!qgroups) {
- ret = -ENOMEM;
- goto out_free;
- }
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp) {
- ret = -ENOMEM;
- goto out_free;
- }
-
mutex_lock(&fs_info->qgroup_rescan_lock);
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
@@ -2722,29 +2866,27 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
seq = fs_info->qgroup_seq;
/* Update old refcnts using old_roots */
- ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
- UPDATE_OLD);
- if (ret < 0)
- goto out;
+ qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, UPDATE_OLD);
/* Update new refcnts using new_roots */
- ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
- UPDATE_NEW);
- if (ret < 0)
- goto out;
+ qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, UPDATE_NEW);
- qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
+ qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots,
num_bytes, seq);
/*
+ * We're done using the iterator; release all its qgroups while holding
+ * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup()
+ * and trigger use-after-free accesses to qgroups.
+ */
+ qgroup_iterator_nested_clean(&qgroups);
+
+ /*
* Bump qgroup_seq to avoid seq overlap
*/
fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
-out:
spin_unlock(&fs_info->qgroup_lock);
out_free:
- ulist_free(tmp);
- ulist_free(qgroups);
ulist_free(old_roots);
ulist_free(new_roots);
return ret;
@@ -2761,6 +2903,9 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
u64 qgroup_to_skip;
int ret = 0;
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ return 0;
+
delayed_refs = &trans->transaction->delayed_refs;
qgroup_to_skip = delayed_refs->qgroup_to_skip;
while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
@@ -2876,7 +3021,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
qgroup_mark_inconsistent(fs_info);
spin_lock(&fs_info->qgroup_lock);
}
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (btrfs_qgroup_enabled(fs_info))
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
else
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
@@ -2889,6 +3034,47 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
return ret;
}
+static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info,
+ u64 inode_rootid,
+ struct btrfs_qgroup_inherit **inherit)
+{
+ int i = 0;
+ u64 num_qgroups = 0;
+ struct btrfs_qgroup *inode_qg;
+ struct btrfs_qgroup_list *qg_list;
+ struct btrfs_qgroup_inherit *res;
+ size_t struct_sz;
+ u64 *qgids;
+
+ if (*inherit)
+ return -EEXIST;
+
+ inode_qg = find_qgroup_rb(fs_info, inode_rootid);
+ if (!inode_qg)
+ return -ENOENT;
+
+ num_qgroups = list_count_nodes(&inode_qg->groups);
+
+ if (!num_qgroups)
+ return 0;
+
+ struct_sz = struct_size(res, qgroups, num_qgroups);
+ if (struct_sz == SIZE_MAX)
+ return -ERANGE;
+
+ res = kzalloc(struct_sz, GFP_NOFS);
+ if (!res)
+ return -ENOMEM;
+ res->num_qgroups = num_qgroups;
+ qgids = res->qgroups;
+
+ list_for_each_entry(qg_list, &inode_qg->groups, next_group)
+ qgids[i++] = qg_list->group->qgroupid;
+
+ *inherit = res;
+ return 0;
+}
+
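
qgroup_auto_inherit() sizes the btrfs_qgroup_inherit as a fixed header plus one trailing array element per parent qgroup via struct_size(), then fills the flexible array in place. A self-contained sketch of the same sizing pattern, using a mock struct rather than the btrfs ioctl layout and a plain overflow check instead of the kernel's struct_size() helper:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct inherit {
    uint64_t num_qgroups;
    uint64_t qgroupids[];  /* flexible array member */
};

static struct inherit *build_inherit(const uint64_t *parents, size_t count)
{
    struct inherit *res;
    size_t sz;

    /* struct_size()-style: header + count trailing elements, overflow checked. */
    if (count > (SIZE_MAX - sizeof(*res)) / sizeof(res->qgroupids[0]))
        return NULL;
    sz = sizeof(*res) + count * sizeof(res->qgroupids[0]);

    res = calloc(1, sz);
    if (!res)
        return NULL;

    res->num_qgroups = count;
    for (size_t i = 0; i < count; i++)
        res->qgroupids[i] = parents[i];
    return res;
}

int main(void)
{
    const uint64_t parents[] = { (1ULL << 48) | 100, (1ULL << 48) | 101 };
    struct inherit *inh = build_inherit(parents, 2);

    if (inh) {
        printf("%llu qgroups, first 0x%llx\n",
               (unsigned long long)inh->num_qgroups,
               (unsigned long long)inh->qgroupids[0]);
        free(inh);
    }
    return 0;
}
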
/*
* Copy the accounting information between qgroups. This is necessary
* when a snapshot or a subvolume is created. Throwing an error will
@@ -2896,7 +3082,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
* when a readonly fs is a reasonable outcome.
*/
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
- u64 objectid, struct btrfs_qgroup_inherit *inherit)
+ u64 objectid, u64 inode_rootid,
+ struct btrfs_qgroup_inherit *inherit)
{
int ret = 0;
int i;
@@ -2906,10 +3093,17 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
struct btrfs_root *quota_root;
struct btrfs_qgroup *srcgroup;
struct btrfs_qgroup *dstgroup;
+ struct btrfs_qgroup *prealloc;
+ struct btrfs_qgroup_list **qlist_prealloc = NULL;
+ bool free_inherit = false;
bool need_rescan = false;
u32 level_size = 0;
u64 nums;
+ prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+ if (!prealloc)
+ return -ENOMEM;
+
/*
* There are only two callers of this function.
*
@@ -2929,7 +3123,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
if (!committing)
mutex_lock(&fs_info->qgroup_ioctl_lock);
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_enabled(fs_info))
goto out;
quota_root = fs_info->quota_root;
@@ -2938,6 +3132,13 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
goto out;
}
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && !inherit) {
+ ret = qgroup_auto_inherit(fs_info, inode_rootid, &inherit);
+ if (ret)
+ goto out;
+ free_inherit = true;
+ }
+
if (inherit) {
i_qgroups = (u64 *)(inherit + 1);
nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
@@ -2982,16 +3183,28 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
goto out;
}
ret = 0;
- }
+ qlist_prealloc = kcalloc(inherit->num_qgroups,
+ sizeof(struct btrfs_qgroup_list *),
+ GFP_NOFS);
+ if (!qlist_prealloc) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (int i = 0; i < inherit->num_qgroups; i++) {
+ qlist_prealloc[i] = kzalloc(sizeof(struct btrfs_qgroup_list),
+ GFP_NOFS);
+ if (!qlist_prealloc[i]) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ }
spin_lock(&fs_info->qgroup_lock);
- dstgroup = add_qgroup_rb(fs_info, objectid);
- if (IS_ERR(dstgroup)) {
- ret = PTR_ERR(dstgroup);
- goto unlock;
- }
+ dstgroup = add_qgroup_rb(fs_info, prealloc, objectid);
+ prealloc = NULL;
if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
dstgroup->lim_flags = inherit->lim.flags;
@@ -3003,7 +3216,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
qgroup_dirty(fs_info, dstgroup);
}
- if (srcid) {
+ if (srcid && btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) {
srcgroup = find_qgroup_rb(fs_info, srcid);
if (!srcgroup)
goto unlock;
@@ -3038,7 +3251,9 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
i_qgroups = (u64 *)(inherit + 1);
for (i = 0; i < inherit->num_qgroups; ++i) {
if (*i_qgroups) {
- ret = add_relation_rb(fs_info, objectid, *i_qgroups);
+ ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid,
+ *i_qgroups);
+ qlist_prealloc[i] = NULL;
if (ret)
goto unlock;
}
@@ -3102,6 +3317,14 @@ out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (need_rescan)
qgroup_mark_inconsistent(fs_info);
+ if (qlist_prealloc) {
+ for (int i = 0; i < inherit->num_qgroups; i++)
+ kfree(qlist_prealloc[i]);
+ kfree(qlist_prealloc);
+ }
+ if (free_inherit)
+ kfree(inherit);
+ kfree(prealloc);
return ret;
}
@@ -3125,8 +3348,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
struct btrfs_fs_info *fs_info = root->fs_info;
u64 ref_root = root->root_key.objectid;
int ret = 0;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
+ LIST_HEAD(qgroup_list);
if (!is_fstree(ref_root))
return 0;
@@ -3146,49 +3368,28 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
if (!qgroup)
goto out;
- /*
- * in a first step, we check all affected qgroups if any limits would
- * be exceeded
- */
- ulist_reinit(fs_info->qgroup_ulist);
- ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
- qgroup_to_aux(qgroup), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
- struct btrfs_qgroup *qg;
+ qgroup_iterator_add(&qgroup_list, qgroup);
+ list_for_each_entry(qgroup, &qgroup_list, iterator) {
struct btrfs_qgroup_list *glist;
- qg = unode_aux_to_qgroup(unode);
-
- if (enforce && !qgroup_check_limits(qg, num_bytes)) {
+ if (enforce && !qgroup_check_limits(qgroup, num_bytes)) {
ret = -EDQUOT;
goto out;
}
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(fs_info->qgroup_ulist,
- glist->group->qgroupid,
- qgroup_to_aux(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- }
+ list_for_each_entry(glist, &qgroup->groups, next_group)
+ qgroup_iterator_add(&qgroup_list, glist->group);
}
+
ret = 0;
/*
* no limits exceeded, now record the reservation into all qgroups
*/
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
- struct btrfs_qgroup *qg;
-
- qg = unode_aux_to_qgroup(unode);
-
- qgroup_rsv_add(fs_info, qg, num_bytes, type);
- }
+ list_for_each_entry(qgroup, &qgroup_list, iterator)
+ qgroup_rsv_add(fs_info, qgroup, num_bytes, type);
out:
+ qgroup_iterator_clean(&qgroup_list);
spin_unlock(&fs_info->qgroup_lock);
return ret;
}
@@ -3207,9 +3408,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
enum btrfs_qgroup_rsv_type type)
{
struct btrfs_qgroup *qgroup;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- int ret = 0;
+ LIST_HEAD(qgroup_list);
if (!is_fstree(ref_root))
return;
@@ -3237,30 +3436,17 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
*/
num_bytes = qgroup->rsv.values[type];
- ulist_reinit(fs_info->qgroup_ulist);
- ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
- qgroup_to_aux(qgroup), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
- struct btrfs_qgroup *qg;
+ qgroup_iterator_add(&qgroup_list, qgroup);
+ list_for_each_entry(qgroup, &qgroup_list, iterator) {
struct btrfs_qgroup_list *glist;
- qg = unode_aux_to_qgroup(unode);
-
- qgroup_rsv_release(fs_info, qg, num_bytes, type);
-
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(fs_info->qgroup_ulist,
- glist->group->qgroupid,
- qgroup_to_aux(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
+ qgroup_rsv_release(fs_info, qgroup, num_bytes, type);
+ list_for_each_entry(glist, &qgroup->groups, next_group) {
+ qgroup_iterator_add(&qgroup_list, glist->group);
}
}
-
out:
+ qgroup_iterator_clean(&qgroup_list);
spin_unlock(&fs_info->qgroup_lock);
}
@@ -3295,6 +3481,9 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
int slot;
int ret;
+ if (!btrfs_qgroup_full_accounting(fs_info))
+ return 1;
+
mutex_lock(&fs_info->qgroup_rescan_lock);
extent_root = btrfs_extent_root(fs_info,
fs_info->qgroup_rescan_progress.objectid);
@@ -3375,10 +3564,15 @@ out:
static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
{
- return btrfs_fs_closing(fs_info) ||
- test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
- !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
+ if (btrfs_fs_closing(fs_info))
+ return true;
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
+ return true;
+ if (!btrfs_qgroup_enabled(fs_info))
+ return true;
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
+ return true;
+ return false;
}
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
@@ -3392,6 +3586,9 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
bool stopped = false;
bool did_leaf_rescans = false;
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ return;
+
path = btrfs_alloc_path();
if (!path)
goto out;
@@ -3495,6 +3692,11 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
{
int ret = 0;
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
+ btrfs_warn(fs_info, "qgroup rescan init failed, running in simple mode");
+ return -EINVAL;
+ }
+
if (!init_flags) {
/* we're resuming qgroup rescan at mount time */
if (!(fs_info->qgroup_flags &
@@ -3525,7 +3727,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
btrfs_warn(fs_info,
"qgroup rescan init failed, qgroup is not enabled");
ret = -EINVAL;
- } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+ } else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
/* Quota disable is in progress */
ret = -EBUSY;
}
@@ -3546,7 +3748,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
mutex_unlock(&fs_info->qgroup_rescan_lock);
btrfs_init_work(&fs_info->qgroup_rescan_work,
- btrfs_qgroup_rescan_worker, NULL, NULL);
+ btrfs_qgroup_rescan_worker, NULL);
return 0;
}
@@ -3784,7 +3986,7 @@ static int qgroup_reserve_data(struct btrfs_inode *inode,
u64 to_reserve;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
+ if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
!is_fstree(root->root_key.objectid) || len == 0)
return 0;
@@ -3855,13 +4057,14 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
/* Free ranges specified by @reserved, normally in error path */
static int qgroup_free_reserved_data(struct btrfs_inode *inode,
- struct extent_changeset *reserved, u64 start, u64 len)
+ struct extent_changeset *reserved,
+ u64 start, u64 len, u64 *freed_ret)
{
struct btrfs_root *root = inode->root;
struct ulist_node *unode;
struct ulist_iterator uiter;
struct extent_changeset changeset;
- int freed = 0;
+ u64 freed = 0;
int ret;
extent_changeset_init(&changeset);
@@ -3902,7 +4105,9 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode,
}
btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
BTRFS_QGROUP_RSV_DATA);
- ret = freed;
+ if (freed_ret)
+ *freed_ret = freed;
+ ret = 0;
out:
extent_changeset_release(&changeset);
return ret;
@@ -3910,19 +4115,23 @@ out:
static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len,
- int free)
+ u64 *released, int free)
{
struct extent_changeset changeset;
int trace_op = QGROUP_RELEASE;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
- return 0;
+ if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
+ extent_changeset_init(&changeset);
+ return clear_record_extent_bits(&inode->io_tree, start,
+ start + len - 1,
+ EXTENT_QGROUP_RESERVED, &changeset);
+ }
/* In release case, we shouldn't have @reserved */
WARN_ON(!free && reserved);
if (free && reserved)
- return qgroup_free_reserved_data(inode, reserved, start, len);
+ return qgroup_free_reserved_data(inode, reserved, start, len, released);
extent_changeset_init(&changeset);
ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
EXTENT_QGROUP_RESERVED, &changeset);
@@ -3937,7 +4146,8 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
btrfs_qgroup_free_refroot(inode->root->fs_info,
inode->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
- ret = changeset.bytes_changed;
+ if (released)
+ *released = changeset.bytes_changed;
out:
extent_changeset_release(&changeset);
return ret;
@@ -3956,9 +4166,10 @@ out:
* NOTE: This function may sleep for memory allocation.
*/
int btrfs_qgroup_free_data(struct btrfs_inode *inode,
- struct extent_changeset *reserved, u64 start, u64 len)
+ struct extent_changeset *reserved,
+ u64 start, u64 len, u64 *freed)
{
- return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
+ return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1);
}
/*
@@ -3976,9 +4187,9 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode,
*
* NOTE: This function may sleep for memory allocation.
*/
-int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len)
+int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released)
{
- return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
+ return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0);
}
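
After this change the released/freed byte count travels through an out pointer (now a u64 rather than a truncating int return), while the return value carries only zero or a negative error, so a large byte count can no longer be mistaken for an errno. A hedged sketch of the resulting calling convention, with invented local names rather than the real btrfs call sites:

#include <stdio.h>
#include <stdint.h>

/*
 * Shape of the new API: status in the return value, byte count via an
 * optional out pointer (callers that don't care may pass NULL).
 */
static int release_range(uint64_t start, uint64_t len, uint64_t *released)
{
    (void)start;

    if (len == 0)
        return -22;            /* -EINVAL */
    if (released)
        *released = len;       /* pretend the whole range was reserved */
    return 0;
}

int main(void)
{
    uint64_t released = 0;
    int ret = release_range(0, 1U << 20, &released);

    if (ret < 0)
        printf("error %d\n", ret);
    else
        printf("released %llu bytes\n", (unsigned long long)released);

    /* A caller that only cares about success can pass NULL. */
    ret = release_range(0, 4096, NULL);
    return ret;
}
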
static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
@@ -4027,7 +4238,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
!is_fstree(root->root_key.objectid) || num_bytes == 0)
return 0;
@@ -4064,11 +4275,15 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
}
+/*
+ * Per-transaction meta reservations should all be freed at transaction commit
+ * time.
+ */
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
!is_fstree(root->root_key.objectid))
return;
@@ -4084,7 +4299,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
{
struct btrfs_fs_info *fs_info = root->fs_info;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
!is_fstree(root->root_key.objectid))
return;
@@ -4104,9 +4319,7 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
int num_bytes)
{
struct btrfs_qgroup *qgroup;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- int ret = 0;
+ LIST_HEAD(qgroup_list);
if (num_bytes == 0)
return;
@@ -4117,39 +4330,36 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
qgroup = find_qgroup_rb(fs_info, ref_root);
if (!qgroup)
goto out;
- ulist_reinit(fs_info->qgroup_ulist);
- ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
- qgroup_to_aux(qgroup), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
- struct btrfs_qgroup *qg;
- struct btrfs_qgroup_list *glist;
- qg = unode_aux_to_qgroup(unode);
+ qgroup_iterator_add(&qgroup_list, qgroup);
+ list_for_each_entry(qgroup, &qgroup_list, iterator) {
+ struct btrfs_qgroup_list *glist;
- qgroup_rsv_release(fs_info, qg, num_bytes,
+ qgroup_rsv_release(fs_info, qgroup, num_bytes,
BTRFS_QGROUP_RSV_META_PREALLOC);
- qgroup_rsv_add(fs_info, qg, num_bytes,
- BTRFS_QGROUP_RSV_META_PERTRANS);
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(fs_info->qgroup_ulist,
- glist->group->qgroupid,
- qgroup_to_aux(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- }
+ if (!sb_rdonly(fs_info->sb))
+ qgroup_rsv_add(fs_info, qgroup, num_bytes,
+ BTRFS_QGROUP_RSV_META_PERTRANS);
+
+ list_for_each_entry(glist, &qgroup->groups, next_group)
+ qgroup_iterator_add(&qgroup_list, glist->group);
}
out:
+ qgroup_iterator_clean(&qgroup_list);
spin_unlock(&fs_info->qgroup_lock);
}
+/*
+ * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS.
+ *
+ * This is called when a preallocated meta reservation needs to be used,
+ * normally after a btrfs_join_transaction() call.
+ */
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
!is_fstree(root->root_key.objectid))
return;
/* Same as btrfs_qgroup_free_meta_prealloc() */
@@ -4257,7 +4467,7 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
int level = btrfs_header_level(subvol_parent) - 1;
int ret = 0;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
@@ -4367,7 +4577,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
int ret = 0;
int i;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
return 0;
@@ -4450,3 +4660,61 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
}
*root = RB_ROOT;
}
+
+void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes)
+{
+ if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
+ return;
+
+ if (!is_fstree(root))
+ return;
+
+ btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA);
+}
+
+int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
+ struct btrfs_squota_delta *delta)
+{
+ int ret;
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup *qg;
+ LIST_HEAD(qgroup_list);
+ u64 root = delta->root;
+ u64 num_bytes = delta->num_bytes;
+ const int sign = (delta->is_inc ? 1 : -1);
+
+ if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
+ return 0;
+
+ if (!is_fstree(root))
+ return 0;
+
+ /* If the extent predates enabling quotas, don't count it. */
+ if (delta->generation < fs_info->qgroup_enable_gen)
+ return 0;
+
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = find_qgroup_rb(fs_info, root);
+ if (!qgroup) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = 0;
+ qgroup_iterator_add(&qgroup_list, qgroup);
+ list_for_each_entry(qg, &qgroup_list, iterator) {
+ struct btrfs_qgroup_list *glist;
+
+ qg->excl += num_bytes * sign;
+ qg->rfer += num_bytes * sign;
+ qgroup_dirty(fs_info, qg);
+
+ list_for_each_entry(glist, &qg->groups, next_group)
+ qgroup_iterator_add(&qgroup_list, glist->group);
+ }
+ qgroup_iterator_clean(&qgroup_list);
+
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+ return ret;
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 7bffa10589d6..be18c862e64e 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -101,8 +101,15 @@
* subtree rescan for them.
*/
-#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1UL << 3)
-#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1UL << 4)
+/*
+ * These flags share the flags field of the btrfs_qgroup_status_item with the
+ * persisted flags defined in btrfs_tree.h.
+ *
+ * To minimize the chance of collision with new persisted status flags, these
+ * count backwards from the MSB.
+ */
+#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 63)
+#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 62)
/*
* Record a dirty extent, and info qgroup to update quota on it
@@ -220,6 +227,33 @@ struct btrfs_qgroup {
struct list_head groups; /* groups this group is member of */
struct list_head members; /* groups that are members of this group */
struct list_head dirty; /* dirty groups */
+
+ /*
+ * For qgroup iteration usage.
+ *
+ * The iteration list should always be empty until qgroup_iterator_add()
+ * is called, and it should be reset to empty after the iteration is
+ * finished.
+ */
+ struct list_head iterator;
+
+ /*
+ * For nested iterator usage.
+ *
+ * Here we support at most one level of nested iterator calls like:
+ *
+ * LIST_HEAD(all_qgroups);
+ * {
+ * LIST_HEAD(local_qgroups);
+ * qgroup_iterator_add(local_qgroups, qg);
+ * qgroup_iterator_nested_add(all_qgroups, qg);
+ * do_some_work(local_qgroups);
+ * qgroup_iterator_clean(local_qgroups);
+ * }
+ * do_some_work(all_qgroups);
+ * qgroup_iterator_nested_clean(all_qgroups);
+ */
+ struct list_head nested_iterator;
struct rb_node node; /* tree of qgroups */
/*
@@ -235,6 +269,19 @@ struct btrfs_qgroup {
struct kobject kobj;
};
+struct btrfs_squota_delta {
+ /* The fstree root this delta counts against. */
+ u64 root;
+ /* The number of bytes in the extent being counted. */
+ u64 num_bytes;
+ /* The generation the extent was created in. */
+ u64 generation;
+ /* Whether we are using or freeing the extent. */
+ bool is_inc;
+ /* Whether the extent is data or metadata. */
+ bool is_data;
+};
+
static inline u64 btrfs_qgroup_subvolid(u64 qgroupid)
{
return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
@@ -249,14 +296,23 @@ enum {
ENUM_BIT(QGROUP_FREE),
};
-int btrfs_quota_enable(struct btrfs_fs_info *fs_info);
+enum btrfs_qgroup_mode {
+ BTRFS_QGROUP_MODE_DISABLED,
+ BTRFS_QGROUP_MODE_FULL,
+ BTRFS_QGROUP_MODE_SIMPLE
+};
+
+enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info);
+bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info);
+bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info);
+int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_quota_ctl_args *quota_ctl_args);
int btrfs_quota_disable(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
bool interruptible);
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
- u64 dst);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst);
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
u64 dst);
int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
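
Most of the qgroup.c churn in this patch replaces direct tests of BTRFS_FS_QUOTA_ENABLED with these three helpers, letting each path state which level of accounting it needs instead of only whether quotas exist. A hedged standalone model of how the predicates appear to relate, based on how the hunks above use them (the actual kernel implementation lives in qgroup.c and is not shown here):

#include <stdio.h>
#include <stdbool.h>

enum qgroup_mode { MODE_DISABLED, MODE_FULL, MODE_SIMPLE };

/* Assumed relationship between the helpers, modelled on their use above. */
static bool qgroup_enabled(enum qgroup_mode m)
{
    return m != MODE_DISABLED;
}

static bool qgroup_full_accounting(enum qgroup_mode m)
{
    return m == MODE_FULL;
}

int main(void)
{
    static const char *const names[] = { "disabled", "full", "simple" };

    /* Simple mode counts as enabled but skips the expensive backref accounting. */
    for (enum qgroup_mode m = MODE_DISABLED; m <= MODE_SIMPLE; m++)
        printf("%-8s enabled=%d full_accounting=%d\n",
               names[m], qgroup_enabled(m), qgroup_full_accounting(m));
    return 0;
}
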
@@ -267,80 +323,16 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
struct btrfs_delayed_extent_op;
-/*
- * Inform qgroup to trace one dirty extent, its info is recorded in @record.
- * So qgroup can account it at transaction committing time.
- *
- * No lock version, caller must acquire delayed ref lock and allocated memory,
- * then call btrfs_qgroup_trace_extent_post() after exiting lock context.
- *
- * Return 0 for success insert
- * Return >0 for existing record, caller can free @record safely.
- * Error is not possible
- */
int btrfs_qgroup_trace_extent_nolock(
struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_qgroup_extent_record *record);
-
-/*
- * Post handler after qgroup_trace_extent_nolock().
- *
- * NOTE: Current qgroup does the expensive backref walk at transaction
- * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming
- * new transaction.
- * This is designed to allow btrfs_find_all_roots() to get correct new_roots
- * result.
- *
- * However for old_roots there is no need to do backref walk at that time,
- * since we search commit roots to walk backref and result will always be
- * correct.
- *
- * Due to the nature of no lock version, we can't do backref there.
- * So we must call btrfs_qgroup_trace_extent_post() after exiting
- * spinlock context.
- *
- * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result
- * using current root, then we can move all expensive backref walk out of
- * transaction committing, but not now as qgroup accounting will be wrong again.
- */
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord);
-
-/*
- * Inform qgroup to trace one dirty extent, specified by @bytenr and
- * @num_bytes.
- * So qgroup can account it at commit trans time.
- *
- * Better encapsulated version, with memory allocation and backref walk for
- * commit roots.
- * So this can sleep.
- *
- * Return 0 if the operation is done.
- * Return <0 for error, like memory allocation failure or invalid parameter
- * (NULL trans)
- */
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes);
-
-/*
- * Inform qgroup to trace all leaf items of data
- *
- * Return 0 for success
- * Return <0 for error(ENOMEM)
- */
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
struct extent_buffer *eb);
-/*
- * Inform qgroup to trace a whole subtree, including all its child tree
- * blocks and data.
- * The root tree block is specified by @root_eb.
- *
- * Normally used by relocation(tree block swap) and subvolume deletion.
- *
- * Return 0 for success
- * Return <0 for error(ENOMEM or tree search error)
- */
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level);
@@ -350,7 +342,8 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans);
int btrfs_run_qgroups(struct btrfs_trans_handle *trans);
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
- u64 objectid, struct btrfs_qgroup_inherit *inherit);
+ u64 objectid, u64 inode_rootid,
+ struct btrfs_qgroup_inherit *inherit);
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes,
enum btrfs_qgroup_rsv_type type);
@@ -363,10 +356,10 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
/* New io_tree based accurate qgroup reserve API */
int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
-int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len);
+int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released);
int btrfs_qgroup_free_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start,
- u64 len);
+ u64 len, u64 *freed);
int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce);
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
@@ -408,20 +401,8 @@ static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root,
BTRFS_QGROUP_RSV_META_PREALLOC);
}
-/*
- * Per-transaction meta reservation should be all freed at transaction commit
- * time
- */
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);
-
-/*
- * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS.
- *
- * This is called when preallocated meta reservation needs to be used.
- * Normally after btrfs_join_transaction() call.
- */
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
-
void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode);
/* btrfs_qgroup_swapped_blocks related functions */
@@ -439,5 +420,8 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info);
+void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes);
+int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
+ struct btrfs_squota_delta *delta);
#endif
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
new file mode 100644
index 000000000000..9589362acfbf
--- /dev/null
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -0,0 +1,274 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/btrfs_tree.h>
+#include "ctree.h"
+#include "fs.h"
+#include "accessors.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "raid-stripe-tree.h"
+#include "volumes.h"
+#include "misc.h"
+#include "print-tree.h"
+
+int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *stripe_root = fs_info->stripe_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ u64 found_start;
+ u64 found_end;
+ u64 end = start + length;
+ int slot;
+ int ret;
+
+ if (!stripe_root)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (1) {
+ key.objectid = start;
+ key.type = BTRFS_RAID_STRIPE_KEY;
+ key.offset = length;
+
+ ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ ret = 0;
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ found_start = key.objectid;
+ found_end = found_start + key.offset;
+
+ /* That stripe ends before we start, so we're done. */
+ if (found_end <= start)
+ break;
+
+ trace_btrfs_raid_extent_delete(fs_info, start, end,
+ found_start, found_end);
+
+ ASSERT(found_start >= start && found_end <= end);
+ ret = btrfs_del_item(trans, stripe_root, path);
+ if (ret)
+ break;
+
+ btrfs_release_path(path);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_io_context *bioc)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_key stripe_key;
+ struct btrfs_root *stripe_root = fs_info->stripe_root;
+ const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type);
+ u8 encoding = btrfs_bg_flags_to_raid_index(bioc->map_type);
+ struct btrfs_stripe_extent *stripe_extent;
+ const size_t item_size = struct_size(stripe_extent, strides, num_stripes);
+ int ret;
+
+ stripe_extent = kzalloc(item_size, GFP_NOFS);
+ if (!stripe_extent) {
+ btrfs_abort_transaction(trans, -ENOMEM);
+ btrfs_end_transaction(trans);
+ return -ENOMEM;
+ }
+
+ trace_btrfs_insert_one_raid_extent(fs_info, bioc->logical, bioc->size,
+ num_stripes);
+ btrfs_set_stack_stripe_extent_encoding(stripe_extent, encoding);
+ for (int i = 0; i < num_stripes; i++) {
+ u64 devid = bioc->stripes[i].dev->devid;
+ u64 physical = bioc->stripes[i].physical;
+ u64 length = bioc->stripes[i].length;
+ struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i];
+
+ if (length == 0)
+ length = bioc->size;
+
+ btrfs_set_stack_raid_stride_devid(raid_stride, devid);
+ btrfs_set_stack_raid_stride_physical(raid_stride, physical);
+ }
+
+ stripe_key.objectid = bioc->logical;
+ stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+ stripe_key.offset = bioc->size;
+
+ ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
+ item_size);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+
+ kfree(stripe_extent);
+
+ return ret;
+}
+
+int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_ordered_extent *ordered_extent)
+{
+ struct btrfs_io_context *bioc;
+ int ret;
+
+ if (!btrfs_fs_incompat(trans->fs_info, RAID_STRIPE_TREE))
+ return 0;
+
+ list_for_each_entry(bioc, &ordered_extent->bioc_list, rst_ordered_entry) {
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret)
+ return ret;
+ }
+
+ while (!list_empty(&ordered_extent->bioc_list)) {
+ bioc = list_first_entry(&ordered_extent->bioc_list,
+ typeof(*bioc), rst_ordered_entry);
+ list_del(&bioc->rst_ordered_entry);
+ btrfs_put_bioc(bioc);
+ }
+
+ return 0;
+}
+
+int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length, u64 map_type,
+ u32 stripe_index, struct btrfs_io_stripe *stripe)
+{
+ struct btrfs_root *stripe_root = fs_info->stripe_root;
+ struct btrfs_stripe_extent *stripe_extent;
+ struct btrfs_key stripe_key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ const u64 end = logical + *length;
+ int num_stripes;
+ u8 encoding;
+ u64 offset;
+ u64 found_logical;
+ u64 found_length;
+ u64 found_end;
+ int slot;
+ int ret;
+
+ stripe_key.objectid = logical;
+ stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+ stripe_key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (stripe->is_scrub) {
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ }
+
+ ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0);
+ if (ret < 0)
+ goto free_path;
+ if (ret) {
+ if (path->slots[0] != 0)
+ path->slots[0]--;
+ }
+
+ while (1) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ found_logical = found_key.objectid;
+ found_length = found_key.offset;
+ found_end = found_logical + found_length;
+
+ if (found_logical > end) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (in_range(logical, found_logical, found_length))
+ break;
+
+ ret = btrfs_next_item(stripe_root, path);
+ if (ret)
+ goto out;
+ }
+
+ offset = logical - found_logical;
+
+ /*
+ * If we have a logically contiguous, but physically non-contiguous
+ * range, we need to split the bio. Record the length after which the
+ * split must happen.
+ */
+ if (end > found_end)
+ *length -= end - found_end;
+
+ num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
+ stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
+ encoding = btrfs_stripe_extent_encoding(leaf, stripe_extent);
+
+ if (encoding != btrfs_bg_flags_to_raid_index(map_type)) {
+ ret = -EUCLEAN;
+ btrfs_handle_fs_error(fs_info, ret,
+ "on-disk stripe encoding %d doesn't match RAID index %d",
+ encoding,
+ btrfs_bg_flags_to_raid_index(map_type));
+ goto out;
+ }
+
+ for (int i = 0; i < num_stripes; i++) {
+ struct btrfs_raid_stride *stride = &stripe_extent->strides[i];
+ u64 devid = btrfs_raid_stride_devid(leaf, stride);
+ u64 physical = btrfs_raid_stride_physical(leaf, stride);
+
+ if (devid != stripe->dev->devid)
+ continue;
+
+ if ((map_type & BTRFS_BLOCK_GROUP_DUP) && stripe_index != i)
+ continue;
+
+ stripe->physical = physical + offset;
+
+ trace_btrfs_get_raid_extent_offset(fs_info, logical, *length,
+ stripe->physical, devid);
+
+ ret = 0;
+ goto free_path;
+ }
+
+ /* If we're here, we haven't found the requested devid in the stripe. */
+ ret = -ENOENT;
+out:
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret && ret != -EIO && !stripe->is_scrub) {
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG))
+ btrfs_print_tree(leaf, 1);
+ btrfs_err(fs_info,
+ "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s",
+ logical, logical + *length, stripe->dev->devid,
+ btrfs_bg_type_to_raid_name(map_type));
+ }
+free_path:
+ btrfs_free_path(path);
+
+ return ret;
+}
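
The lookup above reduces to interval arithmetic: find the stripe item whose [found_logical, found_end) range covers the requested logical address, translate by the offset into that item, and shrink *length when the request runs past the item's end so the caller knows where to split the bio. A self-contained sketch of just that arithmetic, for a single device and with invented field names:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct stripe_item {
    uint64_t logical;   /* key.objectid in the tree */
    uint64_t length;    /* key.offset               */
    uint64_t physical;  /* per-device stride start  */
};

/* Map [logical, logical + *length) onto one stripe item, clamping *length. */
static bool map_raid_extent(const struct stripe_item *item,
                            uint64_t logical, uint64_t *length,
                            uint64_t *physical)
{
    uint64_t found_end = item->logical + item->length;
    uint64_t end = logical + *length;

    if (logical < item->logical || logical >= found_end)
        return false;           /* not covered by this item */

    if (end > found_end)        /* caller must split the bio here */
        *length -= end - found_end;

    *physical = item->physical + (logical - item->logical);
    return true;
}

int main(void)
{
    struct stripe_item item = { .logical = 1 << 20, .length = 64 << 10,
                                .physical = 8 << 20 };
    uint64_t len = 128 << 10;   /* runs past the end of the item */
    uint64_t phys;

    if (map_raid_extent(&item, (1 << 20) + 4096, &len, &phys))
        printf("physical=%llu mapped_len=%llu\n",
               (unsigned long long)phys, (unsigned long long)len);
    return 0;
}
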
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
new file mode 100644
index 000000000000..cdb58b38fcb5
--- /dev/null
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2023 Western Digital Corporation or its affiliates.
+ */
+
+#ifndef BTRFS_RAID_STRIPE_TREE_H
+#define BTRFS_RAID_STRIPE_TREE_H
+
+#define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \
+ BTRFS_BLOCK_GROUP_RAID1_MASK | \
+ BTRFS_BLOCK_GROUP_RAID0 | \
+ BTRFS_BLOCK_GROUP_RAID10)
+
+struct btrfs_io_context;
+struct btrfs_io_stripe;
+struct btrfs_ordered_extent;
+struct btrfs_trans_handle;
+
+int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length);
+int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length, u64 map_type,
+ u32 stripe_index, struct btrfs_io_stripe *stripe);
+int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_ordered_extent *ordered_extent);
+
+static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
+ u64 map_type)
+{
+ u64 type = map_type & BTRFS_BLOCK_GROUP_TYPE_MASK;
+ u64 profile = map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE))
+ return false;
+
+ if (type != BTRFS_BLOCK_GROUP_DATA)
+ return false;
+
+ if (profile & BTRFS_RST_SUPP_BLOCK_GROUP_MASK)
+ return true;
+
+ return false;
+}
+
+static inline int btrfs_num_raid_stripes(u32 item_size)
+{
+ return (item_size - offsetof(struct btrfs_stripe_extent, strides)) /
+ sizeof(struct btrfs_raid_stride);
+}
+
+#endif
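
btrfs_num_raid_stripes() is the inverse of the struct_size() computation used when the item is inserted: the item is a fixed btrfs_stripe_extent header followed by one btrfs_raid_stride per device, so dividing the remaining item bytes by the stride size recovers the stripe count. A quick standalone check of that round trip, with mock struct layouts chosen for illustration rather than the on-disk format:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* Mock layouts: a small header plus a flexible array of per-device strides. */
struct raid_stride { uint64_t devid; uint64_t physical; };
struct stripe_extent {
    uint8_t encoding;
    uint8_t reserved[7];
    struct raid_stride strides[];
};

static size_t item_size(int num_stripes)
{
    return offsetof(struct stripe_extent, strides) +
           (size_t)num_stripes * sizeof(struct raid_stride);
}

static int num_raid_stripes(size_t isize)
{
    return (isize - offsetof(struct stripe_extent, strides)) /
           sizeof(struct raid_stride);
}

int main(void)
{
    for (int n = 1; n <= 4; n++) {
        size_t sz = item_size(n);
        printf("num_stripes=%d -> item_size=%zu -> recovered=%d\n",
               n, sz, num_raid_stripes(sz));
    }
    return 0;
}
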
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 95d28497de7c..6486f0d7e993 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -485,6 +485,9 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
ret = add_shared_data_ref(fs_info, offset, count,
key->objectid, key->offset);
break;
+ case BTRFS_EXTENT_OWNER_REF_KEY:
+ WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ break;
default:
btrfs_err(fs_info, "invalid key type in iref");
ret = -EINVAL;
@@ -652,7 +655,7 @@ static void dump_block_entry(struct btrfs_fs_info *fs_info,
}
/*
- * btrfs_ref_tree_mod: called when we modify a ref for a bytenr
+ * Called when we modify a ref for a bytenr.
*
* This will add an action item to the given bytenr and do sanity checks to make
* sure we haven't messed something up. If we are making a new allocation and
@@ -681,10 +684,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
if (generic_ref->type == BTRFS_REF_METADATA) {
if (!parent)
- ref_root = generic_ref->tree_ref.owning_root;
+ ref_root = generic_ref->tree_ref.ref_root;
owner = generic_ref->tree_ref.level;
} else if (!parent) {
- ref_root = generic_ref->data_ref.owning_root;
+ ref_root = generic_ref->data_ref.ref_root;
owner = generic_ref->data_ref.ino;
offset = generic_ref->data_ref.offset;
}
@@ -791,6 +794,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
dump_ref_action(fs_info, ra);
kfree(ref);
kfree(ra);
+ kfree(re);
goto out_unlock;
} else if (be->num_refs == 0) {
btrfs_err(fs_info,
@@ -800,6 +804,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
dump_ref_action(fs_info, ra);
kfree(ref);
kfree(ra);
+ kfree(re);
goto out_unlock;
}
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 65d2bd6910f2..f88b0c2ac3fe 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -25,12 +25,11 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
const u64 olen,
int no_time_update)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
inode_inc_iversion(inode);
if (!no_time_update) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
}
/*
* We round up to the block size at eof when determining which
@@ -43,7 +42,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
}
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index c6d4bb8cbe29..f5d9e5f74a52 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -111,8 +111,8 @@ struct tree_block {
}; /* Use rb_simple_node for search/insert */
u64 owner;
struct btrfs_key key;
- unsigned int level:8;
- unsigned int key_ready:1;
+ u8 level;
+ bool key_ready;
};
#define MAX_EXTENTS 128
@@ -122,6 +122,13 @@ struct file_extent_cluster {
u64 end;
u64 boundary[MAX_EXTENTS];
unsigned int nr;
+ u64 owning_root;
+};
+
+/* Stages of data relocation. */
+enum reloc_stage {
+ MOVE_DATA_EXTENTS,
+ UPDATE_DATA_PTRS
};
struct reloc_control {
@@ -155,16 +162,12 @@ struct reloc_control {
u64 search_start;
u64 extents_found;
- unsigned int stage:8;
- unsigned int create_reloc_tree:1;
- unsigned int merge_reloc_tree:1;
- unsigned int found_file_extent:1;
+ enum reloc_stage stage;
+ bool create_reloc_tree;
+ bool merge_reloc_tree;
+ bool found_file_extent;
};
-/* stages of data relocation */
-#define MOVE_DATA_EXTENTS 0
-#define UPDATE_DATA_PTRS 1
-
static void mark_block_processed(struct reloc_control *rc,
struct btrfs_backref_node *node)
{
@@ -180,13 +183,6 @@ static void mark_block_processed(struct reloc_control *rc,
node->processed = 1;
}
-
-static void mapping_tree_init(struct mapping_tree *tree)
-{
- tree->rb_root = RB_ROOT;
- spin_lock_init(&tree->lock);
-}
-
/*
* walk up backref nodes until reach node presents tree root
*/
@@ -299,7 +295,7 @@ static int update_backref_cache(struct btrfs_trans_handle *trans,
return 1;
}
-static bool reloc_root_is_dead(struct btrfs_root *root)
+static bool reloc_root_is_dead(const struct btrfs_root *root)
{
/*
* Pair with set_bit/clear_bit in clean_dirty_subvols and
@@ -320,7 +316,7 @@ static bool reloc_root_is_dead(struct btrfs_root *root)
* from no reloc root. But btrfs_should_ignore_reloc_root() below is a
* special case.
*/
-static bool have_reloc_root(struct btrfs_root *root)
+static bool have_reloc_root(const struct btrfs_root *root)
{
if (reloc_root_is_dead(root))
return false;
@@ -329,31 +325,30 @@ static bool have_reloc_root(struct btrfs_root *root)
return true;
}
-int btrfs_should_ignore_reloc_root(struct btrfs_root *root)
+bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root)
{
struct btrfs_root *reloc_root;
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
- return 0;
+ return false;
/* This root has been merged with its reloc tree, we can ignore it */
if (reloc_root_is_dead(root))
- return 1;
+ return true;
reloc_root = root->reloc_root;
if (!reloc_root)
- return 0;
+ return false;
if (btrfs_header_generation(reloc_root->commit_root) ==
root->fs_info->running_transaction->transid)
- return 0;
+ return false;
/*
- * if there is reloc tree and it was created in previous
- * transaction backref lookup can find the reloc tree,
- * so backref node for the fs tree root is useless for
- * relocation.
+ * If there is reloc tree and it was created in previous transaction
+ * backref lookup can find the reloc tree, so backref node for the fs
+ * tree root is useless for relocation.
*/
- return 1;
+ return true;
}
/*
@@ -547,7 +542,7 @@ out:
*/
static int clone_backref_node(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
- struct btrfs_root *src,
+ const struct btrfs_root *src,
struct btrfs_root *dest)
{
struct btrfs_root *reloc_root = src->reloc_root;
@@ -632,7 +627,7 @@ fail:
/*
* helper to add 'address of tree root -> reloc tree' mapping
*/
-static int __must_check __add_reloc_root(struct btrfs_root *root)
+static int __add_reloc_root(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *rb_node;
@@ -1159,7 +1154,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(leaf, fi);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
- num_bytes, parent);
+ num_bytes, parent, root->root_key.objectid);
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
key.objectid, key.offset,
root->root_key.objectid, false);
@@ -1170,7 +1165,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
}
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
- num_bytes, parent);
+ num_bytes, parent, root->root_key.objectid);
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
key.objectid, key.offset,
root->root_key.objectid, false);
@@ -1181,15 +1176,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
}
}
if (dirty)
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (inode)
btrfs_add_delayed_iput(BTRFS_I(inode));
return ret;
}
-static noinline_for_stack
-int memcmp_node_keys(struct extent_buffer *eb, int slot,
- struct btrfs_path *path, int level)
+static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb,
+ int slot, const struct btrfs_path *path,
+ int level)
{
struct btrfs_disk_key key1;
struct btrfs_disk_key key2;
@@ -1374,16 +1369,17 @@ again:
*/
btrfs_set_node_blockptr(parent, slot, new_bytenr);
btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
btrfs_set_node_blockptr(path->nodes[level],
path->slots[level], old_bytenr);
btrfs_set_node_ptr_generation(path->nodes[level],
path->slots[level], old_ptr_gen);
- btrfs_mark_buffer_dirty(path->nodes[level]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[level]);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
- blocksize, path->nodes[level]->start);
+ blocksize, path->nodes[level]->start,
+ src->root_key.objectid);
btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
@@ -1392,7 +1388,7 @@ again:
break;
}
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
- blocksize, 0);
+ blocksize, 0, dest->root_key.objectid);
btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
true);
ret = btrfs_inc_extent_ref(trans, &ref);
@@ -1401,8 +1397,9 @@ again:
break;
}
+ /* We don't know the real owning_root, use 0. */
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
- blocksize, path->nodes[level]->start);
+ blocksize, path->nodes[level]->start, 0);
btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
0, true);
ret = btrfs_free_extent(trans, &ref);
@@ -1411,8 +1408,9 @@ again:
break;
}
+ /* We don't know the real owning_root, use 0. */
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
- blocksize, 0);
+ blocksize, 0, 0);
btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
0, true);
ret = btrfs_free_extent(trans, &ref);
@@ -1518,8 +1516,8 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
* [min_key, max_key)
*/
static int invalidate_extent_cache(struct btrfs_root *root,
- struct btrfs_key *min_key,
- struct btrfs_key *max_key)
+ const struct btrfs_key *min_key,
+ const struct btrfs_key *max_key)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct inode *inode = NULL;
@@ -1897,7 +1895,7 @@ again:
}
}
- rc->merge_reloc_tree = 1;
+ rc->merge_reloc_tree = true;
while (!list_empty(&rc->reloc_roots)) {
reloc_root = list_entry(rc->reloc_roots.next,
@@ -2517,11 +2515,12 @@ static int do_relocation(struct btrfs_trans_handle *trans,
node->eb->start);
btrfs_set_node_ptr_generation(upper->eb, slot,
trans->transid);
- btrfs_mark_buffer_dirty(upper->eb);
+ btrfs_mark_buffer_dirty(trans, upper->eb);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
node->eb->start, blocksize,
- upper->eb->start);
+ upper->eb->start,
+ btrfs_header_owner(upper->eb));
btrfs_init_tree_ref(&ref, node->level,
btrfs_header_owner(upper->eb),
root->root_key.objectid, false);
@@ -2633,7 +2632,7 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
u32 blocksize = rc->extent_root->fs_info->nodesize;
if (test_range_bit(&rc->processed_blocks, bytenr,
- bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
+ bytenr + blocksize - 1, EXTENT_DIRTY, NULL))
return 1;
return 0;
}
@@ -2660,7 +2659,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
else
btrfs_node_key_to_cpu(eb, &block->key, 0);
free_extent_buffer(eb);
- block->key_ready = 1;
+ block->key_ready = true;
return 0;
}
@@ -2830,7 +2829,7 @@ out_free_blocks:
static noinline_for_stack int prealloc_file_extent_cluster(
struct btrfs_inode *inode,
- struct file_extent_cluster *cluster)
+ const struct file_extent_cluster *cluster)
{
u64 alloc_hint = 0;
u64 start;
@@ -2965,7 +2964,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod
/*
* Allow error injection to test balance/relocation cancellation
*/
-noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
+noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info)
{
return atomic_read(&fs_info->balance_cancel_req) ||
atomic_read(&fs_info->reloc_cancel_req) ||
@@ -2973,7 +2972,7 @@ noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
}
ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
-static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster,
+static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
int cluster_nr)
{
/* Last extent, use cluster end directly */
@@ -2985,7 +2984,7 @@ static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster,
}
static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
- struct file_extent_cluster *cluster,
+ const struct file_extent_cluster *cluster,
int *cluster_nr, unsigned long page_index)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3120,7 +3119,7 @@ release_page:
}
static int relocate_file_extent_cluster(struct inode *inode,
- struct file_extent_cluster *cluster)
+ const struct file_extent_cluster *cluster)
{
u64 offset = BTRFS_I(inode)->index_cnt;
unsigned long index;
@@ -3158,11 +3157,12 @@ out:
return ret;
}
-static noinline_for_stack
-int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
- struct file_extent_cluster *cluster)
+static noinline_for_stack int relocate_data_extent(struct inode *inode,
+ const struct btrfs_key *extent_key,
+ struct file_extent_cluster *cluster)
{
int ret;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
ret = relocate_file_extent_cluster(inode, cluster);
@@ -3171,8 +3171,38 @@ int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
cluster->nr = 0;
}
- if (!cluster->nr)
+ /*
+ * Under simple quotas, we set root->relocation_src_root when we find
+ * the extent. If adjacent extents have different owners, we can't merge
+ * them while relocating. Handle this by storing the owning root that
+ * started a cluster and if we see an extent from a different root break
+ * started a cluster, and if we see an extent from a different root, break
+ *
+ * Without simple quotas, relocation_src_root is always 0, so we should
+ * never see a mismatch, and it should have no effect on relocation
+ * clusters.
+ */
+ if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) {
+ u64 tmp = root->relocation_src_root;
+
+ /*
+ * root->relocation_src_root is the state that actually affects
+ * the preallocation we do here, so set it to the root owning
+ * the cluster we need to relocate.
+ */
+ root->relocation_src_root = cluster->owning_root;
+ ret = relocate_file_extent_cluster(inode, cluster);
+ if (ret)
+ return ret;
+ cluster->nr = 0;
+ /* And reset it back for the current extent's owning root. */
+ root->relocation_src_root = tmp;
+ }
+
+ if (!cluster->nr) {
cluster->start = extent_key->objectid;
+ cluster->owning_root = root->relocation_src_root;
+ }
else
BUG_ON(cluster->nr >= MAX_EXTENTS);
cluster->end = extent_key->objectid + extent_key->offset - 1;
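The hunk above makes relocate_data_extent() flush the current cluster not only when the next extent is non-adjacent, but also when it is owned by a different root (which matters under simple quotas). The following standalone sketch models just that decision; the struct and the must_flush_cluster() helper are simplified stand-ins for illustration, not the btrfs definitions.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the btrfs structures used in the hunk above. */
struct cluster {
    uint64_t start;
    uint64_t end;
    uint64_t owning_root;
    unsigned int nr;
};

/*
 * Return true when the extent starting at @objectid, owned by @src_root,
 * cannot be appended to @c and the current cluster has to be relocated
 * first: either the extent is not adjacent to the cluster, or (under
 * simple quotas) it is owned by a different root than the one that
 * started the cluster.
 */
static bool must_flush_cluster(const struct cluster *c,
                               uint64_t objectid, uint64_t src_root)
{
    if (c->nr == 0)
        return false;           /* empty cluster, just start it */
    if (objectid != c->end + 1)
        return true;            /* non-adjacent extent */
    if (c->owning_root != src_root)
        return true;            /* different owning root */
    return false;
}

int main(void)
{
    struct cluster c = { .start = 100, .end = 199, .owning_root = 5, .nr = 1 };

    printf("%d\n", must_flush_cluster(&c, 200, 5)); /* 0: extend cluster */
    printf("%d\n", must_flush_cluster(&c, 200, 7)); /* 1: owner changed  */
    printf("%d\n", must_flush_cluster(&c, 300, 5)); /* 1: not adjacent   */
    return 0;
}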
@@ -3193,7 +3223,7 @@ int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
* the major work is getting the generation and level of the block
*/
static int add_tree_block(struct reloc_control *rc,
- struct btrfs_key *extent_key,
+ const struct btrfs_key *extent_key,
struct btrfs_path *path,
struct rb_root *blocks)
{
@@ -3278,7 +3308,7 @@ static int add_tree_block(struct reloc_control *rc,
block->key.objectid = rc->extent_root->fs_info->nodesize;
block->key.offset = generation;
block->level = level;
- block->key_ready = 0;
+ block->key_ready = false;
block->owner = owner;
rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
@@ -3444,11 +3474,10 @@ static int delete_v1_space_cache(struct extent_buffer *leaf,
/*
* helper to find all tree blocks that reference a given data extent
*/
-static noinline_for_stack
-int add_data_references(struct reloc_control *rc,
- struct btrfs_key *extent_key,
- struct btrfs_path *path,
- struct rb_root *blocks)
+static noinline_for_stack int add_data_references(struct reloc_control *rc,
+ const struct btrfs_key *extent_key,
+ struct btrfs_path *path,
+ struct rb_root *blocks)
{
struct btrfs_backref_walk_ctx ctx = { 0 };
struct ulist_iterator leaf_uiter;
@@ -3622,7 +3651,7 @@ int prepare_to_relocate(struct reloc_control *rc)
if (ret)
return ret;
- rc->create_reloc_tree = 1;
+ rc->create_reloc_tree = true;
set_reloc_control(rc);
trans = btrfs_join_transaction(rc->extent_root);
@@ -3702,6 +3731,21 @@ restart:
struct btrfs_extent_item);
flags = btrfs_extent_flags(path->nodes[0], ei);
+ /*
+ * If we are relocating a simple quota owned extent item, we
+ * need to note the owner on the reloc data root so that when
+ * we allocate the replacement item, we can attribute it to the
+ * correct eventual owner (rather than the reloc data root).
+ */
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
+ struct btrfs_root *root = BTRFS_I(rc->data_inode)->root;
+ u64 owning_root_id = btrfs_get_extent_owner_root(fs_info,
+ path->nodes[0],
+ path->slots[0]);
+
+ root->relocation_src_root = owning_root_id;
+ }
+
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
ret = add_tree_block(rc, &key, path, &blocks);
} else if (rc->stage == UPDATE_DATA_PTRS &&
@@ -3734,7 +3778,7 @@ restart:
if (rc->stage == MOVE_DATA_EXTENTS &&
(flags & BTRFS_EXTENT_FLAG_DATA)) {
- rc->found_file_extent = 1;
+ rc->found_file_extent = true;
ret = relocate_data_extent(rc->data_inode,
&key, &rc->cluster);
if (ret < 0) {
@@ -3771,7 +3815,7 @@ restart:
err = ret;
}
- rc->create_reloc_tree = 0;
+ rc->create_reloc_tree = false;
set_reloc_control(rc);
btrfs_backref_release_cache(&rc->backref_cache);
@@ -3789,7 +3833,7 @@ restart:
merge_reloc_roots(rc);
- rc->merge_reloc_tree = 0;
+ rc->merge_reloc_tree = false;
unset_reloc_control(rc);
btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
@@ -3835,7 +3879,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
BTRFS_INODE_PREALLOC);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
return ret;
@@ -3874,9 +3918,9 @@ out:
* helper to create inode for data relocation.
* the inode is in data relocation tree and its link count is 0
*/
-static noinline_for_stack
-struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group *group)
+static noinline_for_stack struct inode *create_reloc_inode(
+ struct btrfs_fs_info *fs_info,
+ const struct btrfs_block_group *group)
{
struct inode *inode = NULL;
struct btrfs_trans_handle *trans;
@@ -3971,8 +4015,9 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&rc->reloc_roots);
INIT_LIST_HEAD(&rc->dirty_subvol_roots);
- btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
- mapping_tree_init(&rc->reloc_root_tree);
+ btrfs_backref_init_cache(fs_info, &rc->backref_cache, true);
+ rc->reloc_root_tree.rb_root = RB_ROOT;
+ spin_lock_init(&rc->reloc_root_tree.lock);
extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS);
return rc;
}
@@ -4004,7 +4049,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info,
block_group->start, buf);
}
-static const char *stage_to_string(int stage)
+static const char *stage_to_string(enum reloc_stage stage)
{
if (stage == MOVE_DATA_EXTENTS)
return "move data extents";
@@ -4120,7 +4165,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
WARN_ON(ret && ret != -EAGAIN);
while (1) {
- int finishes_stage;
+ enum reloc_stage finishes_stage;
mutex_lock(&fs_info->cleaner_mutex);
ret = relocate_block_group(rc);
@@ -4303,7 +4348,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
goto out_unset;
}
- rc->merge_reloc_tree = 1;
+ rc->merge_reloc_tree = true;
while (!list_empty(&reloc_roots)) {
reloc_root = list_entry(reloc_roots.next,
@@ -4422,7 +4467,8 @@ int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
}
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *buf,
+ struct btrfs_root *root,
+ const struct extent_buffer *buf,
struct extent_buffer *cow)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4561,7 +4607,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
*
* Return U64_MAX if no running relocation.
*/
-u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info)
+u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
{
u64 logical = U64_MAX;
diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
index 77d69f6ae967..5fb60f2deb53 100644
--- a/fs/btrfs/relocation.h
+++ b/fs/btrfs/relocation.h
@@ -10,15 +10,16 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
int btrfs_recover_relocation(struct btrfs_fs_info *fs_info);
int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered);
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *buf,
+ struct btrfs_root *root,
+ const struct extent_buffer *buf,
struct extent_buffer *cow);
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve);
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
-int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info);
+int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info);
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr);
-int btrfs_should_ignore_reloc_root(struct btrfs_root *root);
-u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info);
+bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root);
+u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 859874579456..603ad1459368 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -51,7 +51,8 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
}
/*
- * btrfs_find_root - lookup the root by the key.
+ * Lookup the root by the key.
+ *
* root: the root of the root tree
* search_key: the key to search
* path: the path we search
@@ -191,7 +192,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
write_extent_buffer(l, item, ptr, sizeof(*item));
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
out:
btrfs_free_path(path);
return ret;
@@ -438,7 +439,7 @@ again:
btrfs_set_root_ref_name_len(leaf, ref, name->len);
ptr = (unsigned long)(ref + 1);
write_extent_buffer(leaf, name->name, ptr, name->len);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
btrfs_release_path(path);
@@ -485,7 +486,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
}
/*
- * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
+ * Reserve space for subvolume operation.
+ *
* root: the root of the parent directory
* rsv: block reservation
* items: the number of items that we need do reservation
@@ -508,7 +510,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+ if (btrfs_qgroup_enabled(fs_info)) {
/* One for parent inode, two for dir entries */
qgroup_num_bytes = 3 * fs_info->nodesize;
ret = btrfs_qgroup_reserve_meta_prealloc(root,
diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h
index cbbaca32126e..8b2c3859e464 100644
--- a/fs/btrfs/root-tree.h
+++ b/fs/btrfs/root-tree.h
@@ -3,6 +3,8 @@
#ifndef BTRFS_ROOT_TREE_H
#define BTRFS_ROOT_TREE_H
+struct fscrypt_str;
+
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int nitems, bool use_global_rsv);
@@ -18,10 +20,8 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key
int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key,
struct btrfs_root_item *item);
-int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_key *key,
- struct btrfs_root_item *item);
+int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_key *key, struct btrfs_root_item *item);
int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
struct btrfs_path *path, struct btrfs_root_item *root_item,
struct btrfs_key *root_key);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b877203f1dc5..f62a408671cb 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -16,7 +16,6 @@
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
-#include "check-integrity.h"
#include "raid56.h"
#include "block-group.h"
#include "zoned.h"
@@ -24,6 +23,7 @@
#include "accessors.h"
#include "file-item.h"
#include "scrub.h"
+#include "raid-stripe-tree.h"
/*
* This is only the first step towards a full-features scrub. It reads all
@@ -897,7 +897,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
ASSERT(stripe->mirror_num >= 1);
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
stripe->logical, &mapped_len, &bioc,
- NULL, NULL, 1);
+ NULL, NULL);
/*
* If we failed, dev will be NULL, and later detailed reports
* will just be skipped.
@@ -1635,6 +1635,71 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe)
}
}
+static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
+ struct scrub_stripe *stripe)
+{
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ struct btrfs_bio *bbio = NULL;
+ u64 stripe_len = BTRFS_STRIPE_LEN;
+ int mirror = stripe->mirror_num;
+ int i;
+
+ atomic_inc(&stripe->pending_io);
+
+ for_each_set_bit(i, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
+ struct page *page = scrub_stripe_get_page(stripe, i);
+ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i);
+
+ /* The current sector cannot be merged, submit the bio. */
+ if (bbio &&
+ ((i > 0 &&
+ !test_bit(i - 1, &stripe->extent_sector_bitmap)) ||
+ bbio->bio.bi_iter.bi_size >= stripe_len)) {
+ ASSERT(bbio->bio.bi_iter.bi_size);
+ atomic_inc(&stripe->pending_io);
+ btrfs_submit_bio(bbio, mirror);
+ bbio = NULL;
+ }
+
+ if (!bbio) {
+ struct btrfs_io_stripe io_stripe = {};
+ struct btrfs_io_context *bioc = NULL;
+ const u64 logical = stripe->logical +
+ (i << fs_info->sectorsize_bits);
+ int err;
+
+ bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
+ fs_info, scrub_read_endio, stripe);
+ bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
+
+ io_stripe.is_scrub = true;
+ err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
+ &stripe_len, &bioc, &io_stripe,
+ &mirror);
+ btrfs_put_bioc(bioc);
+ if (err) {
+ btrfs_bio_end_io(bbio,
+ errno_to_blk_status(err));
+ return;
+ }
+ }
+
+ __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
+ }
+
+ if (bbio) {
+ ASSERT(bbio->bio.bi_iter.bi_size);
+ atomic_inc(&stripe->pending_io);
+ btrfs_submit_bio(bbio, mirror);
+ }
+
+ if (atomic_dec_and_test(&stripe->pending_io)) {
+ wake_up(&stripe->io_wait);
+ INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
+ queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
+ }
+}
+
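scrub_submit_extent_sector_read() above walks the extent sector bitmap and batches contiguous sectors into a single bio, submitting whenever it hits a gap in the bitmap or the per-stripe size limit. Below is a minimal userspace model of that batching loop, assuming a toy 16-bit bitmap and a made-up submit_batch() placeholder in place of the btrfs bio machinery.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_SECTORS  16
#define MAX_BATCH   4   /* stand-in for the BTRFS_STRIPE_LEN size cap */

/* Placeholder for the bio submission: here we just report the range. */
static void submit_batch(unsigned int first, unsigned int count)
{
    printf("submit sectors [%u, %u)\n", first, first + count);
}

/*
 * Batch the set bits of @bitmap into contiguous runs, flushing a run when
 * a cleared bit (gap) is found or the run reaches MAX_BATCH sectors.
 */
static void submit_extent_sectors(uint32_t bitmap)
{
    unsigned int first = 0, count = 0;

    for (unsigned int i = 0; i < NR_SECTORS; i++) {
        bool set = bitmap & (1u << i);

        if (count && (!set || count >= MAX_BATCH)) {
            submit_batch(first, count);
            count = 0;
        }
        if (set) {
            if (!count)
                first = i;
            count++;
        }
    }
    if (count)
        submit_batch(first, count);
}

int main(void)
{
    submit_extent_sectors(0x0ff5);  /* sectors 0, 2 and 4-11 */
    return 0;
}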
static void scrub_submit_initial_read(struct scrub_ctx *sctx,
struct scrub_stripe *stripe)
{
@@ -1646,6 +1711,11 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
ASSERT(stripe->mirror_num > 0);
ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
+ if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) {
+ scrub_submit_extent_sector_read(sctx, stripe);
+ return;
+ }
+
bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
scrub_read_endio, stripe);
@@ -1798,6 +1868,9 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
*/
ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES);
+ /* @found_logical_ret must be specified. */
+ ASSERT(found_logical_ret);
+
stripe = &sctx->stripes[sctx->cur_stripe];
scrub_reset_stripe(stripe);
ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path,
@@ -1806,8 +1879,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
/* Either >0 as no more extents or <0 for error. */
if (ret)
return ret;
- if (found_logical_ret)
- *found_logical_ret = stripe->logical;
+ *found_logical_ret = stripe->logical;
sctx->cur_stripe++;
/* We filled one group, submit it. */
@@ -1952,7 +2024,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
- &length, &bioc, NULL, NULL, 1);
+ &length, &bioc, NULL, NULL);
if (ret < 0) {
btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
@@ -2010,7 +2082,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
/* Go through each extent items inside the logical range */
while (cur_logical < logical_end) {
- u64 found_logical;
+ u64 found_logical = U64_MAX;
u64 cur_physical = physical + cur_logical - logical_start;
/* Canceled? */
@@ -2045,6 +2117,8 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
if (ret < 0)
break;
+ /* queue_scrub_stripe() returned 0, @found_logical must be updated. */
+ ASSERT(found_logical != U64_MAX);
cur_logical = found_logical + BTRFS_STRIPE_LEN;
/* Don't hold CPU for too long time */
@@ -2717,7 +2791,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
if (scrub_dev->fs_devices != fs_info->fs_devices)
gen = scrub_dev->generation;
else
- gen = fs_info->last_trans_committed;
+ gen = btrfs_get_last_trans_committed(fs_info);
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 3a566150c531..4e36550618e5 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -796,7 +796,7 @@ static int send_cmd(struct send_ctx *sctx)
put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
put_unaligned_le32(0, &hdr->crc);
- crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
+ crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
put_unaligned_le32(crc, &hdr->crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
@@ -5669,8 +5669,8 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr));
hdr->crc = 0;
- crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size);
- crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes);
+ crc = crc32c(0, sctx->send_buf, sctx->send_size);
+ crc = crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes);
hdr->crc = cpu_to_le32(crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
@@ -8158,7 +8158,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
}
sctx->send_filp = fget(arg->send_fd);
- if (!sctx->send_filp) {
+ if (!sctx->send_filp || !(sctx->send_filp->f_mode & FMODE_WRITE)) {
ret = -EBADF;
goto out;
}
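The send.c hunks switch the btrfs_crc32c() wrapper to plain crc32c() while keeping the usual ordering: zero the crc field in the command header, checksum the buffer, then store the result. The sketch below shows that ordering with a self-contained bitwise CRC-32C; the header layout here is a toy one and the in-kernel crc32c() seeding conventions may differ, so treat this as an illustration of the pattern rather than of the send stream format.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy command layout: a small header followed by an opaque payload. */
struct cmd_header {
    uint32_t len;
    uint32_t crc;
};

union cmd_buf {
    struct cmd_header hdr;
    uint8_t bytes[64];
};

/* Bitwise CRC-32C (Castagnoli), reflected polynomial 0x82F63B78. */
static uint32_t crc32c_sw(uint32_t crc, const uint8_t *p, size_t len)
{
    crc = ~crc;
    while (len--) {
        crc ^= *p++;
        for (int i = 0; i < 8; i++)
            crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1u));
    }
    return ~crc;
}

/*
 * Zero the crc field so it does not contribute to the checksum, sum the
 * whole buffer, then store the result back into the header -- the same
 * order of operations as in the send_cmd() hunk above.
 */
static void seal_command(union cmd_buf *cmd, size_t size)
{
    cmd->hdr.crc = 0;
    cmd->hdr.crc = crc32c_sw(0, cmd->bytes, size);
}

int main(void)
{
    union cmd_buf cmd = { .hdr = { .len = 7, .crc = 0 } };

    memcpy(cmd.bytes + sizeof(cmd.hdr), "payload", 7);
    seal_command(&cmd, sizeof(cmd.hdr) + 7);
    printf("crc = 0x%08x\n", cmd.hdr.crc);
    return 0;
}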
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index d7e8cd4f140c..571bb13587d5 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -345,8 +345,10 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
enum btrfs_reserve_flush_enum flush)
{
+ struct btrfs_space_info *data_sinfo;
u64 profile;
u64 avail;
+ u64 data_chunk_size;
int factor;
if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
@@ -364,6 +366,36 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
*/
factor = btrfs_bg_type_to_factor(profile);
avail = div_u64(avail, factor);
+ if (avail == 0)
+ return 0;
+
+ /*
+ * Calculate the data_chunk_size: space_info->chunk_size is the
+ * "optimal" chunk size based on the fs size. However when we actually
+ * allocate the chunk we will strip this down further, making it no more
+ * than 10% of the disk or 1G, whichever is smaller.
+ */
+ data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
+ data_chunk_size = min(data_sinfo->chunk_size,
+ mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
+ data_chunk_size = min_t(u64, data_chunk_size, SZ_1G);
+
+ /*
+ * Since data allocations immediately use block groups as part of the
+ * reservation (because we assume that data reservations will == actual
+ * usage), we could potentially overcommit and then immediately have that
+ * available space used by a data allocation, which could put us in a
+ * bind when we get close to filling the file system.
+ *
+ * To handle this, simply remove the data_chunk_size from the available
+ * space. If we are relatively empty this won't affect our ability to
+ * overcommit much, and if we're very close to full it'll keep us from
+ * getting into a position where we've given ourselves very little
+ * metadata wiggle room.
+ */
+ if (avail <= data_chunk_size)
+ return 0;
+ avail -= data_chunk_size;
/*
* If we aren't flushing all things, let us overcommit up to
@@ -556,18 +588,6 @@ static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info,
return nr;
}
-static inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info,
- u64 to_reclaim)
-{
- const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1);
- u64 nr;
-
- nr = div64_u64(to_reclaim, bytes);
- if (!nr)
- nr = 1;
- return nr;
-}
-
#define EXTENT_SIZE_PER_ITEM SZ_256K
/*
@@ -749,10 +769,9 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
}
if (state == FLUSH_DELAYED_REFS_NR)
- nr = calc_delayed_refs_nr(fs_info, num_bytes);
+ btrfs_run_delayed_refs(trans, num_bytes);
else
- nr = 0;
- btrfs_run_delayed_refs(trans, nr);
+ btrfs_run_delayed_refs(trans, 0);
btrfs_end_transaction(trans);
break;
case ALLOC_CHUNK:
@@ -978,7 +997,8 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
}
/*
- * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
+ * We've exhausted our flushing, start failing tickets.
+ *
* @fs_info - fs_info for this fs
* @space_info - the space info we were flushing
*
@@ -1742,7 +1762,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* Try to reserve metadata bytes from the block_rsv's space.
*
* @fs_info: the filesystem
- * @block_rsv: block_rsv we're allocating for
+ * @space_info: the space_info we're allocating for
* @orig_bytes: number of bytes we want
* @flush: whether or not we can flush to make our reservation
*
@@ -1754,21 +1774,19 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* space already.
*/
int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
+ struct btrfs_space_info *space_info,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
{
int ret;
- ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
+ ret = __reserve_bytes(fs_info, space_info, orig_bytes, flush);
if (ret == -ENOSPC) {
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
- block_rsv->space_info->flags,
- orig_bytes, 1);
+ space_info->flags, orig_bytes, 1);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
- btrfs_dump_space_info(fs_info, block_rsv->space_info,
- orig_bytes, 0);
+ btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0);
}
return ret;
}
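The calc_available_free_space() change above stops overcommitting the space that the next data chunk allocation is expected to consume: the data chunk size is capped at the smaller of the space_info chunk size, 10% of the writable bytes and 1 GiB, and then subtracted from the available space. A standalone sketch of that arithmetic follows; the helper name and the sample numbers are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define SZ_1G   (1024ULL * 1024 * 1024)

static uint64_t min_u64(uint64_t a, uint64_t b)
{
    return a < b ? a : b;
}

/*
 * Mirror the adjustment added above: cap the expected data chunk size at
 * 10% of the writable device bytes and at 1 GiB, then remove it from the
 * space we are willing to overcommit against.
 */
static uint64_t avail_for_overcommit(uint64_t avail, uint64_t data_chunk_size,
                                     uint64_t total_rw_bytes)
{
    uint64_t cap;

    if (avail == 0)
        return 0;

    cap = min_u64(data_chunk_size, total_rw_bytes / 10);
    cap = min_u64(cap, SZ_1G);

    return avail <= cap ? 0 : avail - cap;
}

int main(void)
{
    /* 100 GiB device, 4 GiB unallocated, 10 GiB "optimal" chunk size. */
    uint64_t avail = 4 * SZ_1G;
    uint64_t total = 100 * SZ_1G;

    printf("%llu bytes usable for overcommit\n",
           (unsigned long long)avail_for_overcommit(avail, 10 * SZ_1G, total));
    return 0;
}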
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 0bb9d14e60a8..92c595fed1b0 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -3,6 +3,7 @@
#ifndef BTRFS_SPACE_INFO_H
#define BTRFS_SPACE_INFO_H
+#include <trace/events/btrfs.h>
#include "volumes.h"
/*
@@ -212,7 +213,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
+ struct btrfs_space_info *space_info,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush);
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1a093ec0f7e3..ef256b944c72 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -26,6 +26,7 @@
#include <linux/ratelimit.h>
#include <linux/crc32c.h>
#include <linux/btrfs.h>
+#include <linux/security.h>
#include "messages.h"
#include "delayed-inode.h"
#include "ctree.h"
@@ -79,7 +80,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data);
static void btrfs_put_super(struct super_block *sb)
{
- close_ctree(btrfs_sb(sb));
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+
+ btrfs_info(fs_info, "last unmount of filesystem %pU", fs_info->fs_devices->fsid);
+ close_ctree(fs_info);
}
enum {
@@ -129,9 +133,6 @@ enum {
Opt_inode_cache, Opt_noinode_cache,
/* Debugging options */
- Opt_check_integrity,
- Opt_check_integrity_including_extent_data,
- Opt_check_integrity_print_mask,
Opt_enospc_debug, Opt_noenospc_debug,
#ifdef CONFIG_BTRFS_DEBUG
Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
@@ -200,9 +201,6 @@ static const match_table_t tokens = {
{Opt_recovery, "recovery"},
/* Debugging options */
- {Opt_check_integrity, "check_int"},
- {Opt_check_integrity_including_extent_data, "check_int_data"},
- {Opt_check_integrity_print_mask, "check_int_print_mask=%u"},
{Opt_enospc_debug, "enospc_debug"},
{Opt_noenospc_debug, "noenospc_debug"},
#ifdef CONFIG_BTRFS_DEBUG
@@ -707,44 +705,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_skip_balance:
btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
break;
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- case Opt_check_integrity_including_extent_data:
- btrfs_warn(info,
- "integrity checker is deprecated and will be removed in 6.7");
- btrfs_info(info,
- "enabling check integrity including extent data");
- btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA);
- btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
- break;
- case Opt_check_integrity:
- btrfs_warn(info,
- "integrity checker is deprecated and will be removed in 6.7");
- btrfs_info(info, "enabling check integrity");
- btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
- break;
- case Opt_check_integrity_print_mask:
- ret = match_int(&args[0], &intarg);
- if (ret) {
- btrfs_err(info,
- "unrecognized check_integrity_print_mask value %s",
- args[0].from);
- goto out;
- }
- info->check_integrity_print_mask = intarg;
- btrfs_warn(info,
- "integrity checker is deprecated and will be removed in 6.7");
- btrfs_info(info, "check_integrity_print_mask 0x%x",
- info->check_integrity_print_mask);
- break;
-#else
- case Opt_check_integrity_including_extent_data:
- case Opt_check_integrity:
- case Opt_check_integrity_print_mask:
- btrfs_err(info,
- "support for check_integrity* not compiled in!");
- ret = -EINVAL;
- goto out;
-#endif
case Opt_fatal_errors:
if (strcmp(args[0].from, "panic") == 0) {
btrfs_set_opt(info->mount_opt,
@@ -889,7 +849,7 @@ static int btrfs_parse_device_options(const char *options, blk_mode_t flags)
error = -ENOMEM;
goto out;
}
- device = btrfs_scan_one_device(device_name, flags);
+ device = btrfs_scan_one_device(device_name, flags, false);
kfree(device_name);
if (IS_ERR(device)) {
error = PTR_ERR(device);
@@ -1305,15 +1265,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",autodefrag");
if (btrfs_test_opt(info, SKIP_BALANCE))
seq_puts(seq, ",skip_balance");
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(info, CHECK_INTEGRITY_DATA))
- seq_puts(seq, ",check_int_data");
- else if (btrfs_test_opt(info, CHECK_INTEGRITY))
- seq_puts(seq, ",check_int");
- if (info->check_integrity_print_mask)
- seq_printf(seq, ",check_int_print_mask=%d",
- info->check_integrity_print_mask);
-#endif
if (info->metadata_ratio)
seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio);
if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR))
@@ -1484,7 +1435,12 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
goto error_fs_info;
}
- device = btrfs_scan_one_device(device_name, mode);
+ /*
+ * With 'true' passed to btrfs_scan_one_device() (mount time) we expect
+ * either a valid device or an error.
+ */
+ device = btrfs_scan_one_device(device_name, mode, true);
+ ASSERT(device != NULL);
if (IS_ERR(device)) {
mutex_unlock(&uuid_mutex);
error = PTR_ERR(device);
@@ -1519,7 +1475,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
error = -EBUSY;
} else {
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
- shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name,
+ shrinker_debugfs_rename(s->s_shrink, "sb-%s:%s", fs_type->name,
s->s_id);
btrfs_sb(s)->bdev_holder = fs_type;
error = btrfs_fill_super(s, fs_devices, data);
@@ -2196,7 +2152,11 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
switch (cmd) {
case BTRFS_IOC_SCAN_DEV:
mutex_lock(&uuid_mutex);
- device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ);
+ /*
+ * Scanning outside of mount can return NULL which would turn
+ * into 0 error code.
+ */
+ device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
ret = PTR_ERR_OR_ZERO(device);
mutex_unlock(&uuid_mutex);
break;
@@ -2210,8 +2170,12 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
break;
case BTRFS_IOC_DEVICES_READY:
mutex_lock(&uuid_mutex);
- device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ);
- if (IS_ERR(device)) {
+ /*
+ * Scanning outside of mount can return NULL which would turn
+ * into 0 error code.
+ */
+ device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
+ if (IS_ERR_OR_NULL(device)) {
mutex_unlock(&uuid_mutex);
ret = PTR_ERR(device);
break;
@@ -2256,6 +2220,7 @@ static int check_dev_super(struct btrfs_device *dev)
{
struct btrfs_fs_info *fs_info = dev->fs_info;
struct btrfs_super_block *sb;
+ u64 last_trans;
u16 csum_type;
int ret = 0;
@@ -2291,10 +2256,10 @@ static int check_dev_super(struct btrfs_device *dev)
if (ret < 0)
goto out;
- if (btrfs_super_generation(sb) != fs_info->last_trans_committed) {
+ last_trans = btrfs_get_last_trans_committed(fs_info);
+ if (btrfs_super_generation(sb) != last_trans) {
btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
- btrfs_super_generation(sb),
- fs_info->last_trans_committed);
+ btrfs_super_generation(sb), last_trans);
ret = -EUCLEAN;
goto out;
}
@@ -2404,9 +2369,6 @@ static int __init btrfs_print_mod_info(void)
#ifdef CONFIG_BTRFS_ASSERT
", assert=on"
#endif
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- ", integrity-checker=on"
-#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
", ref-verify=on"
#endif
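btrfs_scan_one_device() can now legitimately return NULL when called outside of mount, and the comments above warn that a plain PTR_ERR()-style conversion would silently turn that NULL into 0. The userspace sketch below only demonstrates how the generic ERR_PTR helpers behave for the three possible outcomes; these reimplementations are simplified stand-ins for linux/err.h, not the btrfs code.

#include <errno.h>
#include <stdio.h>

/* Simplified userspace stand-ins for the linux/err.h helpers. */
#define MAX_ERRNO   4095

static void *ERR_PTR(long error)            { return (void *)error; }
static long PTR_ERR(const void *ptr)        { return (long)ptr; }
static int IS_ERR(const void *ptr)
{
    return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}
static int IS_ERR_OR_NULL(const void *ptr)  { return !ptr || IS_ERR(ptr); }
static long PTR_ERR_OR_ZERO(const void *ptr){ return IS_ERR(ptr) ? PTR_ERR(ptr) : 0; }

int main(void)
{
    void *ok = &ok;                 /* some valid pointer */
    void *err = ERR_PTR(-EINVAL);   /* scan failed */
    void *none = NULL;              /* nothing found (outside of mount) */

    /* PTR_ERR_OR_ZERO() maps both a valid pointer and NULL to 0 ... */
    printf("ok:   %ld\n", PTR_ERR_OR_ZERO(ok));     /* 0 */
    printf("err:  %ld\n", PTR_ERR_OR_ZERO(err));    /* -22 */
    printf("none: %ld\n", PTR_ERR_OR_ZERO(none));   /* 0, not an error */

    /* ... so a caller that must treat "no device" specially checks both. */
    printf("none is err-or-null: %d\n", IS_ERR_OR_NULL(none));  /* 1 */
    return 0;
}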
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index b1d1ac25237b..e6b51fb3ddc1 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -291,12 +291,15 @@ BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE);
BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
+BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA);
#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
#endif
#ifdef CONFIG_BTRFS_DEBUG
/* Remove once support for extent tree v2 is feature complete */
BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2);
+/* Remove once support for raid stripe tree is feature complete. */
+BTRFS_FEAT_ATTR_INCOMPAT(raid_stripe_tree, RAID_STRIPE_TREE);
#endif
#ifdef CONFIG_FS_VERITY
BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
@@ -322,11 +325,13 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(free_space_tree),
BTRFS_FEAT_ATTR_PTR(raid1c34),
BTRFS_FEAT_ATTR_PTR(block_group_tree),
+ BTRFS_FEAT_ATTR_PTR(simple_quota),
#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_PTR(zoned),
#endif
#ifdef CONFIG_BTRFS_DEBUG
BTRFS_FEAT_ATTR_PTR(extent_tree_v2),
+ BTRFS_FEAT_ATTR_PTR(raid_stripe_tree),
#endif
#ifdef CONFIG_FS_VERITY
BTRFS_FEAT_ATTR_PTR(verity),
@@ -420,6 +425,13 @@ static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *bu
}
BTRFS_ATTR(static_feature, acl, acl_show);
+static ssize_t temp_fsid_supported_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ return sysfs_emit(buf, "0\n");
+}
+BTRFS_ATTR(static_feature, temp_fsid, temp_fsid_supported_show);
+
/*
* Features which only depend on kernel version.
*
@@ -433,6 +445,7 @@ static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, send_stream_version),
BTRFS_ATTR_PTR(static_feature, supported_rescue_options),
BTRFS_ATTR_PTR(static_feature, supported_sectorsizes),
+ BTRFS_ATTR_PTR(static_feature, temp_fsid),
NULL
};
@@ -1196,10 +1209,19 @@ static ssize_t btrfs_generation_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return sysfs_emit(buf, "%llu\n", fs_info->generation);
+ return sysfs_emit(buf, "%llu\n", btrfs_get_fs_generation(fs_info));
}
BTRFS_ATTR(, generation, btrfs_generation_show);
+static ssize_t btrfs_temp_fsid_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return sysfs_emit(buf, "%d\n", fs_info->fs_devices->temp_fsid);
+}
+BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
+
static const char * const btrfs_read_policy_name[] = { "pid" };
static ssize_t btrfs_read_policy_show(struct kobject *kobj,
@@ -1302,6 +1324,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, read_policy),
BTRFS_ATTR_PTR(, bg_reclaim_threshold),
BTRFS_ATTR_PTR(, commit_stats),
+ BTRFS_ATTR_PTR(, temp_fsid),
NULL,
};
@@ -2086,6 +2109,33 @@ static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj,
}
BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show);
+static ssize_t qgroup_mode_show(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ ssize_t ret = 0;
+
+ spin_lock(&fs_info->qgroup_lock);
+ ASSERT(btrfs_qgroup_enabled(fs_info));
+ switch (btrfs_qgroup_mode(fs_info)) {
+ case BTRFS_QGROUP_MODE_FULL:
+ ret = sysfs_emit(buf, "qgroup\n");
+ break;
+ case BTRFS_QGROUP_MODE_SIMPLE:
+ ret = sysfs_emit(buf, "squota\n");
+ break;
+ default:
+ btrfs_warn(fs_info, "unexpected qgroup mode %d\n",
+ btrfs_qgroup_mode(fs_info));
+ break;
+ }
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return ret;
+}
+BTRFS_ATTR(qgroups, mode, qgroup_mode_show);
+
static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj,
struct kobj_attribute *a,
char *buf)
@@ -2148,6 +2198,7 @@ static struct attribute *qgroups_attrs[] = {
BTRFS_ATTR_PTR(qgroups, enabled),
BTRFS_ATTR_PTR(qgroups, inconsistent),
BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold),
+ BTRFS_ATTR_PTR(qgroups, mode),
NULL
};
ATTRIBUTE_GROUPS(qgroups);
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index 5ef0b90e25c3..6a43a64ba55a 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -61,7 +61,11 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = 0;
- btrfs_setup_item_for_insert(root, path, &key, value_len);
+ /*
+ * Passing a NULL trans handle is fine here, we have a dummy root eb
+ * and the tree is a single node (level 0).
+ */
+ btrfs_setup_item_for_insert(NULL, root, path, &key, value_len);
write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
value_len);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 05b03f5eab83..492d69d2fa73 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -34,7 +34,11 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = start;
- btrfs_setup_item_for_insert(root, &path, &key, value_len);
+ /*
+ * Passing a NULL trans handle is fine here, we have a dummy root eb
+ * and the tree is a single node (level 0).
+ */
+ btrfs_setup_item_for_insert(NULL, root, &path, &key, value_len);
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi, 1);
btrfs_set_file_extent_type(leaf, fi, type);
@@ -64,7 +68,11 @@ static void insert_inode_item_key(struct btrfs_root *root)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- btrfs_setup_item_for_insert(root, &path, &key, value_len);
+ /*
+ * Passing a NULL trans handle is fine here, we have a dummy root eb
+ * and the tree is a single node (level 0).
+ */
+ btrfs_setup_item_for_insert(NULL, root, &path, &key, value_len);
}
/*
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c780d3729463..5b3333ceef04 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -37,8 +37,6 @@
static struct kmem_cache *btrfs_trans_handle_cachep;
-#define BTRFS_ROOT_TRANS_TAG 0
-
/*
* Transaction states and transitions
*
@@ -386,7 +384,7 @@ loop:
IO_TREE_TRANS_DIRTY_PAGES);
extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
IO_TREE_FS_PINNED_EXTENTS);
- fs_info->generation++;
+ btrfs_set_fs_generation(fs_info, fs_info->generation + 1);
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
cur_trans->aborted = 0;
@@ -561,6 +559,69 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root)
return true;
}
+static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
+ enum btrfs_reserve_flush_enum flush,
+ u64 num_bytes,
+ u64 *delayed_refs_bytes)
+{
+ struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
+ u64 extra_delayed_refs_bytes = 0;
+ u64 bytes;
+ int ret;
+
+ /*
+ * If there's a gap between the size of the delayed refs reserve and
+ * its reserved space, then some tasks have added delayed refs or bumped
+ * its size otherwise (due to block group creation or removal, or block
+ * group item update). Also try to allocate that gap in order to prevent
+ * using (and possibly abusing) the global reserve when committing the
+ * transaction.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+ !btrfs_block_rsv_full(delayed_refs_rsv)) {
+ spin_lock(&delayed_refs_rsv->lock);
+ if (delayed_refs_rsv->size > delayed_refs_rsv->reserved)
+ extra_delayed_refs_bytes = delayed_refs_rsv->size -
+ delayed_refs_rsv->reserved;
+ spin_unlock(&delayed_refs_rsv->lock);
+ }
+
+ bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes;
+
+ /*
+ * We want to reserve all the bytes we may need all at once, so we only
+ * do 1 enospc flushing cycle per transaction start.
+ */
+ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+ if (ret == 0) {
+ if (extra_delayed_refs_bytes > 0)
+ btrfs_migrate_to_delayed_refs_rsv(fs_info,
+ extra_delayed_refs_bytes);
+ return 0;
+ }
+
+ if (extra_delayed_refs_bytes > 0) {
+ bytes -= extra_delayed_refs_bytes;
+ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+ if (ret == 0)
+ return 0;
+ }
+
+ /*
+ * If we are an emergency flush, which can steal from the global block
+ * reserve, then attempt to not reserve space for the delayed refs, as
+ * we will consume space for them from the global block reserve.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+ bytes -= *delayed_refs_bytes;
+ *delayed_refs_bytes = 0;
+ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+ }
+
+ return ret;
+}
+
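btrfs_reserve_trans_metadata() above tries to reserve the transaction bytes, the delayed-ref bytes and any gap in the delayed refs reserve in one flushing cycle, then progressively drops the optional parts when the reservation fails. The sketch below models only that fallback ladder; the reserve() stub, the pool size and the flush-mode names are simplified placeholders, not the btrfs API.

#include <stdint.h>
#include <stdio.h>

enum flush_mode { FLUSH_ALL, FLUSH_ALL_STEAL };

/* Stub: pretend the space pool only has @pool bytes left. */
static uint64_t pool = 96;

static int reserve(uint64_t bytes)
{
    if (bytes > pool)
        return -1;      /* ENOSPC stand-in */
    pool -= bytes;
    return 0;
}

/*
 * Fallback ladder modeled on the hunk above:
 *   1) ask for everything (trans bytes + delayed refs + reserve gap),
 *   2) drop the optional gap top-up and retry,
 *   3) if we may steal from the global reserve, drop the delayed-ref
 *      bytes too and let the global reserve cover them later.
 */
static int reserve_trans_metadata(enum flush_mode flush, uint64_t num_bytes,
                                  uint64_t *delayed_refs_bytes, uint64_t gap)
{
    uint64_t extra = (flush == FLUSH_ALL) ? gap : 0;

    if (reserve(num_bytes + *delayed_refs_bytes + extra) == 0)
        return 0;

    if (extra && reserve(num_bytes + *delayed_refs_bytes) == 0)
        return 0;

    if (flush == FLUSH_ALL_STEAL) {
        *delayed_refs_bytes = 0;
        return reserve(num_bytes);
    }
    return -1;
}

int main(void)
{
    uint64_t delayed = 32;

    if (reserve_trans_metadata(FLUSH_ALL, 64, &delayed, 16) == 0)
        printf("reserved, delayed refs bytes = %llu\n",
               (unsigned long long)delayed);
    return 0;
}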
static struct btrfs_trans_handle *
start_transaction(struct btrfs_root *root, unsigned int num_items,
unsigned int type, enum btrfs_reserve_flush_enum flush,
@@ -568,10 +629,12 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
u64 num_bytes = 0;
u64 qgroup_reserved = 0;
+ u64 delayed_refs_bytes = 0;
bool reloc_reserved = false;
bool do_chunk_alloc = false;
int ret;
@@ -594,9 +657,6 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
* the appropriate flushing if need be.
*/
if (num_items && root != fs_info->chunk_root) {
- struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv;
- u64 delayed_refs_bytes = 0;
-
qgroup_reserved = num_items * fs_info->nodesize;
/*
* Use prealloc for now, as there might be a currently running
@@ -608,20 +668,16 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
if (ret)
return ERR_PTR(ret);
+ num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
/*
- * We want to reserve all the bytes we may need all at once, so
- * we only do 1 enospc flushing cycle per transaction start. We
- * accomplish this by simply assuming we'll do num_items worth
- * of delayed refs updates in this trans handle, and refill that
- * amount for whatever is missing in the reserve.
+ * If we plan to insert/update/delete "num_items" from a btree,
+ * we will also generate delayed refs for extent buffers in the
+ * respective btree paths, so reserve space for the delayed refs
+ * that will be generated by the caller as it modifies btrees.
+ * Try to reserve them to avoid excessive use of the global
+ * block reserve.
*/
- num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
- if (flush == BTRFS_RESERVE_FLUSH_ALL &&
- !btrfs_block_rsv_full(delayed_refs_rsv)) {
- delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info,
- num_items);
- num_bytes += delayed_refs_bytes;
- }
+ delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, num_items);
/*
* Do the reservation for the relocation root creation
@@ -631,16 +687,14 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
reloc_reserved = true;
}
- ret = btrfs_reserve_metadata_bytes(fs_info, rsv, num_bytes, flush);
+ ret = btrfs_reserve_trans_metadata(fs_info, flush, num_bytes,
+ &delayed_refs_bytes);
if (ret)
goto reserve_fail;
- if (delayed_refs_bytes) {
- btrfs_migrate_to_delayed_refs_rsv(fs_info, delayed_refs_bytes);
- num_bytes -= delayed_refs_bytes;
- }
- btrfs_block_rsv_add_bytes(rsv, num_bytes, true);
- if (rsv->space_info->force_alloc)
+ btrfs_block_rsv_add_bytes(trans_rsv, num_bytes, true);
+
+ if (trans_rsv->space_info->force_alloc)
do_chunk_alloc = true;
} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
!btrfs_block_rsv_full(delayed_refs_rsv)) {
@@ -700,6 +754,7 @@ again:
h->type = type;
INIT_LIST_HEAD(&h->new_bgs);
+ btrfs_init_metadata_block_rsv(fs_info, &h->delayed_rsv, BTRFS_BLOCK_RSV_DELOPS);
smp_mb();
if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
@@ -712,8 +767,17 @@ again:
if (num_bytes) {
trace_btrfs_space_reservation(fs_info, "transaction",
h->transid, num_bytes, 1);
- h->block_rsv = &fs_info->trans_block_rsv;
+ h->block_rsv = trans_rsv;
h->bytes_reserved = num_bytes;
+ if (delayed_refs_bytes > 0) {
+ trace_btrfs_space_reservation(fs_info,
+ "local_delayed_refs_rsv",
+ h->transid,
+ delayed_refs_bytes, 1);
+ h->delayed_refs_bytes_reserved = delayed_refs_bytes;
+ btrfs_block_rsv_add_bytes(&h->delayed_rsv, delayed_refs_bytes, true);
+ delayed_refs_bytes = 0;
+ }
h->reloc_reserved = reloc_reserved;
}
@@ -769,8 +833,10 @@ join_fail:
kmem_cache_free(btrfs_trans_handle_cachep, h);
alloc_fail:
if (num_bytes)
- btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
- num_bytes, NULL);
+ btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL);
+ if (delayed_refs_bytes)
+ btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info,
+ delayed_refs_bytes);
reserve_fail:
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
return ERR_PTR(ret);
@@ -817,7 +883,7 @@ struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *roo
}
/*
- * btrfs_attach_transaction() - catch the running transaction
+ * Catch the running transaction.
*
* It is used when we want to commit the current the transaction, but
* don't want to start a new one.
@@ -836,7 +902,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
}
/*
- * btrfs_attach_transaction_barrier() - catch the running transaction
+ * Catch the running transaction.
*
* It is similar to the above function, the difference is this one
* will wait for all the inactive transactions until they fully
@@ -912,7 +978,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
int ret = 0;
if (transid) {
- if (transid <= fs_info->last_trans_committed)
+ if (transid <= btrfs_get_last_trans_committed(fs_info))
goto out;
/* find specified transaction */
@@ -936,7 +1002,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
* raced with btrfs_commit_transaction
*/
if (!cur_trans) {
- if (transid > fs_info->last_trans_committed)
+ if (transid > btrfs_get_last_trans_committed(fs_info))
ret = -EINVAL;
goto out;
}
@@ -991,11 +1057,14 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
if (!trans->block_rsv) {
ASSERT(!trans->bytes_reserved);
+ ASSERT(!trans->delayed_refs_bytes_reserved);
return;
}
- if (!trans->bytes_reserved)
+ if (!trans->bytes_reserved) {
+ ASSERT(!trans->delayed_refs_bytes_reserved);
return;
+ }
ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
trace_btrfs_space_reservation(fs_info, "transaction",
@@ -1003,6 +1072,16 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
btrfs_block_rsv_release(fs_info, trans->block_rsv,
trans->bytes_reserved, NULL);
trans->bytes_reserved = 0;
+
+ if (!trans->delayed_refs_bytes_reserved)
+ return;
+
+ trace_btrfs_space_reservation(fs_info, "local_delayed_refs_rsv",
+ trans->transid,
+ trans->delayed_refs_bytes_reserved, 0);
+ btrfs_block_rsv_release(fs_info, &trans->delayed_rsv,
+ trans->delayed_refs_bytes_reserved, NULL);
+ trans->delayed_refs_bytes_reserved = 0;
}
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
@@ -1334,7 +1413,7 @@ again:
}
/* Now flush any delayed refs generated by updating all of the roots */
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (ret)
return ret;
@@ -1349,7 +1428,7 @@ again:
* so we want to keep this flushing in this loop to make sure
* everything gets run.
*/
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (ret)
return ret;
}
@@ -1484,45 +1563,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
}
/*
- * defrag a given btree.
- * Every leaf in the btree is read and defragged.
- */
-int btrfs_defrag_root(struct btrfs_root *root)
-{
- struct btrfs_fs_info *info = root->fs_info;
- struct btrfs_trans_handle *trans;
- int ret;
-
- if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
- return 0;
-
- while (1) {
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
- }
-
- ret = btrfs_defrag_leaves(trans, root);
-
- btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(info);
- cond_resched();
-
- if (btrfs_fs_closing(info) || ret != -EAGAIN)
- break;
-
- if (btrfs_defrag_cancelled(info)) {
- btrfs_debug(info, "defrag_root cancelled");
- ret = -EAGAIN;
- break;
- }
- }
- clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
- return ret;
-}
-
-/*
* Do all special snapshot related qgroup dirty hack.
*
* Will do all needed qgroup inherit and dirty hack like switch commit
@@ -1539,11 +1579,10 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
int ret;
/*
- * Save some performance in the case that qgroups are not
- * enabled. If this check races with the ioctl, rescan will
- * kick in anyway.
+ * Save some performance in the case that qgroups are not enabled. If
+ * this check races with the ioctl, rescan will kick in anyway.
*/
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
/*
@@ -1567,7 +1606,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
* for now flush the delayed refs to narrow the race window where the
* qgroup counters could end up wrong.
*/
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -1582,7 +1621,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
/* Now qgroup are all updated, we can inherit it to new qgroups */
ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid,
- inherit);
+ parent->root_key.objectid, inherit);
if (ret < 0)
goto out;
@@ -1732,6 +1771,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
+ ret = btrfs_create_qgroup(trans, objectid);
+ if (ret && ret != -EEXIST) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
+
/*
* pull in the delayed directory update
* and the delayed inode item
@@ -1843,8 +1888,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* To co-operate with that hack, we do hack again.
* Or snapshot will be greatly slowed down by a subtree qgroup rescan
*/
- ret = qgroup_account_snapshot(trans, root, parent_root,
- pending->inherit, objectid);
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL)
+ ret = qgroup_account_snapshot(trans, root, parent_root,
+ pending->inherit, objectid);
+ else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ ret = btrfs_qgroup_inherit(trans, root->root_key.objectid, objectid,
+ parent_root->root_key.objectid, pending->inherit);
if (ret < 0)
goto fail;
@@ -1860,8 +1909,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
fname.disk_name.len * 2);
- parent_inode->i_mtime = inode_set_ctime_current(parent_inode);
- ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode));
+ inode_set_mtime_to_ts(parent_inode,
+ inode_set_ctime_current(parent_inode));
+ ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -2084,7 +2134,7 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_block_group *block_group, *tmp;
list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
list_del_init(&block_group->bg_list);
}
}
@@ -2403,7 +2453,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto unlock_reloc;
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (ret)
goto unlock_reloc;
@@ -2536,7 +2586,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
btrfs_clear_space_info_full(fs_info);
- fs_info->last_trans_committed = cur_trans->transid;
+ btrfs_set_last_trans_committed(fs_info, cur_trans->transid);
/*
* We needn't acquire the lock here because there is no other task
* which can change it.
@@ -2654,18 +2704,18 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
*/
void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
- unsigned int line, int errno, bool first_hit)
+ unsigned int line, int error, bool first_hit)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- WRITE_ONCE(trans->aborted, errno);
- WRITE_ONCE(trans->transaction->aborted, errno);
- if (first_hit && errno == -ENOSPC)
+ WRITE_ONCE(trans->aborted, error);
+ WRITE_ONCE(trans->transaction->aborted, error);
+ if (first_hit && error == -ENOSPC)
btrfs_dump_space_info_for_trans_abort(fs_info);
/* Wake up anybody who may be waiting on this transaction */
wake_up(&fs_info->transaction_wait);
wake_up(&fs_info->transaction_blocked_wait);
- __btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
+ __btrfs_handle_fs_error(fs_info, function, line, error, NULL);
}
int __init btrfs_transaction_init(void)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 93869cda6af9..2bf8bbdfd0b3 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -12,6 +12,9 @@
#include "ctree.h"
#include "misc.h"
+/* Radix-tree tag for roots that are part of the transaction. */
+#define BTRFS_ROOT_TRANS_TAG 0
+
enum btrfs_trans_state {
TRANS_STATE_RUNNING,
TRANS_STATE_COMMIT_PREP,
@@ -118,8 +121,10 @@ enum {
struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
+ u64 delayed_refs_bytes_reserved;
u64 chunk_bytes_reserved;
unsigned long delayed_ref_updates;
+ unsigned long delayed_ref_csum_deletions;
struct btrfs_transaction *transaction;
struct btrfs_block_rsv *block_rsv;
struct btrfs_block_rsv *orig_rsv;
@@ -139,6 +144,7 @@ struct btrfs_trans_handle {
bool in_fsync;
struct btrfs_fs_info *fs_info;
struct list_head new_bgs;
+ struct btrfs_block_rsv delayed_rsv;
};
/*
@@ -172,7 +178,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
{
spin_lock(&inode->lock);
inode->last_trans = trans->transaction->transid;
- inode->last_sub_trans = inode->root->log_transid;
+ inode->last_sub_trans = btrfs_get_root_log_transid(inode->root);
inode->last_log_commit = inode->last_sub_trans - 1;
spin_unlock(&inode->lock);
}
@@ -200,32 +206,32 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
delayed_refs->qgroup_to_skip = 0;
}
-bool __cold abort_should_print_stack(int errno);
+bool __cold abort_should_print_stack(int error);
/*
* Call btrfs_abort_transaction as early as possible when an error condition is
* detected, that way the exact stack trace is reported for some errors.
*/
-#define btrfs_abort_transaction(trans, errno) \
+#define btrfs_abort_transaction(trans, error) \
do { \
bool first = false; \
/* Report first abort since mount */ \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
first = true; \
- if (WARN(abort_should_print_stack(errno), \
+ if (WARN(abort_should_print_stack(error), \
KERN_ERR \
"BTRFS: Transaction aborted (error %d)\n", \
- (errno))) { \
+ (error))) { \
/* Stack trace printed. */ \
} else { \
btrfs_err((trans)->fs_info, \
"Transaction aborted (error %d)", \
- (errno)); \
+ (error)); \
} \
} \
__btrfs_abort_transaction((trans), __func__, \
- __LINE__, (errno), first); \
+ __LINE__, (error), first); \
} while (0)
int btrfs_end_transaction(struct btrfs_trans_handle *trans);
@@ -243,7 +249,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
void btrfs_add_dead_root(struct btrfs_root *root);
-int btrfs_defrag_root(struct btrfs_root *root);
void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info);
int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
@@ -264,7 +269,7 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
- unsigned int line, int errno, bool first_hit);
+ unsigned int line, int error, bool first_hit);
int __init btrfs_transaction_init(void);
void __cold btrfs_transaction_exit(void);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index ab08a0b01311..50fdc69fdddf 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -29,6 +29,9 @@
#include "accessors.h"
#include "file-item.h"
#include "inode-item.h"
+#include "dir-item.h"
+#include "raid-stripe-tree.h"
+#include "extent-tree.h"
/*
* Error message should follow the following format:
@@ -1274,6 +1277,8 @@ static int check_extent_item(struct extent_buffer *leaf,
unsigned long ptr; /* Current pointer inside inline refs */
unsigned long end; /* Extent item end */
const u32 item_size = btrfs_item_size(leaf, slot);
+ u8 last_type = 0;
+ u64 last_seq = U64_MAX;
u64 flags;
u64 generation;
u64 total_refs; /* Total refs in btrfs_extent_item */
@@ -1320,6 +1325,18 @@ static int check_extent_item(struct extent_buffer *leaf,
* 2.2) Ref type specific data
* Either using btrfs_extent_inline_ref::offset, or specific
* data structure.
+ *
+ * All above inline items should follow the order:
+ *
+ * - All btrfs_extent_inline_ref::type should be in an ascending
+ * order
+ *
+ * - Within the same type, the items should follow a descending
+ * order by their sequence number. The sequence number is
+ * determined by:
+ * * btrfs_extent_inline_ref::offset for all types other than
+ * EXTENT_DATA_REF
+ * * hash_extent_data_ref() for EXTENT_DATA_REF
*/
if (unlikely(item_size < sizeof(*ei))) {
extent_err(leaf, slot,
@@ -1401,6 +1418,7 @@ static int check_extent_item(struct extent_buffer *leaf,
struct btrfs_extent_inline_ref *iref;
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
+ u64 seq;
u64 dref_offset;
u64 inline_offset;
u8 inline_type;
@@ -1414,6 +1432,7 @@ static int check_extent_item(struct extent_buffer *leaf,
iref = (struct btrfs_extent_inline_ref *)ptr;
inline_type = btrfs_extent_inline_ref_type(leaf, iref);
inline_offset = btrfs_extent_inline_ref_offset(leaf, iref);
+ seq = inline_offset;
if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) {
extent_err(leaf, slot,
"inline ref item overflows extent item, ptr %lu iref size %u end %lu",
@@ -1444,6 +1463,10 @@ static int check_extent_item(struct extent_buffer *leaf,
case BTRFS_EXTENT_DATA_REF_KEY:
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
dref_offset = btrfs_extent_data_ref_offset(leaf, dref);
+ seq = hash_extent_data_ref(
+ btrfs_extent_data_ref_root(leaf, dref),
+ btrfs_extent_data_ref_objectid(leaf, dref),
+ btrfs_extent_data_ref_offset(leaf, dref));
if (unlikely(!IS_ALIGNED(dref_offset,
fs_info->sectorsize))) {
extent_err(leaf, slot,
@@ -1465,11 +1488,32 @@ static int check_extent_item(struct extent_buffer *leaf,
}
inline_refs += btrfs_shared_data_ref_count(leaf, sref);
break;
+ case BTRFS_EXTENT_OWNER_REF_KEY:
+ WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ break;
default:
extent_err(leaf, slot, "unknown inline ref type: %u",
inline_type);
return -EUCLEAN;
}
+ if (inline_type < last_type) {
+ extent_err(leaf, slot,
+ "inline ref out-of-order: has type %u, prev type %u",
+ inline_type, last_type);
+ return -EUCLEAN;
+ }
+ /* Type changed, allow the sequence to start from U64_MAX again. */
+ if (inline_type > last_type)
+ last_seq = U64_MAX;
+ if (seq > last_seq) {
+ extent_err(leaf, slot,
+"inline ref out-of-order: has type %u offset %llu seq 0x%llx, prev type %u seq 0x%llx",
+ inline_type, inline_offset, seq,
+ last_type, last_seq);
+ return -EUCLEAN;
+ }
+ last_type = inline_type;
+ last_seq = seq;
ptr += btrfs_extent_inline_ref_size(inline_type);
}
/* No padding is allowed */
@@ -1631,6 +1675,44 @@ static int check_inode_ref(struct extent_buffer *leaf,
return 0;
}
+static int check_raid_stripe_extent(const struct extent_buffer *leaf,
+ const struct btrfs_key *key, int slot)
+{
+ struct btrfs_stripe_extent *stripe_extent =
+ btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
+
+ if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) {
+ generic_err(leaf, slot,
+"invalid key objectid for raid stripe extent, have %llu expect aligned to %u",
+ key->objectid, leaf->fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+
+ if (unlikely(!btrfs_fs_incompat(leaf->fs_info, RAID_STRIPE_TREE))) {
+ generic_err(leaf, slot,
+ "RAID_STRIPE_EXTENT present but RAID_STRIPE_TREE incompat bit unset");
+ return -EUCLEAN;
+ }
+
+ switch (btrfs_stripe_extent_encoding(leaf, stripe_extent)) {
+ case BTRFS_STRIPE_RAID0:
+ case BTRFS_STRIPE_RAID1:
+ case BTRFS_STRIPE_DUP:
+ case BTRFS_STRIPE_RAID10:
+ case BTRFS_STRIPE_RAID5:
+ case BTRFS_STRIPE_RAID6:
+ case BTRFS_STRIPE_RAID1C3:
+ case BTRFS_STRIPE_RAID1C4:
+ break;
+ default:
+ generic_err(leaf, slot, "invalid raid stripe encoding %u",
+ btrfs_stripe_extent_encoding(leaf, stripe_extent));
+ return -EUCLEAN;
+ }
+
+ return 0;
+}
+
/*
* Common point to switch the item-specific validation.
*/
@@ -1685,6 +1767,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
case BTRFS_EXTENT_DATA_REF_KEY:
ret = check_extent_data_ref(leaf, key, slot);
break;
+ case BTRFS_RAID_STRIPE_KEY:
+ ret = check_raid_stripe_extent(leaf, key, slot);
+ break;
}
if (ret)
@@ -2005,7 +2090,7 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
* So we only check tree blocks which are read from disk, whose
* generation <= fs_info->last_trans_committed.
*/
- if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
+ if (btrfs_header_generation(eb) > btrfs_get_last_trans_committed(fs_info))
return 0;
/* We have @first_key, so this @eb must have at least one item */
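The ordering rule enforced by the new check in check_extent_item() can be summarized as a small standalone sketch (illustrative only, not part of the patch; inline_ref and inline_refs_ordered are hypothetical names, and the kernel's u8/u64 types and U64_MAX are assumed): inline backrefs must be sorted by ascending type and, within one type, by descending sequence number.

	/* Inline refs: ascending by type, descending by seq within a type. */
	struct inline_ref { u8 type; u64 seq; };

	static bool inline_refs_ordered(const struct inline_ref *refs, int nr)
	{
		u8 last_type = 0;
		u64 last_seq = U64_MAX;

		for (int i = 0; i < nr; i++) {
			if (refs[i].type < last_type)
				return false;		/* type went backwards */
			if (refs[i].type > last_type)
				last_seq = U64_MAX;	/* new type, restart the descending run */
			if (refs[i].seq > last_seq)
				return false;		/* seq increased within the same type */
			last_type = refs[i].type;
			last_seq = refs[i].seq;
		}
		return true;
	}

For EXTENT_DATA_REF the sequence number is hash_extent_data_ref(root, objectid, offset); for every other inline ref type it is btrfs_extent_inline_ref::offset, matching the comment added in the hunk above.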
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index cbb17b542131..7d6729d9fd2f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -347,8 +347,7 @@ static int process_one_buffer(struct btrfs_root *log,
}
if (wc->pin) {
- ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
- eb->len);
+ ret = btrfs_pin_extent_for_log_replay(wc->trans, eb);
if (ret)
return ret;
@@ -504,9 +503,9 @@ insert:
found_size = btrfs_item_size(path->nodes[0],
path->slots[0]);
if (found_size > item_size)
- btrfs_truncate_item(path, item_size, 1);
+ btrfs_truncate_item(trans, path, item_size, 1);
else if (found_size < item_size)
- btrfs_extend_item(path, item_size - found_size);
+ btrfs_extend_item(trans, path, item_size - found_size);
} else if (ret) {
return ret;
}
@@ -574,7 +573,7 @@ insert:
}
}
no_copy:
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_release_path(path);
return 0;
}
@@ -767,7 +766,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
} else if (ret == 0) {
btrfs_init_generic_ref(&ref,
BTRFS_ADD_DELAYED_REF,
- ins.objectid, ins.offset, 0);
+ ins.objectid, ins.offset, 0,
+ root->root_key.objectid);
btrfs_init_data_ref(&ref,
root->root_key.objectid,
key->objectid, offset, 0, false);
@@ -890,7 +890,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
update_inode:
btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
out:
iput(inode);
return ret;
@@ -1445,7 +1445,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret)
goto out;
}
@@ -1483,8 +1483,7 @@ out:
return ret;
}
-static int count_inode_extrefs(struct btrfs_root *root,
- struct btrfs_inode *inode, struct btrfs_path *path)
+static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path)
{
int ret = 0;
int name_len;
@@ -1498,8 +1497,8 @@ static int count_inode_extrefs(struct btrfs_root *root,
struct extent_buffer *leaf;
while (1) {
- ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
- &extref, &offset);
+ ret = btrfs_find_one_extref(inode->root, inode_objectid, offset,
+ path, &extref, &offset);
if (ret)
break;
@@ -1527,8 +1526,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
return nlink;
}
-static int count_inode_refs(struct btrfs_root *root,
- struct btrfs_inode *inode, struct btrfs_path *path)
+static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path)
{
int ret;
struct btrfs_key key;
@@ -1543,7 +1541,7 @@ static int count_inode_refs(struct btrfs_root *root,
key.offset = (u64)-1;
while (1) {
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0);
if (ret < 0)
break;
if (ret > 0) {
@@ -1595,9 +1593,9 @@ process_slot:
* will free the inode.
*/
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct inode *inode)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
int ret;
u64 nlink = 0;
@@ -1607,13 +1605,13 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- ret = count_inode_refs(root, BTRFS_I(inode), path);
+ ret = count_inode_refs(BTRFS_I(inode), path);
if (ret < 0)
goto out;
nlink = ret;
- ret = count_inode_extrefs(root, BTRFS_I(inode), path);
+ ret = count_inode_extrefs(BTRFS_I(inode), path);
if (ret < 0)
goto out;
@@ -1623,7 +1621,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (nlink != inode->i_nlink) {
set_nlink(inode, nlink);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret)
goto out;
}
@@ -1685,7 +1683,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
break;
}
- ret = fixup_inode_link_count(trans, root, inode);
+ ret = fixup_inode_link_count(trans, inode);
iput(inode);
if (ret)
break;
@@ -1732,7 +1730,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
set_nlink(inode, 1);
else
inc_nlink(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
} else if (ret == -EEXIST) {
ret = 0;
}
@@ -1939,7 +1937,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
out:
if (!ret && update_size) {
btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
- ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
+ ret = btrfs_update_inode(trans, BTRFS_I(dir));
}
kfree(name.name);
iput(dir);
@@ -2483,7 +2481,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
drop_args.bytes_found);
/* Update the inode's nbytes. */
ret = btrfs_update_inode(wc->trans,
- root, BTRFS_I(inode));
+ BTRFS_I(inode));
}
iput(inode);
if (ret)
@@ -2574,7 +2572,7 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(eb);
if (trans) {
- ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len);
+ ret = btrfs_pin_reserved_extent(trans, eb);
if (ret)
return ret;
btrfs_redirty_list_add(trans->transaction, eb);
@@ -2848,10 +2846,9 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
}
/*
- * btrfs_sync_log does sends a given tree log down to the disk and
- * updates the super blocks to record it. When this call is done,
- * you know that any inodes previously logged are safely on disk only
- * if it returns 0.
+ * Sends a given tree log down to the disk and updates the super blocks to
+ * record it. When this call is done, you know that any inodes previously
+ * logged are safely on disk only if it returns 0.
*
* Any other return value means you need to call btrfs_commit_transaction.
* Some of the edge cases for fsyncing directories that have had unlinks
@@ -2961,7 +2958,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_set_root_node(&log->root_item, log->node);
memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
- root->log_transid++;
+ btrfs_set_root_log_transid(root, root->log_transid + 1);
log->log_transid = root->log_transid;
root->log_start_pid = 0;
/*
@@ -2999,9 +2996,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
ret = update_log_root(trans, log, &new_root_item);
if (ret) {
- if (!list_empty(&root_log_ctx.list))
- list_del_init(&root_log_ctx.list);
-
+ list_del_init(&root_log_ctx.list);
blk_finish_plug(&plug);
btrfs_set_log_full_commit(trans);
if (ret != -ENOSPC)
@@ -3021,7 +3016,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
- index2 = root_log_ctx.log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
blk_finish_plug(&plug);
ret = btrfs_wait_tree_log_extents(log, mark);
@@ -3136,8 +3130,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* someone else already started it. We use <= and not < because the
* first log transaction has an ID of 0.
*/
- ASSERT(root->last_log_commit <= log_transid);
- root->last_log_commit = log_transid;
+ ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid);
+ btrfs_set_root_last_log_commit(root, log_transid);
out_wake_log_root:
mutex_lock(&log_root_tree->log_mutex);
@@ -3211,8 +3205,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
}
}
- clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
- EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
+ extent_io_tree_release(&log->dirty_log_pages);
extent_io_tree_release(&log->log_csum_range);
btrfs_put_root(log);
@@ -3530,7 +3523,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
last_offset = max(last_offset, curr_end);
}
btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_release_path(path);
return 0;
}
@@ -4138,19 +4131,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
btrfs_set_token_timespec_sec(&token, &item->atime,
- inode->i_atime.tv_sec);
+ inode_get_atime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->atime,
- inode->i_atime.tv_nsec);
+ inode_get_atime_nsec(inode));
btrfs_set_token_timespec_sec(&token, &item->mtime,
- inode->i_mtime.tv_sec);
+ inode_get_mtime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->mtime,
- inode->i_mtime.tv_nsec);
+ inode_get_mtime_nsec(inode));
btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode_get_ctime(inode).tv_sec);
+ inode_get_ctime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode_get_ctime(inode).tv_nsec);
+ inode_get_ctime_nsec(inode));
/*
* We do not need to set the nbytes field, in fact during a fast fsync
@@ -4488,7 +4481,7 @@ copy_item:
dst_index++;
}
- btrfs_mark_buffer_dirty(dst_path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]);
btrfs_release_path(dst_path);
out:
kfree(ins_data);
@@ -4693,7 +4686,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, &fi,
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(fi));
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
@@ -4921,12 +4914,12 @@ process:
set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
- spin_lock_irq(&inode->ordered_tree.lock);
+ spin_lock_irq(&inode->ordered_tree_lock);
if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
atomic_inc(&trans->transaction->pending_ordered);
}
- spin_unlock_irq(&inode->ordered_tree.lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
}
btrfs_put_ordered_extent(ordered);
}
@@ -7204,9 +7197,7 @@ again:
* each subsequent pass.
*/
if (ret == -ENOENT)
- ret = btrfs_pin_extent_for_log_replay(trans,
- log->node->start,
- log->node->len);
+ ret = btrfs_pin_extent_for_log_replay(trans, log->node);
btrfs_put_root(log);
if (!ret)
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 33606025513d..b4ac2b0cd235 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -223,7 +223,8 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
}
/*
- * ulist_del - delete one node from ulist
+ * Delete one node from ulist.
+ *
* @ulist: ulist to remove node from
* @val: value to delete
* @aux: aux to delete
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 7c7001f42b14..5be74f9e47eb 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -124,7 +124,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
* An item with that type already exists.
* Extend the item and store the new subid at the end.
*/
- btrfs_extend_item(path, sizeof(subid_le));
+ btrfs_extend_item(trans, path, sizeof(subid_le));
eb = path->nodes[0];
slot = path->slots[0];
offset = btrfs_item_ptr_offset(eb, slot);
@@ -139,7 +139,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
ret = 0;
subid_le = cpu_to_le64(subid_cpu);
write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le));
- btrfs_mark_buffer_dirty(eb);
+ btrfs_mark_buffer_dirty(trans, eb);
out:
btrfs_free_path(path);
@@ -221,7 +221,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
move_src = offset + sizeof(subid);
move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot));
memmove_extent_buffer(eb, move_dst, move_src, move_len);
- btrfs_truncate_item(path, item_size - sizeof(subid), 1);
+ btrfs_truncate_item(trans, path, item_size - sizeof(subid), 1);
out:
btrfs_free_path(path);
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 744f4f4d4c68..66e2270b0dae 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -487,7 +487,7 @@ static int rollback_verity(struct btrfs_inode *inode)
}
inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -554,7 +554,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc,
}
inode->ro_flags |= BTRFS_INODE_RO_VERITY;
btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret)
goto end_trans;
ret = del_orphan(trans, inode);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b9ef6f54635c..f627674b37db 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -35,6 +35,7 @@
#include "relocation.h"
#include "scrub.h"
#include "super.h"
+#include "raid-stripe-tree.h"
#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
BTRFS_BLOCK_GROUP_RAID10 | \
@@ -357,21 +358,19 @@ struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
}
/*
- * alloc_fs_devices - allocate struct btrfs_fs_devices
- * @fsid: if not NULL, copy the UUID to fs_devices::fsid
- * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid
+ * Allocate new btrfs_fs_devices structure identified by a fsid.
+ *
+ * @fsid: if not NULL, copy the UUID to fs_devices::fsid and to
+ * fs_devices::metadata_fsid
*
* Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
* The returned struct is not linked onto any lists and can be destroyed with
* kfree() right away.
*/
-static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
- const u8 *metadata_fsid)
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
struct btrfs_fs_devices *fs_devs;
- ASSERT(fsid || !metadata_fsid);
-
fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
if (!fs_devs)
return ERR_PTR(-ENOMEM);
@@ -385,8 +384,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
if (fsid) {
memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
- memcpy(fs_devs->metadata_uuid,
- metadata_fsid ?: fsid, BTRFS_FSID_SIZE);
+ memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
}
return fs_devs;
@@ -457,91 +455,41 @@ static noinline struct btrfs_fs_devices *find_fsid(
return NULL;
}
-/*
- * First check if the metadata_uuid is different from the fsid in the given
- * fs_devices. Then check if the given fsid is the same as the metadata_uuid
- * in the fs_devices. If it is, return true; otherwise, return false.
- */
-static inline bool check_fsid_changed(const struct btrfs_fs_devices *fs_devices,
- const u8 *fsid)
-{
- return memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
- BTRFS_FSID_SIZE) != 0 &&
- memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE) == 0;
-}
-
-static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
- struct btrfs_super_block *disk_super)
-{
-
- struct btrfs_fs_devices *fs_devices;
-
- /*
- * Handle scanned device having completed its fsid change but
- * belonging to a fs_devices that was created by first scanning
- * a device which didn't have its fsid/metadata_uuid changed
- * at all and the CHANGING_FSID_V2 flag set.
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (!fs_devices->fsid_change)
- continue;
-
- if (match_fsid_fs_devices(fs_devices, disk_super->metadata_uuid,
- fs_devices->fsid))
- return fs_devices;
- }
-
- /*
- * Handle scanned device having completed its fsid change but
- * belonging to a fs_devices that was created by a device that
- * has an outdated pair of fsid/metadata_uuid and
- * CHANGING_FSID_V2 flag set.
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (!fs_devices->fsid_change)
- continue;
-
- if (check_fsid_changed(fs_devices, disk_super->metadata_uuid))
- return fs_devices;
- }
-
- return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
-}
-
-
static int
btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
- int flush, struct block_device **bdev,
+ int flush, struct bdev_handle **bdev_handle,
struct btrfs_super_block **disk_super)
{
+ struct block_device *bdev;
int ret;
- *bdev = blkdev_get_by_path(device_path, flags, holder, NULL);
+ *bdev_handle = bdev_open_by_path(device_path, flags, holder, NULL);
- if (IS_ERR(*bdev)) {
- ret = PTR_ERR(*bdev);
+ if (IS_ERR(*bdev_handle)) {
+ ret = PTR_ERR(*bdev_handle);
goto error;
}
+ bdev = (*bdev_handle)->bdev;
if (flush)
- sync_blockdev(*bdev);
- ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
+ sync_blockdev(bdev);
+ ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE);
if (ret) {
- blkdev_put(*bdev, holder);
+ bdev_release(*bdev_handle);
goto error;
}
- invalidate_bdev(*bdev);
- *disk_super = btrfs_read_dev_super(*bdev);
+ invalidate_bdev(bdev);
+ *disk_super = btrfs_read_dev_super(bdev);
if (IS_ERR(*disk_super)) {
ret = PTR_ERR(*disk_super);
- blkdev_put(*bdev, holder);
+ bdev_release(*bdev_handle);
goto error;
}
return 0;
error:
- *bdev = NULL;
+ *bdev_handle = NULL;
return ret;
}
@@ -562,13 +510,13 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
{
struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
struct btrfs_device *device, *tmp_device;
- int ret = 0;
+ int ret;
+ bool freed = false;
lockdep_assert_held(&uuid_mutex);
- if (devt)
- ret = -ENOENT;
-
+ /* Return good status if there is no instance of devt. */
+ ret = 0;
list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
mutex_lock(&fs_devices->device_list_mutex);
@@ -579,8 +527,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
if (devt && devt != device->devt)
continue;
if (fs_devices->opened) {
- /* for an already deleted device return 0 */
- if (devt && ret != 0)
+ if (devt)
ret = -EBUSY;
break;
}
@@ -590,7 +537,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
list_del(&device->dev_list);
btrfs_free_device(device);
- ret = 0;
+ freed = true;
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -601,9 +548,81 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
}
}
+ /* If there is at least one freed device return 0. */
+ if (freed)
+ return 0;
+
return ret;
}
+static struct btrfs_fs_devices *find_fsid_by_device(
+ struct btrfs_super_block *disk_super,
+ dev_t devt, bool *same_fsid_diff_dev)
+{
+ struct btrfs_fs_devices *fsid_fs_devices;
+ struct btrfs_fs_devices *devt_fs_devices;
+ const bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
+ bool found_by_devt = false;
+
+ /* Find the fs_device by the usual method, if found use it. */
+ fsid_fs_devices = find_fsid(disk_super->fsid,
+ has_metadata_uuid ? disk_super->metadata_uuid : NULL);
+
+ /* The temp_fsid feature is supported only with single device filesystem. */
+ if (btrfs_super_num_devices(disk_super) != 1)
+ return fsid_fs_devices;
+
+ /*
+ * A seed device is an integral component of the sprout device, which
+ * functions as a multi-device filesystem, so the temp-fsid feature is
+ * not supported.
+ */
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)
+ return fsid_fs_devices;
+
+ /* Try to find a fs_devices by matching devt. */
+ list_for_each_entry(devt_fs_devices, &fs_uuids, fs_list) {
+ struct btrfs_device *device;
+
+ list_for_each_entry(device, &devt_fs_devices->devices, dev_list) {
+ if (device->devt == devt) {
+ found_by_devt = true;
+ break;
+ }
+ }
+ if (found_by_devt)
+ break;
+ }
+
+ if (found_by_devt) {
+ /* Existing device. */
+ if (fsid_fs_devices == NULL) {
+ if (devt_fs_devices->opened == 0) {
+ /* Stale device. */
+ return NULL;
+ } else {
+ /* temp_fsid is mounting a subvol. */
+ return devt_fs_devices;
+ }
+ } else {
+ /* Regular or temp_fsid device mounting a subvol. */
+ return devt_fs_devices;
+ }
+ } else {
+ /* New device. */
+ if (fsid_fs_devices == NULL) {
+ return NULL;
+ } else {
+ /* sb::fsid is already used, create a new temp_fsid. */
+ *same_fsid_diff_dev = true;
+ return NULL;
+ }
+ }
+
+ /* Not reached. */
+}
+
/*
* This is only used on mount, and we are protected from competing things
* messing with our fs_devices by the uuid_mutex, thus we do not need the
@@ -613,7 +632,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device, blk_mode_t flags,
void *holder)
{
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
struct btrfs_super_block *disk_super;
u64 devid;
int ret;
@@ -624,7 +643,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
return -EINVAL;
ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
- &bdev, &disk_super);
+ &bdev_handle, &disk_super);
if (ret)
return ret;
@@ -648,21 +667,21 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
fs_devices->seeding = true;
} else {
- if (bdev_read_only(bdev))
+ if (bdev_read_only(bdev_handle->bdev))
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
else
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
- if (!bdev_nonrot(bdev))
+ if (!bdev_nonrot(bdev_handle->bdev))
fs_devices->rotating = true;
- if (bdev_max_discard_sectors(bdev))
+ if (bdev_max_discard_sectors(bdev_handle->bdev))
fs_devices->discardable = true;
- device->bdev = bdev;
+ device->bdev_handle = bdev_handle;
+ device->bdev = bdev_handle->bdev;
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
- device->holder = holder;
fs_devices->open_devices++;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
@@ -676,7 +695,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
error_free_page:
btrfs_release_disk_super(disk_super);
- blkdev_put(bdev, holder);
+ bdev_release(bdev_handle);
return -EINVAL;
}
@@ -690,84 +709,6 @@ u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb)
}
/*
- * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
- * being created with a disk that has already completed its fsid change. Such
- * disk can belong to an fs which has its FSID changed or to one which doesn't.
- * Handle both cases here.
- */
-static struct btrfs_fs_devices *find_fsid_inprogress(
- struct btrfs_super_block *disk_super)
-{
- struct btrfs_fs_devices *fs_devices;
-
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (fs_devices->fsid_change)
- continue;
-
- if (check_fsid_changed(fs_devices, disk_super->fsid))
- return fs_devices;
- }
-
- return find_fsid(disk_super->fsid, NULL);
-}
-
-static struct btrfs_fs_devices *find_fsid_changed(
- struct btrfs_super_block *disk_super)
-{
- struct btrfs_fs_devices *fs_devices;
-
- /*
- * Handles the case where scanned device is part of an fs that had
- * multiple successful changes of FSID but currently device didn't
- * observe it. Meaning our fsid will be different than theirs. We need
- * to handle two subcases :
- * 1 - The fs still continues to have different METADATA/FSID uuids.
- * 2 - The fs is switched back to its original FSID (METADATA/FSID
- * are equal).
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- /* Changed UUIDs */
- if (check_fsid_changed(fs_devices, disk_super->metadata_uuid) &&
- memcmp(fs_devices->fsid, disk_super->fsid,
- BTRFS_FSID_SIZE) != 0)
- return fs_devices;
-
- /* Unchanged UUIDs */
- if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
- BTRFS_FSID_SIZE) == 0 &&
- memcmp(fs_devices->fsid, disk_super->metadata_uuid,
- BTRFS_FSID_SIZE) == 0)
- return fs_devices;
- }
-
- return NULL;
-}
-
-static struct btrfs_fs_devices *find_fsid_reverted_metadata(
- struct btrfs_super_block *disk_super)
-{
- struct btrfs_fs_devices *fs_devices;
-
- /*
- * Handle the case where the scanned device is part of an fs whose last
- * metadata UUID change reverted it to the original FSID. At the same
- * time fs_devices was first created by another constituent device
- * which didn't fully observe the operation. This results in an
- * btrfs_fs_devices created with metadata/fsid different AND
- * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
- * fs_devices equal to the FSID of the disk.
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (!fs_devices->fsid_change)
- continue;
-
- if (check_fsid_changed(fs_devices, disk_super->fsid))
- return fs_devices;
- }
-
- return NULL;
-}
-/*
* Add new device to list of registered devices
*
* Returns:
@@ -785,10 +726,16 @@ static noinline struct btrfs_device *device_list_add(const char *path,
u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_t path_devt;
int error;
+ bool same_fsid_diff_dev = false;
bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
- bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
- BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
+
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
+ btrfs_err(NULL,
+"device %s has incomplete metadata_uuid change, please use btrfstune to complete",
+ path);
+ return ERR_PTR(-EAGAIN);
+ }
error = lookup_bdev(path, &path_devt);
if (error) {
@@ -797,27 +744,23 @@ static noinline struct btrfs_device *device_list_add(const char *path,
return ERR_PTR(error);
}
- if (fsid_change_in_progress) {
- if (!has_metadata_uuid)
- fs_devices = find_fsid_inprogress(disk_super);
- else
- fs_devices = find_fsid_changed(disk_super);
- } else if (has_metadata_uuid) {
- fs_devices = find_fsid_with_metadata_uuid(disk_super);
- } else {
- fs_devices = find_fsid_reverted_metadata(disk_super);
- if (!fs_devices)
- fs_devices = find_fsid(disk_super->fsid, NULL);
- }
-
+ fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev);
if (!fs_devices) {
- fs_devices = alloc_fs_devices(disk_super->fsid,
- has_metadata_uuid ? disk_super->metadata_uuid : NULL);
+ fs_devices = alloc_fs_devices(disk_super->fsid);
if (IS_ERR(fs_devices))
return ERR_CAST(fs_devices);
- fs_devices->fsid_change = fsid_change_in_progress;
+ if (has_metadata_uuid)
+ memcpy(fs_devices->metadata_uuid,
+ disk_super->metadata_uuid, BTRFS_FSID_SIZE);
+
+ if (same_fsid_diff_dev) {
+ generate_random_uuid(fs_devices->fsid);
+ fs_devices->temp_fsid = true;
+ pr_info("BTRFS: device %s using temp-fsid %pU\n",
+ path, fs_devices->fsid);
+ }
mutex_lock(&fs_devices->device_list_mutex);
list_add(&fs_devices->fs_list, &fs_uuids);
@@ -832,18 +775,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
mutex_lock(&fs_devices->device_list_mutex);
device = btrfs_find_device(fs_devices, &args);
- /*
- * If this disk has been pulled into an fs devices created by
- * a device which had the CHANGING_FSID_V2 flag then replace the
- * metadata_uuid/fsid values of the fs_devices.
- */
- if (fs_devices->fsid_change &&
- found_transid > fs_devices->latest_generation) {
+ if (found_transid > fs_devices->latest_generation) {
memcpy(fs_devices->fsid, disk_super->fsid,
BTRFS_FSID_SIZE);
memcpy(fs_devices->metadata_uuid,
btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE);
- fs_devices->fsid_change = false;
}
}
@@ -997,7 +933,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
lockdep_assert_held(&uuid_mutex);
- fs_devices = alloc_fs_devices(orig->fsid, NULL);
+ fs_devices = alloc_fs_devices(orig->fsid);
if (IS_ERR(fs_devices))
return fs_devices;
@@ -1068,9 +1004,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
if (device->devid == BTRFS_DEV_REPLACE_DEVID)
continue;
- if (device->bdev) {
- blkdev_put(device->bdev, device->holder);
+ if (device->bdev_handle) {
+ bdev_release(device->bdev_handle);
device->bdev = NULL;
+ device->bdev_handle = NULL;
fs_devices->open_devices--;
}
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
@@ -1115,7 +1052,7 @@ static void btrfs_close_bdev(struct btrfs_device *device)
invalidate_bdev(device->bdev);
}
- blkdev_put(device->bdev, device->holder);
+ bdev_release(device->bdev_handle);
}
static void btrfs_close_one_device(struct btrfs_device *device)
@@ -1356,14 +1293,19 @@ int btrfs_forget_devices(dev_t devt)
/*
* Look for a btrfs signature on a device. This may be called out of the mount path
* and we are not allowed to call set_blocksize during the scan. The superblock
- * is read via pagecache
+ * is read via pagecache.
+ *
+ * With @mount_arg_dev it's a scan during mount time that will always register
+ * the device or return an error. Multi-device and seeding devices are registered
+ * in both cases.
*/
-struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
+struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
+ bool mount_arg_dev)
{
struct btrfs_super_block *disk_super;
bool new_device_added = false;
struct btrfs_device *device = NULL;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
u64 bytenr, bytenr_orig;
int ret;
@@ -1386,31 +1328,49 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
* values temporarily, as the device paths of the fsid are the only
* required information for assembling the volume.
*/
- bdev = blkdev_get_by_path(path, flags, NULL, NULL);
- if (IS_ERR(bdev))
- return ERR_CAST(bdev);
+ bdev_handle = bdev_open_by_path(path, flags, NULL, NULL);
+ if (IS_ERR(bdev_handle))
+ return ERR_CAST(bdev_handle);
bytenr_orig = btrfs_sb_offset(0);
- ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
+ ret = btrfs_sb_log_location_bdev(bdev_handle->bdev, 0, READ, &bytenr);
if (ret) {
device = ERR_PTR(ret);
goto error_bdev_put;
}
- disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
+ disk_super = btrfs_read_disk_super(bdev_handle->bdev, bytenr,
+ bytenr_orig);
if (IS_ERR(disk_super)) {
device = ERR_CAST(disk_super);
goto error_bdev_put;
}
+ if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
+ !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) {
+ dev_t devt;
+
+ ret = lookup_bdev(path, &devt);
+ if (ret)
+ btrfs_warn(NULL, "lookup bdev failed for path %s: %d",
+ path, ret);
+ else
+ btrfs_free_stale_devices(devt, NULL);
+
+ pr_debug("BTRFS: skip registering single non-seed device %s\n", path);
+ device = NULL;
+ goto free_disk_super;
+ }
+
device = device_list_add(path, disk_super, &new_device_added);
if (!IS_ERR(device) && new_device_added)
btrfs_free_stale_devices(device->devt, device);
+free_disk_super:
btrfs_release_disk_super(disk_super);
error_bdev_put:
- blkdev_put(bdev, NULL);
+ bdev_release(bdev_handle);
return device;
}
@@ -1894,7 +1854,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
ptr = btrfs_device_fsid(dev_item);
write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
ptr, BTRFS_FSID_SIZE);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
ret = 0;
out:
@@ -2087,7 +2047,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
- struct block_device **bdev, void **holder)
+ struct bdev_handle **bdev_handle)
{
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
@@ -2196,7 +2156,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
btrfs_assign_next_active_device(device, NULL);
- if (device->bdev) {
+ if (device->bdev_handle) {
cur_devices->open_devices--;
/* remove sysfs entry */
btrfs_sysfs_remove_device(device);
@@ -2212,9 +2172,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
* free the device.
*
* We cannot call btrfs_close_bdev() here because we're holding the sb
- * write lock, and blkdev_put() will pull in the ->open_mutex on the
- * block device and it's dependencies. Instead just flush the device
- * and let the caller do the final blkdev_put.
+ * write lock, and bdev_release() will pull in the ->open_mutex on
+ * the block device and its dependencies. Instead just flush the
+ * device and let the caller do the final bdev_release.
*/
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
btrfs_scratch_superblocks(fs_info, device->bdev,
@@ -2225,8 +2185,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
}
}
- *bdev = device->bdev;
- *holder = device->holder;
+ *bdev_handle = device->bdev_handle;
synchronize_rcu();
btrfs_free_device(device);
@@ -2363,7 +2322,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
const char *path)
{
struct btrfs_super_block *disk_super;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
int ret;
if (!path || !path[0])
@@ -2381,7 +2340,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
}
ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
- &bdev, &disk_super);
+ &bdev_handle, &disk_super);
if (ret) {
btrfs_put_dev_args_from_path(args);
return ret;
@@ -2394,7 +2353,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
else
memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
btrfs_release_disk_super(disk_super);
- blkdev_put(bdev, NULL);
+ bdev_release(bdev_handle);
return 0;
}
@@ -2451,7 +2410,7 @@ static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
* Private copy of the seed devices, anchored at
* fs_info->fs_devices->seed_list
*/
- seed_devices = alloc_fs_devices(NULL, NULL);
+ seed_devices = alloc_fs_devices(NULL);
if (IS_ERR(seed_devices))
return seed_devices;
@@ -2597,7 +2556,7 @@ next_slot:
if (device->fs_devices->seeding) {
btrfs_set_device_generation(leaf, dev_item,
device->generation);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
}
path->slots[0]++;
@@ -2614,7 +2573,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
struct super_block *sb = fs_info->sb;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_fs_devices *seed_devices = NULL;
@@ -2627,12 +2586,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (sb_rdonly(sb) && !fs_devices->seeding)
return -EROFS;
- bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE,
- fs_info->bdev_holder, NULL);
- if (IS_ERR(bdev))
- return PTR_ERR(bdev);
+ bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
+ fs_info->bdev_holder, NULL);
+ if (IS_ERR(bdev_handle))
+ return PTR_ERR(bdev_handle);
- if (!btrfs_check_device_zone_type(fs_info, bdev)) {
+ if (!btrfs_check_device_zone_type(fs_info, bdev_handle->bdev)) {
ret = -EINVAL;
goto error;
}
@@ -2644,11 +2603,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
locked = true;
}
- sync_blockdev(bdev);
+ sync_blockdev(bdev_handle->bdev);
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
- if (device->bdev == bdev) {
+ if (device->bdev == bdev_handle->bdev) {
ret = -EEXIST;
rcu_read_unlock();
goto error;
@@ -2664,7 +2623,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
device->fs_info = fs_info;
- device->bdev = bdev;
+ device->bdev_handle = bdev_handle;
+ device->bdev = bdev_handle->bdev;
ret = lookup_bdev(device_path, &device->devt);
if (ret)
goto error_free_device;
@@ -2685,12 +2645,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
device->io_align = fs_info->sectorsize;
device->sector_size = fs_info->sectorsize;
device->total_bytes =
- round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
+ round_down(bdev_nr_bytes(device->bdev), fs_info->sectorsize);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- device->holder = fs_info->bdev_holder;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
@@ -2726,7 +2685,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
- if (!bdev_nonrot(bdev))
+ if (!bdev_nonrot(device->bdev))
fs_devices->rotating = true;
orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
@@ -2848,7 +2807,7 @@ error_free_zone:
error_free_device:
btrfs_free_device(device);
error:
- blkdev_put(bdev, fs_info->bdev_holder);
+ bdev_release(bdev_handle);
if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
@@ -2895,7 +2854,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
btrfs_device_get_disk_total_bytes(device));
btrfs_set_device_bytes_used(leaf, dev_item,
btrfs_device_get_bytes_used(device));
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
@@ -2929,6 +2888,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
btrfs_set_super_total_bytes(super_copy,
round_down(old_total + diff, fs_info->sectorsize));
device->fs_devices->total_rw_bytes += diff;
+ atomic64_add(diff, &fs_info->free_chunk_space);
btrfs_device_set_total_bytes(device, new_size);
btrfs_device_set_disk_total_bytes(device, new_size);
@@ -3027,7 +2987,8 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
}
/*
- * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
+ * Find the mapping containing the given logical extent.
+ *
* @logical: Logical block offset in bytes.
* @length: Length of extent in bytes.
*
@@ -3045,15 +3006,16 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
read_unlock(&em_tree->lock);
if (!em) {
- btrfs_crit(fs_info, "unable to find logical %llu length %llu",
+ btrfs_crit(fs_info,
+ "unable to find chunk map for logical %llu length %llu",
logical, length);
return ERR_PTR(-EINVAL);
}
- if (em->start > logical || em->start + em->len < logical) {
+ if (em->start > logical || em->start + em->len <= logical) {
btrfs_crit(fs_info,
- "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
- logical, length, em->start, em->start + em->len);
+ "found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
+ logical, logical + length, em->start, em->start + em->len);
free_extent_map(em);
return ERR_PTR(-EINVAL);
}
@@ -3483,7 +3445,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info,
btrfs_set_balance_flags(leaf, item, bctl->flags);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
err = btrfs_commit_transaction(trans);
@@ -4838,6 +4800,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
u64 old_size = btrfs_device_get_total_bytes(device);
u64 diff;
u64 start;
+ u64 free_diff = 0;
new_size = round_down(new_size, fs_info->sectorsize);
start = new_size;
@@ -4863,7 +4826,19 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
btrfs_device_set_total_bytes(device, new_size);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
device->fs_devices->total_rw_bytes -= diff;
- atomic64_sub(diff, &fs_info->free_chunk_space);
+
+ /*
+ * The new free_chunk_space is new_size - used, so we have to
+ * subtract the delta of the old free_chunk_space which included
+ * old_size - used. If used > new_size then just subtract this
+ * entire device's free space.
+ */
+ if (device->bytes_used < new_size)
+ free_diff = (old_size - device->bytes_used) -
+ (new_size - device->bytes_used);
+ else
+ free_diff = old_size - device->bytes_used;
+ atomic64_sub(free_diff, &fs_info->free_chunk_space);
}
/*
@@ -4998,9 +4973,10 @@ done:
if (ret) {
mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_total_bytes(device, old_size);
- if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
device->fs_devices->total_rw_bytes += diff;
- atomic64_add(diff, &fs_info->free_chunk_space);
+ atomic64_add(free_diff, &fs_info->free_chunk_space);
+ }
mutex_unlock(&fs_info->chunk_mutex);
}
return ret;
@@ -5880,6 +5856,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
}
static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+ u64 logical,
u16 total_stripes)
{
struct btrfs_io_context *bioc;
@@ -5899,6 +5876,7 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
bioc->fs_info = fs_info;
bioc->replace_stripe_src = -1;
bioc->full_stripe_logical = (u64)-1;
+ bioc->logical = logical;
return bioc;
}
@@ -6203,12 +6181,20 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
return U64_MAX;
}
-static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
- u32 stripe_index, u64 stripe_offset, u32 stripe_nr)
+static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+ u64 logical, u64 *length, struct btrfs_io_stripe *dst,
+ struct map_lookup *map, u32 stripe_index,
+ u64 stripe_offset, u64 stripe_nr)
{
dst->dev = map->stripes[stripe_index].dev;
+
+ if (op == BTRFS_MAP_READ && btrfs_need_stripe_tree_update(fs_info, map->type))
+ return btrfs_get_raid_extent_offset(fs_info, logical, length,
+ map->type, stripe_index, dst);
+
dst->physical = map->stripes[stripe_index].physical +
stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
+ return 0;
}
/*
@@ -6245,16 +6231,11 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *
* For RAID6 profile, mirror > 2 means mark another
* data/P stripe error and rebuild from the remaining
* stripes..
- *
- * @need_raid_map: (Used only for integrity checker) whether the map wants
- * a full stripe map (including all data and P/Q stripes)
- * for RAID56. Should always be 1 except integrity checker.
*/
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret,
- struct btrfs_io_stripe *smap, int *mirror_num_ret,
- int need_raid_map)
+ struct btrfs_io_stripe *smap, int *mirror_num_ret)
{
struct extent_map *em;
struct map_lookup *map;
@@ -6349,8 +6330,10 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
}
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- if (need_raid_map && (op != BTRFS_MAP_READ || mirror_num > 1)) {
+ if (op != BTRFS_MAP_READ || mirror_num > 1) {
/*
+ * Needs full stripe mapping.
+ *
* Push stripe_nr back to the start of the full stripe
* For those cases needing a full stripe, @stripe_nr
* is the full stripe number.
@@ -6373,19 +6356,14 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
stripe_index = 0;
stripe_offset = 0;
} else {
- /*
- * Mirror #0 or #1 means the original data block.
- * Mirror #2 is RAID5 parity block.
- * Mirror #3 is RAID6 Q block.
- */
+ ASSERT(mirror_num <= 1);
+ /* Just grab the data stripe directly. */
stripe_index = stripe_nr % data_stripes;
stripe_nr /= data_stripes;
- if (mirror_num > 1)
- stripe_index = data_stripes + mirror_num - 2;
/* We distribute the parity blocks across stripes */
stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
- if (op == BTRFS_MAP_READ && mirror_num <= 1)
+ if (op == BTRFS_MAP_READ && mirror_num < 1)
mirror_num = 1;
}
} else {
@@ -6424,16 +6402,18 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* I/O context structure.
*/
if (smap && num_alloc_stripes == 1 &&
+ !(btrfs_need_stripe_tree_update(fs_info, map->type) &&
+ op != BTRFS_MAP_READ) &&
!((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) {
- set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr);
+ ret = set_io_stripe(fs_info, op, logical, length, smap, map,
+ stripe_index, stripe_offset, stripe_nr);
if (mirror_num_ret)
*mirror_num_ret = mirror_num;
*bioc_ret = NULL;
- ret = 0;
goto out;
}
- bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes);
+ bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes);
if (!bioc) {
ret = -ENOMEM;
goto out;
@@ -6447,7 +6427,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
*
* It's still mostly the same as other profiles, just with extra rotation.
*/
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
(op != BTRFS_MAP_READ || mirror_num > 1)) {
/*
* For RAID56 @stripe_nr is already the number of full stripes
@@ -6459,22 +6439,35 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
*/
bioc->full_stripe_logical = em->start +
btrfs_stripe_nr_to_offset(stripe_nr * data_stripes);
- for (i = 0; i < num_stripes; i++)
- set_io_stripe(&bioc->stripes[i], map,
- (i + stripe_nr) % num_stripes,
- stripe_offset, stripe_nr);
+ for (int i = 0; i < num_stripes; i++) {
+ ret = set_io_stripe(fs_info, op, logical, length,
+ &bioc->stripes[i], map,
+ (i + stripe_nr) % num_stripes,
+ stripe_offset, stripe_nr);
+ if (ret < 0)
+ break;
+ }
} else {
/*
* For all other non-RAID56 profiles, just copy the target
* stripe into the bioc.
*/
for (i = 0; i < num_stripes; i++) {
- set_io_stripe(&bioc->stripes[i], map, stripe_index,
- stripe_offset, stripe_nr);
+ ret = set_io_stripe(fs_info, op, logical, length,
+ &bioc->stripes[i], map, stripe_index,
+ stripe_offset, stripe_nr);
+ if (ret < 0)
+ break;
stripe_index++;
}
}
+ if (ret) {
+ *bioc_ret = NULL;
+ btrfs_put_bioc(bioc);
+ goto out;
+ }
+
if (op != BTRFS_MAP_READ)
max_errors = btrfs_chunk_max_errors(map);
@@ -6901,7 +6894,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
if (!btrfs_test_opt(fs_info, DEGRADED))
return ERR_PTR(-ENOENT);
- fs_devices = alloc_fs_devices(fsid, NULL);
+ fs_devices = alloc_fs_devices(fsid);
if (IS_ERR(fs_devices))
return fs_devices;
@@ -7534,7 +7527,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
btrfs_set_dev_stats_value(eb, ptr, i,
btrfs_dev_stat_read(device, i));
- btrfs_mark_buffer_dirty(eb);
+ btrfs_mark_buffer_dirty(trans, eb);
out:
btrfs_free_path(path);
@@ -8076,7 +8069,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
ASSERT(mirror_num > 0);
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
- &bioc, smap, &mirror_ret, true);
+ &bioc, smap, &mirror_ret);
if (ret < 0)
return ret;
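The volumes.c changes above convert each blkdev_get_by_path()/blkdev_put() pairing to the bdev_handle API, so the holder is carried by the handle rather than stored in btrfs_device. A minimal sketch of the resulting open/use/release pattern (illustrative only, not lifted verbatim from the patch; path and holder are placeholders):

	struct bdev_handle *handle;

	handle = bdev_open_by_path(path, BLK_OPEN_READ, holder, NULL);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* The raw block device is reached through handle->bdev. */
	sync_blockdev(handle->bdev);
	/* ... read the super block, register the device, etc. ... */

	/* Replaces blkdev_put(bdev, holder); the handle remembers the holder. */
	bdev_release(handle);
	return 0;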
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2128a032c3b7..9cc374864a79 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -90,13 +90,11 @@ struct btrfs_device {
u64 generation;
+ struct bdev_handle *bdev_handle;
struct block_device *bdev;
struct btrfs_zoned_device_info *zone_info;
- /* block device holder for blkdev_get/put */
- void *holder;
-
/*
* Device's major-minor number. Must be set even if the device is not
* opened (bdev == NULL), unless the device is missing.
@@ -290,6 +288,19 @@ struct btrfs_fs_devices {
* - Following shall be true at all times:
* - metadata_uuid == btrfs_header::fsid
* - metadata_uuid == btrfs_dev_item::fsid
+ *
+ * - Relations between fsid and metadata_uuid in sb and fs_devices:
+ * - Normal:
+ * fs_devices->fsid == fs_devices->metadata_uuid == sb->fsid
+ * sb->metadata_uuid == 0
+ *
+ * - When the BTRFS_FEATURE_INCOMPAT_METADATA_UUID flag is set:
+ * fs_devices->fsid == sb->fsid
+ * fs_devices->metadata_uuid == sb->metadata_uuid
+ *
+ * - When in-memory fs_devices->temp_fsid is true
+ * fs_devices->fsid = random
+ * fs_devices->metadata_uuid == sb->fsid
*/
u8 metadata_uuid[BTRFS_FSID_SIZE];
@@ -353,9 +364,10 @@ struct btrfs_fs_devices {
bool rotating;
/* Devices support TRIM/discard commands. */
bool discardable;
- bool fsid_change;
/* The filesystem is a seed filesystem. */
bool seeding;
+ /* The mount needs to use a randomly generated fsid. */
+ bool temp_fsid;
struct btrfs_fs_info *fs_info;
/* sysfs kobjects */
@@ -381,12 +393,12 @@ struct btrfs_fs_devices {
struct btrfs_io_stripe {
struct btrfs_device *dev;
- union {
- /* Block mapping */
- u64 physical;
- /* For the endio handler */
- struct btrfs_io_context *bioc;
- };
+ /* Block mapping. */
+ u64 physical;
+ u64 length;
+ bool is_scrub;
+ /* For the endio handler. */
+ struct btrfs_io_context *bioc;
};
struct btrfs_discard_stripe {
@@ -419,6 +431,11 @@ struct btrfs_io_context {
atomic_t error;
u16 max_errors;
+ u64 logical;
+ u64 size;
+ /* Raid stripe tree ordered entry. */
+ struct list_head rst_ordered_entry;
+
/*
* The total number of stripes, including the extra duplicated
* stripe for replace.
@@ -596,8 +613,7 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc);
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret,
- struct btrfs_io_stripe *smap, int *mirror_num_ret,
- int need_raid_map);
+ struct btrfs_io_stripe *smap, int *mirror_num_ret);
int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
struct btrfs_io_stripe *smap, u64 logical,
u32 length, int mirror_num);
@@ -611,7 +627,8 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
void btrfs_mapping_tree_free(struct extent_map_tree *tree);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
blk_mode_t flags, void *holder);
-struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags);
+struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
+ bool mount_arg_dev);
int btrfs_forget_devices(dev_t devt);
void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices);
@@ -629,7 +646,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
- struct block_device **bdev, void **holder);
+ struct bdev_handle **bdev_handle);
void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
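
A hedged sketch of what the struct btrfs_io_stripe change above allows: with the physical/bioc union removed, a stripe now carries the block mapping, a length, a scrub flag and the endio context at the same time. The helper below is hypothetical and only illustrates the field layout from the hunk.

/* Hypothetical helper; fields are those shown in the volumes.h hunk above. */
static void init_io_stripe(struct btrfs_io_stripe *stripe,
			   struct btrfs_device *dev, u64 physical, u64 length)
{
	stripe->dev = dev;
	stripe->physical = physical;	/* block mapping */
	stripe->length = length;
	stripe->is_scrub = false;
	stripe->bioc = NULL;		/* set later for the endio handler */
}
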
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 96828a13dd43..3cf236fb40a4 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -188,15 +188,15 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
if (old_data_len + name_len + sizeof(*di) == item_size) {
/* No other xattrs packed in the same leaf item. */
if (size > old_data_len)
- btrfs_extend_item(path, size - old_data_len);
+ btrfs_extend_item(trans, path, size - old_data_len);
else if (size < old_data_len)
- btrfs_truncate_item(path, data_size, 1);
+ btrfs_truncate_item(trans, path, data_size, 1);
} else {
/* There are other xattrs packed in the same item. */
ret = btrfs_delete_one_dir_name(trans, root, path, di);
if (ret)
goto out;
- btrfs_extend_item(path, data_size);
+ btrfs_extend_item(trans, path, data_size);
}
ptr = btrfs_item_ptr(leaf, slot, char);
@@ -205,7 +205,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
btrfs_set_dir_data_len(leaf, di, size);
data_ptr = ((unsigned long)(di + 1)) + name_len;
write_extent_buffer(leaf, value, data_ptr, size);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
} else {
/*
* Insert, and we had space for the xattr, so path->slots[0] is
@@ -265,7 +265,7 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name,
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret)
btrfs_abort_transaction(trans, ret);
out:
@@ -408,7 +408,7 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
if (!ret) {
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret)
btrfs_abort_transaction(trans, ret);
}
@@ -442,7 +442,7 @@ static const struct xattr_handler btrfs_btrfs_xattr_handler = {
.set = btrfs_xattr_handler_set_prop,
};
-const struct xattr_handler *btrfs_xattr_handlers[] = {
+const struct xattr_handler * const btrfs_xattr_handlers[] = {
&btrfs_security_xattr_handler,
&btrfs_trusted_xattr_handler,
&btrfs_user_xattr_handler,
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 1cd3fc0a8f17..118118ca3e1d 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -8,7 +8,7 @@
#include <linux/xattr.h>
-extern const struct xattr_handler *btrfs_xattr_handlers[];
+extern const struct xattr_handler * const btrfs_xattr_handlers[];
int btrfs_getxattr(struct inode *inode, const char *name,
void *buffer, size_t size);
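
The xattr hunks above are mostly mechanical: passing trans through to the item helpers and constifying the handler table. A minimal sketch of the resulting declaration pattern, with made-up example_* names:

/* Illustrative only: both the pointed-to handlers and the array itself are
 * const, so the whole table can live in read-only data. */
static const struct xattr_handler example_xattr_handler = {
	.prefix = "user.",
};

const struct xattr_handler * const example_xattr_handlers[] = {
	&example_xattr_handler,
	NULL,
};
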
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 09bc325d075d..188378ca19c7 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1282,21 +1282,284 @@ out:
return ret;
}
+struct zone_info {
+ u64 physical;
+ u64 capacity;
+ u64 alloc_offset;
+};
+
+static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
+ struct zone_info *info, unsigned long *active,
+ struct map_lookup *map)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_device *device = map->stripes[zone_idx].dev;
+ int dev_replace_is_ongoing = 0;
+ unsigned int nofs_flag;
+ struct blk_zone zone;
+ int ret;
+
+ info->physical = map->stripes[zone_idx].physical;
+
+ if (!device->bdev) {
+ info->alloc_offset = WP_MISSING_DEV;
+ return 0;
+ }
+
+ /* Consider a zone as active if we can allow any number of active zones. */
+ if (!device->zone_info->max_active_zones)
+ __set_bit(zone_idx, active);
+
+ if (!btrfs_dev_is_sequential(device, info->physical)) {
+ info->alloc_offset = WP_CONVENTIONAL;
+ return 0;
+ }
+
+ /* This zone will be used for allocation, so mark this zone non-empty. */
+ btrfs_dev_clear_zone_empty(device, info->physical);
+
+ down_read(&dev_replace->rwsem);
+ dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
+ btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical);
+ up_read(&dev_replace->rwsem);
+
+ /*
+ * The group is mapped to a sequential zone. Get the zone write pointer
+ * to determine the allocation offset within the zone.
+ */
+ WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));
+ nofs_flag = memalloc_nofs_save();
+ ret = btrfs_get_dev_zone(device, info->physical, &zone);
+ memalloc_nofs_restore(nofs_flag);
+ if (ret) {
+ if (ret != -EIO && ret != -EOPNOTSUPP)
+ return ret;
+ info->alloc_offset = WP_MISSING_DEV;
+ return 0;
+ }
+
+ if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ btrfs_err_in_rcu(fs_info,
+ "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
+ zone.start << SECTOR_SHIFT, rcu_str_deref(device->name),
+ device->devid);
+ return -EIO;
+ }
+
+ info->capacity = (zone.capacity << SECTOR_SHIFT);
+
+ switch (zone.cond) {
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
+ btrfs_err(fs_info,
+ "zoned: offline/readonly zone %llu on device %s (devid %llu)",
+ (info->physical >> device->zone_info->zone_size_shift),
+ rcu_str_deref(device->name), device->devid);
+ info->alloc_offset = WP_MISSING_DEV;
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ info->alloc_offset = 0;
+ break;
+ case BLK_ZONE_COND_FULL:
+ info->alloc_offset = info->capacity;
+ break;
+ default:
+ /* Partially used zone. */
+ info->alloc_offset = ((zone.wp - zone.start) << SECTOR_SHIFT);
+ __set_bit(zone_idx, active);
+ break;
+ }
+
+ return 0;
+}
+
+static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
+ struct zone_info *info,
+ unsigned long *active)
+{
+ if (info->alloc_offset == WP_MISSING_DEV) {
+ btrfs_err(bg->fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ info->physical);
+ return -EIO;
+ }
+
+ bg->alloc_offset = info->alloc_offset;
+ bg->zone_capacity = info->capacity;
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ return 0;
+}
+
+static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct zone_info *zone_info,
+ unsigned long *active)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data DUP profile needs raid-stripe-tree");
+ return -EINVAL;
+ }
+
+ if (zone_info[0].alloc_offset == WP_MISSING_DEV) {
+ btrfs_err(bg->fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ zone_info[0].physical);
+ return -EIO;
+ }
+ if (zone_info[1].alloc_offset == WP_MISSING_DEV) {
+ btrfs_err(bg->fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ zone_info[1].physical);
+ return -EIO;
+ }
+ if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) {
+ btrfs_err(bg->fs_info,
+ "zoned: write pointer offset mismatch of zones in DUP profile");
+ return -EIO;
+ }
+
+ if (test_bit(0, active) != test_bit(1, active)) {
+ if (!btrfs_zone_activate(bg))
+ return -EIO;
+ } else if (test_bit(0, active)) {
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ }
+
+ bg->alloc_offset = zone_info[0].alloc_offset;
+ bg->zone_capacity = min(zone_info[0].capacity, zone_info[1].capacity);
+ return 0;
+}
+
+static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct zone_info *zone_info,
+ unsigned long *active)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ int i;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+ zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ continue;
+
+ if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
+ !btrfs_test_opt(fs_info, DEGRADED)) {
+ btrfs_err(fs_info,
+ "zoned: write pointer offset mismatch of zones in %s profile",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EIO;
+ }
+ if (test_bit(0, active) != test_bit(i, active)) {
+ if (!btrfs_test_opt(fs_info, DEGRADED) &&
+ !btrfs_zone_activate(bg)) {
+ return -EIO;
+ }
+ } else {
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ }
+ /* In case a device is missing we have a cap of 0, so don't use it. */
+ bg->zone_capacity = min_not_zero(zone_info[0].capacity,
+ zone_info[1].capacity);
+ }
+
+ if (zone_info[0].alloc_offset != WP_MISSING_DEV)
+ bg->alloc_offset = zone_info[0].alloc_offset;
+ else
+ bg->alloc_offset = zone_info[i - 1].alloc_offset;
+
+ return 0;
+}
+
+static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct zone_info *zone_info,
+ unsigned long *active)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
+ }
+
+ for (int i = 0; i < map->num_stripes; i++) {
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+ zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ continue;
+
+ if (test_bit(0, active) != test_bit(i, active)) {
+ if (!btrfs_zone_activate(bg))
+ return -EIO;
+ } else {
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ }
+ bg->zone_capacity += zone_info[i].capacity;
+ bg->alloc_offset += zone_info[i].alloc_offset;
+ }
+
+ return 0;
+}
+
+static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct zone_info *zone_info,
+ unsigned long *active)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
+ }
+
+ for (int i = 0; i < map->num_stripes; i++) {
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+ zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ continue;
+
+ if (test_bit(0, active) != test_bit(i, active)) {
+ if (!btrfs_zone_activate(bg))
+ return -EIO;
+ } else {
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ }
+
+ if ((i % map->sub_stripes) == 0) {
+ bg->zone_capacity += zone_info[i].capacity;
+ bg->alloc_offset += zone_info[i].alloc_offset;
+ }
+ }
+
+ return 0;
+}
+
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_device *device;
u64 logical = cache->start;
u64 length = cache->length;
+ struct zone_info *zone_info = NULL;
int ret;
int i;
- unsigned int nofs_flag;
- u64 *alloc_offsets = NULL;
- u64 *caps = NULL;
- u64 *physical = NULL;
unsigned long *active = NULL;
u64 last_alloc = 0;
u32 num_sequential = 0, num_conventional = 0;
@@ -1328,20 +1591,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
- alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
- if (!alloc_offsets) {
- ret = -ENOMEM;
- goto out;
- }
-
- caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
- if (!caps) {
- ret = -ENOMEM;
- goto out;
- }
-
- physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS);
- if (!physical) {
+ zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS);
+ if (!zone_info) {
ret = -ENOMEM;
goto out;
}
@@ -1353,98 +1604,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
for (i = 0; i < map->num_stripes; i++) {
- bool is_sequential;
- struct blk_zone zone;
- struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- int dev_replace_is_ongoing = 0;
-
- device = map->stripes[i].dev;
- physical[i] = map->stripes[i].physical;
-
- if (device->bdev == NULL) {
- alloc_offsets[i] = WP_MISSING_DEV;
- continue;
- }
-
- is_sequential = btrfs_dev_is_sequential(device, physical[i]);
- if (is_sequential)
- num_sequential++;
- else
- num_conventional++;
-
- /*
- * Consider a zone as active if we can allow any number of
- * active zones.
- */
- if (!device->zone_info->max_active_zones)
- __set_bit(i, active);
-
- if (!is_sequential) {
- alloc_offsets[i] = WP_CONVENTIONAL;
- continue;
- }
-
- /*
- * This zone will be used for allocation, so mark this zone
- * non-empty.
- */
- btrfs_dev_clear_zone_empty(device, physical[i]);
-
- down_read(&dev_replace->rwsem);
- dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
- if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
- btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]);
- up_read(&dev_replace->rwsem);
-
- /*
- * The group is mapped to a sequential zone. Get the zone write
- * pointer to determine the allocation offset within the zone.
- */
- WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size));
- nofs_flag = memalloc_nofs_save();
- ret = btrfs_get_dev_zone(device, physical[i], &zone);
- memalloc_nofs_restore(nofs_flag);
- if (ret == -EIO || ret == -EOPNOTSUPP) {
- ret = 0;
- alloc_offsets[i] = WP_MISSING_DEV;
- continue;
- } else if (ret) {
- goto out;
- }
-
- if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
- btrfs_err_in_rcu(fs_info,
- "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
- zone.start << SECTOR_SHIFT,
- rcu_str_deref(device->name), device->devid);
- ret = -EIO;
+ ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map);
+ if (ret)
goto out;
- }
- caps[i] = (zone.capacity << SECTOR_SHIFT);
-
- switch (zone.cond) {
- case BLK_ZONE_COND_OFFLINE:
- case BLK_ZONE_COND_READONLY:
- btrfs_err(fs_info,
- "zoned: offline/readonly zone %llu on device %s (devid %llu)",
- physical[i] >> device->zone_info->zone_size_shift,
- rcu_str_deref(device->name), device->devid);
- alloc_offsets[i] = WP_MISSING_DEV;
- break;
- case BLK_ZONE_COND_EMPTY:
- alloc_offsets[i] = 0;
- break;
- case BLK_ZONE_COND_FULL:
- alloc_offsets[i] = caps[i];
- break;
- default:
- /* Partially used zone */
- alloc_offsets[i] =
- ((zone.wp - zone.start) << SECTOR_SHIFT);
- __set_bit(i, active);
- break;
- }
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ num_conventional++;
+ else
+ num_sequential++;
}
if (num_sequential > 0)
@@ -1468,63 +1635,24 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
case 0: /* single */
- if (alloc_offsets[0] == WP_MISSING_DEV) {
- btrfs_err(fs_info,
- "zoned: cannot recover write pointer for zone %llu",
- physical[0]);
- ret = -EIO;
- goto out;
- }
- cache->alloc_offset = alloc_offsets[0];
- cache->zone_capacity = caps[0];
- if (test_bit(0, active))
- set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
+ ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
break;
case BTRFS_BLOCK_GROUP_DUP:
- if (map->type & BTRFS_BLOCK_GROUP_DATA) {
- btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg");
- ret = -EINVAL;
- goto out;
- }
- if (alloc_offsets[0] == WP_MISSING_DEV) {
- btrfs_err(fs_info,
- "zoned: cannot recover write pointer for zone %llu",
- physical[0]);
- ret = -EIO;
- goto out;
- }
- if (alloc_offsets[1] == WP_MISSING_DEV) {
- btrfs_err(fs_info,
- "zoned: cannot recover write pointer for zone %llu",
- physical[1]);
- ret = -EIO;
- goto out;
- }
- if (alloc_offsets[0] != alloc_offsets[1]) {
- btrfs_err(fs_info,
- "zoned: write pointer offset mismatch of zones in DUP profile");
- ret = -EIO;
- goto out;
- }
- if (test_bit(0, active) != test_bit(1, active)) {
- if (!btrfs_zone_activate(cache)) {
- ret = -EIO;
- goto out;
- }
- } else {
- if (test_bit(0, active))
- set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
- &cache->runtime_flags);
- }
- cache->alloc_offset = alloc_offsets[0];
- cache->zone_capacity = min(caps[0], caps[1]);
+ ret = btrfs_load_block_group_dup(cache, map, zone_info, active);
break;
case BTRFS_BLOCK_GROUP_RAID1:
+ case BTRFS_BLOCK_GROUP_RAID1C3:
+ case BTRFS_BLOCK_GROUP_RAID1C4:
+ ret = btrfs_load_block_group_raid1(cache, map, zone_info, active);
+ break;
case BTRFS_BLOCK_GROUP_RAID0:
+ ret = btrfs_load_block_group_raid0(cache, map, zone_info, active);
+ break;
case BTRFS_BLOCK_GROUP_RAID10:
+ ret = btrfs_load_block_group_raid10(cache, map, zone_info, active);
+ break;
case BTRFS_BLOCK_GROUP_RAID5:
case BTRFS_BLOCK_GROUP_RAID6:
- /* non-single profiles are not supported yet */
default:
btrfs_err(fs_info, "zoned: profile %s not yet supported",
btrfs_bg_type_to_raid_name(map->type));
@@ -1533,13 +1661,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
out:
- if (cache->alloc_offset > fs_info->zone_size) {
- btrfs_err(fs_info,
- "zoned: invalid write pointer %llu in block group %llu",
- cache->alloc_offset, cache->start);
- ret = -EIO;
- }
-
if (cache->alloc_offset > cache->zone_capacity) {
btrfs_err(fs_info,
"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
@@ -1570,9 +1691,7 @@ out:
cache->physical_map = NULL;
}
bitmap_free(active);
- kfree(physical);
- kfree(caps);
- kfree(alloc_offsets);
+ kfree(zone_info);
free_extent_map(em);
return ret;
@@ -1609,7 +1728,7 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans,
set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
set_extent_buffer_dirty(eb);
set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1,
- EXTENT_DIRTY | EXTENT_NOWAIT, NULL);
+ EXTENT_DIRTY, NULL);
}
bool btrfs_use_zone_append(struct btrfs_bio *bbio)
@@ -1887,7 +2006,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
int i, ret;
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &mapped_length, &bioc, NULL, NULL, 1);
+ &mapped_length, &bioc, NULL, NULL);
if (ret || !bioc || mapped_length < PAGE_SIZE) {
ret = -EIO;
goto out_put_bioc;
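
To summarize the zoned refactor above: per-stripe state moves into struct zone_info, btrfs_load_zone_info() fills one entry per stripe, and each profile gets its own loader. DUP requires matching write pointers and takes the smaller capacity, the RAID1 variants take the non-zero minimum capacity, RAID0 sums capacity and write-pointer offset over all usable stripes, and RAID10 does the same for one stripe per sub-stripe group. The fragment below is a simplified restatement of the RAID0 loader from the hunk, shown only to make that aggregation rule explicit.

/* Simplified restatement of btrfs_load_block_group_raid0() above; zone
 * activation and error handling are omitted. */
for (int i = 0; i < map->num_stripes; i++) {
	if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
	    zone_info[i].alloc_offset == WP_CONVENTIONAL)
		continue;		/* missing or conventional zone: skip */
	bg->zone_capacity += zone_info[i].capacity;
	bg->alloc_offset += zone_info[i].alloc_offset;
}
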
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index e7ac4ec809a4..5511766485cd 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -145,7 +145,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
}
/*
- * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds
+ * Calculate monotonic memory bounds.
*
* It is possible based on the level configurations that a higher level
* workspace uses less memory than a lower level workspace. In order to reuse
@@ -218,7 +218,8 @@ void zstd_cleanup_workspace_manager(void)
}
/*
- * zstd_find_workspace - find workspace
+ * Find workspace for given level.
+ *
* @level: compression level
*
* This iterates over the set bits in the active_map beginning at the requested
@@ -256,7 +257,8 @@ static struct list_head *zstd_find_workspace(unsigned int level)
}
/*
- * zstd_get_workspace - zstd's get_workspace
+ * Zstd get_workspace for level.
+ *
* @level: compression level
*
* If @level is 0, then any compression level can be used. Therefore, we begin
@@ -296,7 +298,8 @@ again:
}
/*
- * zstd_put_workspace - zstd put_workspace
+ * Zstd put_workspace.
+ *
* @ws: list_head for the workspace
*
* When putting back a workspace, we only need to update the LRU if we are of
diff --git a/fs/buffer.c b/fs/buffer.c
index 12e9a71c693d..967f34b70aa8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -282,13 +282,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
} while (tmp != bh);
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
- /*
- * If all of the buffers are uptodate then we can set the page
- * uptodate.
- */
- if (folio_uptodate)
- folio_mark_uptodate(folio);
- folio_unlock(folio);
+ folio_end_read(folio, folio_uptodate);
return;
still_busy:
@@ -915,16 +909,12 @@ int remove_inode_buffers(struct inode *inode)
* which may not fail from ordinary buffer allocations.
*/
struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
- bool retry)
+ gfp_t gfp)
{
struct buffer_head *bh, *head;
- gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
long offset;
struct mem_cgroup *memcg, *old_memcg;
- if (retry)
- gfp |= __GFP_NOFAIL;
-
/* The folio lock pins the memcg */
memcg = folio_memcg(folio);
old_memcg = set_active_memcg(memcg);
@@ -967,7 +957,11 @@ EXPORT_SYMBOL_GPL(folio_alloc_buffers);
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
bool retry)
{
- return folio_alloc_buffers(page_folio(page), size, retry);
+ gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
+ if (retry)
+ gfp |= __GFP_NOFAIL;
+
+ return folio_alloc_buffers(page_folio(page), size, gfp);
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);
@@ -1043,20 +1037,11 @@ grow_dev_page(struct block_device *bdev, sector_t block,
struct buffer_head *bh;
sector_t end_block;
int ret = 0;
- gfp_t gfp_mask;
-
- gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
-
- /*
- * XXX: __getblk_slow() can not really deal with failure and
- * will endlessly loop on improvised global reclaim. Prefer
- * looping in the allocator rather than here, at least that
- * code knows what it's doing.
- */
- gfp_mask |= __GFP_NOFAIL;
folio = __filemap_get_folio(inode->i_mapping, index,
- FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp_mask);
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
bh = folio_buffers(folio);
if (bh) {
@@ -1069,7 +1054,10 @@ grow_dev_page(struct block_device *bdev, sector_t block,
goto failed;
}
- bh = folio_alloc_buffers(folio, size, true);
+ ret = -ENOMEM;
+ bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT);
+ if (!bh)
+ goto failed;
/*
* Link the folio to the buffers and initialise them. Take the
@@ -1420,33 +1408,36 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
}
EXPORT_SYMBOL(__find_get_block);
-/*
- * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
- * which corresponds to the passed block_device, block and size. The
- * returned buffer has its reference count incremented.
+/**
+ * bdev_getblk - Get a buffer_head in a block device's buffer cache.
+ * @bdev: The block device.
+ * @block: The block number.
+ * @size: The size of buffer_heads for this @bdev.
+ * @gfp: The memory allocation flags to use.
*
- * __getblk_gfp() will lock up the machine if grow_dev_page's
- * try_to_free_buffers() attempt is failing. FIXME, perhaps?
+ * Return: The buffer head, or NULL if memory could not be allocated.
*/
-struct buffer_head *
-__getblk_gfp(struct block_device *bdev, sector_t block,
- unsigned size, gfp_t gfp)
+struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp)
{
struct buffer_head *bh = __find_get_block(bdev, block, size);
- might_sleep();
- if (bh == NULL)
- bh = __getblk_slow(bdev, block, size, gfp);
- return bh;
+ might_alloc(gfp);
+ if (bh)
+ return bh;
+
+ return __getblk_slow(bdev, block, size, gfp);
}
-EXPORT_SYMBOL(__getblk_gfp);
+EXPORT_SYMBOL(bdev_getblk);
/*
* Do async read-ahead on a buffer..
*/
void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
- struct buffer_head *bh = __getblk(bdev, block, size);
+ struct buffer_head *bh = bdev_getblk(bdev, block, size,
+ GFP_NOWAIT | __GFP_MOVABLE);
+
if (likely(bh)) {
bh_readahead(bh, REQ_RAHEAD);
brelse(bh);
@@ -1470,7 +1461,17 @@ struct buffer_head *
__bread_gfp(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
- struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
+ struct buffer_head *bh;
+
+ gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
+
+ /*
+ * Prefer looping in the allocator rather than here, at least that
+ * code knows what it's doing.
+ */
+ gfp |= __GFP_NOFAIL;
+
+ bh = bdev_getblk(bdev, block, size, gfp);
if (likely(bh) && !buffer_uptodate(bh))
bh = __bread_slow(bh);
@@ -1640,12 +1641,13 @@ EXPORT_SYMBOL(block_invalidate_folio);
* block_dirty_folio() via private_lock. try_to_free_buffers
* is already excluded via the folio lock.
*/
-void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
- unsigned long b_state)
+struct buffer_head *create_empty_buffers(struct folio *folio,
+ unsigned long blocksize, unsigned long b_state)
{
struct buffer_head *bh, *head, *tail;
+ gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL;
- head = folio_alloc_buffers(folio, blocksize, true);
+ head = folio_alloc_buffers(folio, blocksize, gfp);
bh = head;
do {
bh->b_state |= b_state;
@@ -1667,13 +1669,8 @@ void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
}
folio_attach_private(folio, head);
spin_unlock(&folio->mapping->private_lock);
-}
-EXPORT_SYMBOL(folio_create_empty_buffers);
-void create_empty_buffers(struct page *page,
- unsigned long blocksize, unsigned long b_state)
-{
- folio_create_empty_buffers(page_folio(page), blocksize, b_state);
+ return head;
}
EXPORT_SYMBOL(create_empty_buffers);
@@ -1768,13 +1765,15 @@ static struct buffer_head *folio_create_buffers(struct folio *folio,
struct inode *inode,
unsigned int b_state)
{
+ struct buffer_head *bh;
+
BUG_ON(!folio_test_locked(folio));
- if (!folio_buffers(folio))
- folio_create_empty_buffers(folio,
- 1 << READ_ONCE(inode->i_blkbits),
- b_state);
- return folio_buffers(folio);
+ bh = folio_buffers(folio);
+ if (!bh)
+ bh = create_empty_buffers(folio,
+ 1 << READ_ONCE(inode->i_blkbits), b_state);
+ return bh;
}
/*
@@ -2425,12 +2424,10 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
if (!nr) {
/*
- * All buffers are uptodate - we can set the folio uptodate
- * as well. But not if get_block() returned an error.
+ * All buffers are uptodate or get_block() returned an
+ * error when trying to map them - we can finish the read.
*/
- if (!page_error)
- folio_mark_uptodate(folio);
- folio_unlock(folio);
+ folio_end_read(folio, !page_error);
return 0;
}
@@ -2676,10 +2673,8 @@ int block_truncate_page(struct address_space *mapping,
return PTR_ERR(folio);
bh = folio_buffers(folio);
- if (!bh) {
- folio_create_empty_buffers(folio, blocksize, 0);
- bh = folio_buffers(folio);
- }
+ if (!bh)
+ bh = create_empty_buffers(folio, blocksize, 0);
/* Find the buffer that contains "offset" */
offset = offset_in_folio(folio, from);
@@ -2988,13 +2983,13 @@ EXPORT_SYMBOL(try_to_free_buffers);
/*
* Buffer-head allocation
*/
-static struct kmem_cache *bh_cachep __read_mostly;
+static struct kmem_cache *bh_cachep __ro_after_init;
/*
* Once the number of bh's in the machine exceeds this level, we start
* stripping them in writeback.
*/
-static unsigned long max_buffer_heads;
+static unsigned long max_buffer_heads __ro_after_init;
int buffer_heads_over_limit;
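
A minimal caller sketch for the buffer.c changes above, assuming only what the hunks show: bdev_getblk() replaces __getblk_gfp(), takes explicit gfp flags, and may return NULL, so callers that want the old never-fail behaviour must add __GFP_NOFAIL themselves (as __bread_gfp() now does). example_getblk() is a made-up name; the GFP_NOWAIT | __GFP_MOVABLE combination mirrors the __breadahead() call site.

/* Illustrative only. */
static struct buffer_head *example_getblk(struct block_device *bdev,
					  sector_t block, unsigned size)
{
	/* Non-blocking lookup/creation; per the new kernel-doc this returns
	 * NULL if memory could not be allocated. */
	return bdev_getblk(bdev, block, size, GFP_NOWAIT | __GFP_MOVABLE);
}
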
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index c53a1d220622..1564eacc253d 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include "super.h"
+#include "mds_client.h"
static inline void ceph_set_cached_acl(struct inode *inode,
int type, struct posix_acl *acl)
@@ -31,6 +32,7 @@ static inline void ceph_set_cached_acl(struct inode *inode,
struct posix_acl *ceph_get_acl(struct inode *inode, int type, bool rcu)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int size;
unsigned int retry_cnt = 0;
const char *name;
@@ -72,8 +74,8 @@ retry:
} else if (size == -ENODATA || size == 0) {
acl = NULL;
} else {
- pr_err_ratelimited("get acl %llx.%llx failed, err=%d\n",
- ceph_vinop(inode), size);
+ pr_err_ratelimited_client(cl, "%llx.%llx failed, err=%d\n",
+ ceph_vinop(inode), size);
acl = ERR_PTR(-EIO);
}
@@ -105,7 +107,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
- ret = posix_acl_update_mode(&nop_mnt_idmap, inode,
+ ret = posix_acl_update_mode(idmap, inode,
&new_mode, &acl);
if (ret)
goto out;
@@ -140,7 +142,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
newattrs.ia_ctime = current_time(inode);
newattrs.ia_mode = new_mode;
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
- ret = __ceph_setattr(inode, &newattrs, NULL);
+ ret = __ceph_setattr(idmap, inode, &newattrs, NULL);
if (ret)
goto out_free;
}
@@ -151,7 +153,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
newattrs.ia_ctime = old_ctime;
newattrs.ia_mode = old_mode;
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
- __ceph_setattr(inode, &newattrs, NULL);
+ __ceph_setattr(idmap, inode, &newattrs, NULL);
}
goto out_free;
}
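
The ceph hunks above and below largely convert dout()/pr_*() calls to the client-aware doutc()/pr_*_client() variants and thread the mount idmap through __ceph_setattr(). A short sketch of the logging pattern, with a made-up example_log() and only helpers that appear in the hunks:

/* Illustrative only: derive the ceph_client from the inode, then log through
 * the client-aware ratelimited helper. */
static void example_log(struct inode *inode, int err)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);

	pr_err_ratelimited_client(cl, "%llx.%llx failed, err=%d\n",
				  ceph_vinop(inode), err);
}
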
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index f4863078f7fe..85be3bf18cdf 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -79,18 +79,18 @@ static inline struct ceph_snap_context *page_snap_context(struct page *page)
*/
static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- struct inode *inode;
+ struct inode *inode = mapping->host;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci;
struct ceph_snap_context *snapc;
if (folio_test_dirty(folio)) {
- dout("%p dirty_folio %p idx %lu -- already dirty\n",
- mapping->host, folio, folio->index);
+ doutc(cl, "%llx.%llx %p idx %lu -- already dirty\n",
+ ceph_vinop(inode), folio, folio->index);
VM_BUG_ON_FOLIO(!folio_test_private(folio), folio);
return false;
}
- inode = mapping->host;
ci = ceph_inode(inode);
/* dirty the head */
@@ -111,12 +111,12 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
if (ci->i_wrbuffer_ref == 0)
ihold(inode);
++ci->i_wrbuffer_ref;
- dout("%p dirty_folio %p idx %lu head %d/%d -> %d/%d "
- "snapc %p seq %lld (%d snaps)\n",
- mapping->host, folio, folio->index,
- ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
- ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
- snapc, snapc->seq, snapc->num_snaps);
+ doutc(cl, "%llx.%llx %p idx %lu head %d/%d -> %d/%d "
+ "snapc %p seq %lld (%d snaps)\n",
+ ceph_vinop(inode), folio, folio->index,
+ ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
+ ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+ snapc, snapc->seq, snapc->num_snaps);
spin_unlock(&ci->i_ceph_lock);
/*
@@ -137,23 +137,22 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
static void ceph_invalidate_folio(struct folio *folio, size_t offset,
size_t length)
{
- struct inode *inode;
- struct ceph_inode_info *ci;
+ struct inode *inode = folio->mapping->host;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_snap_context *snapc;
- inode = folio->mapping->host;
- ci = ceph_inode(inode);
if (offset != 0 || length != folio_size(folio)) {
- dout("%p invalidate_folio idx %lu partial dirty page %zu~%zu\n",
- inode, folio->index, offset, length);
+ doutc(cl, "%llx.%llx idx %lu partial dirty page %zu~%zu\n",
+ ceph_vinop(inode), folio->index, offset, length);
return;
}
WARN_ON(!folio_test_locked(folio));
if (folio_test_private(folio)) {
- dout("%p invalidate_folio idx %lu full dirty page\n",
- inode, folio->index);
+ doutc(cl, "%llx.%llx idx %lu full dirty page\n",
+ ceph_vinop(inode), folio->index);
snapc = folio_detach_private(folio);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
@@ -166,10 +165,10 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset,
static bool ceph_release_folio(struct folio *folio, gfp_t gfp)
{
struct inode *inode = folio->mapping->host;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
- dout("%llx:%llx release_folio idx %lu (%sdirty)\n",
- ceph_vinop(inode),
- folio->index, folio_test_dirty(folio) ? "" : "not ");
+ doutc(cl, "%llx.%llx idx %lu (%sdirty)\n", ceph_vinop(inode),
+ folio->index, folio_test_dirty(folio) ? "" : "not ");
if (folio_test_private(folio))
return false;
@@ -229,7 +228,7 @@ static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
{
struct inode *inode = subreq->rreq->inode;
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 objno, objoff;
u32 xlen;
@@ -244,7 +243,8 @@ static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
static void finish_netfs_read(struct ceph_osd_request *req)
{
struct inode *inode = req->r_inode;
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
struct netfs_io_subrequest *subreq = req->r_priv;
struct ceph_osd_req_op *op = &req->r_ops[0];
@@ -254,8 +254,8 @@ static void finish_netfs_read(struct ceph_osd_request *req)
ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, osd_data->length, err);
- dout("%s: result %d subreq->len=%zu i_size=%lld\n", __func__, req->r_result,
- subreq->len, i_size_read(req->r_inode));
+ doutc(cl, "result %d subreq->len=%zu i_size=%lld\n", req->r_result,
+ subreq->len, i_size_read(req->r_inode));
/* no object means success but no data */
if (err == -ENOENT)
@@ -348,7 +348,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
struct netfs_io_request *rreq = subreq->rreq;
struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_request *req = NULL;
struct ceph_vino vino = ceph_vino(inode);
struct iov_iter iter;
@@ -383,7 +384,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
goto out;
}
- dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
+ doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n",
+ ceph_vinop(inode), subreq->start, subreq->len, len);
iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
@@ -400,8 +402,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
if (err < 0) {
- dout("%s: iov_ter_get_pages_alloc returned %d\n",
- __func__, err);
+ doutc(cl, "%llx.%llx failed to allocate pages, %d\n",
+ ceph_vinop(inode), err);
goto out;
}
@@ -429,12 +431,13 @@ out:
ceph_osdc_put_request(req);
if (err)
netfs_subreq_terminated(subreq, err, false);
- dout("%s: result %d\n", __func__, err);
+ doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err);
}
static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
{
struct inode *inode = rreq->inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int got = 0, want = CEPH_CAP_FILE_CACHE;
struct ceph_netfs_request_data *priv;
int ret = 0;
@@ -466,12 +469,12 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
*/
ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got);
if (ret < 0) {
- dout("start_read %p, error getting cap\n", inode);
+ doutc(cl, "%llx.%llx, error getting cap\n", ceph_vinop(inode));
goto out;
}
if (!(got & want)) {
- dout("start_read %p, no cache cap\n", inode);
+ doutc(cl, "%llx.%llx, no cache cap\n", ceph_vinop(inode));
ret = -EACCES;
goto out;
}
@@ -563,13 +566,14 @@ get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl,
struct ceph_snap_context *page_snapc)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_snap_context *snapc = NULL;
struct ceph_cap_snap *capsnap = NULL;
spin_lock(&ci->i_ceph_lock);
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
- capsnap->context, capsnap->dirty_pages);
+ doutc(cl, " capsnap %p snapc %p has %d dirty pages\n",
+ capsnap, capsnap->context, capsnap->dirty_pages);
if (!capsnap->dirty_pages)
continue;
@@ -601,8 +605,8 @@ get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl,
}
if (!snapc && ci->i_wrbuffer_ref_head) {
snapc = ceph_get_snap_context(ci->i_head_snapc);
- dout(" head snapc %p has %d dirty pages\n",
- snapc, ci->i_wrbuffer_ref_head);
+ doutc(cl, " head snapc %p has %d dirty pages\n", snapc,
+ ci->i_wrbuffer_ref_head);
if (ctl) {
ctl->i_size = i_size_read(inode);
ctl->truncate_size = ci->i_truncate_size;
@@ -658,7 +662,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_snap_context *snapc, *oldest;
loff_t page_off = page_offset(page);
int err;
@@ -670,7 +675,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
bool caching = ceph_is_cache_enabled(inode);
struct page *bounce_page = NULL;
- dout("writepage %p idx %lu\n", page, page->index);
+ doutc(cl, "%llx.%llx page %p idx %lu\n", ceph_vinop(inode), page,
+ page->index);
if (ceph_inode_is_shutdown(inode))
return -EIO;
@@ -678,13 +684,14 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
/* verify this is a writeable snap context */
snapc = page_snap_context(page);
if (!snapc) {
- dout("writepage %p page %p not dirty?\n", inode, page);
+ doutc(cl, "%llx.%llx page %p not dirty?\n", ceph_vinop(inode),
+ page);
return 0;
}
oldest = get_oldest_context(inode, &ceph_wbc, snapc);
if (snapc->seq > oldest->seq) {
- dout("writepage %p page %p snapc %p not writeable - noop\n",
- inode, page, snapc);
+ doutc(cl, "%llx.%llx page %p snapc %p not writeable - noop\n",
+ ceph_vinop(inode), page, snapc);
/* we should only noop if called by kswapd */
WARN_ON(!(current->flags & PF_MEMALLOC));
ceph_put_snap_context(oldest);
@@ -695,8 +702,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
/* is this a partial page at end of file? */
if (page_off >= ceph_wbc.i_size) {
- dout("folio at %lu beyond eof %llu\n", folio->index,
- ceph_wbc.i_size);
+ doutc(cl, "%llx.%llx folio at %lu beyond eof %llu\n",
+ ceph_vinop(inode), folio->index, ceph_wbc.i_size);
folio_invalidate(folio, 0, folio_size(folio));
return 0;
}
@@ -705,8 +712,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
len = ceph_wbc.i_size - page_off;
wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len;
- dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n",
- inode, page, page->index, page_off, wlen, snapc, snapc->seq);
+ doutc(cl, "%llx.%llx page %p index %lu on %llu~%llu snapc %p seq %lld\n",
+ ceph_vinop(inode), page, page->index, page_off, wlen, snapc,
+ snapc->seq);
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
@@ -747,10 +755,11 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
osd_req_op_extent_osd_data_pages(req, 0,
bounce_page ? &bounce_page : &page, wlen, 0,
false, false);
- dout("writepage %llu~%llu (%llu bytes, %sencrypted)\n",
- page_off, len, wlen, IS_ENCRYPTED(inode) ? "" : "not ");
+ doutc(cl, "%llx.%llx %llu~%llu (%llu bytes, %sencrypted)\n",
+ ceph_vinop(inode), page_off, len, wlen,
+ IS_ENCRYPTED(inode) ? "" : "not ");
- req->r_mtime = inode->i_mtime;
+ req->r_mtime = inode_get_mtime(inode);
ceph_osdc_start_request(osdc, req);
err = ceph_osdc_wait_request(osdc, req);
@@ -767,19 +776,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
wbc = &tmp_wbc;
if (err == -ERESTARTSYS) {
/* killed by SIGKILL */
- dout("writepage interrupted page %p\n", page);
+ doutc(cl, "%llx.%llx interrupted page %p\n",
+ ceph_vinop(inode), page);
redirty_page_for_writepage(wbc, page);
end_page_writeback(page);
return err;
}
if (err == -EBLOCKLISTED)
fsc->blocklisted = true;
- dout("writepage setting page/mapping error %d %p\n",
- err, page);
+ doutc(cl, "%llx.%llx setting page/mapping error %d %p\n",
+ ceph_vinop(inode), err, page);
mapping_set_error(&inode->i_data, err);
wbc->pages_skipped++;
} else {
- dout("writepage cleaned page %p\n", page);
+ doutc(cl, "%llx.%llx cleaned page %p\n",
+ ceph_vinop(inode), page);
err = 0; /* vfs expects us to return 0 */
}
oldest = detach_page_private(page);
@@ -803,7 +814,7 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
ihold(inode);
if (wbc->sync_mode == WB_SYNC_NONE &&
- ceph_inode_to_client(inode)->write_congested)
+ ceph_inode_to_fs_client(inode)->write_congested)
return AOP_WRITEPAGE_ACTIVATE;
wait_on_page_fscache(page);
@@ -829,6 +840,7 @@ static void writepages_finish(struct ceph_osd_request *req)
{
struct inode *inode = req->r_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_osd_data *osd_data;
struct page *page;
int num_pages, total_pages = 0;
@@ -836,11 +848,11 @@ static void writepages_finish(struct ceph_osd_request *req)
int rc = req->r_result;
struct ceph_snap_context *snapc = req->r_snapc;
struct address_space *mapping = inode->i_mapping;
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
unsigned int len = 0;
bool remove_page;
- dout("writepages_finish %p rc %d\n", inode, rc);
+ doutc(cl, "%llx.%llx rc %d\n", ceph_vinop(inode), rc);
if (rc < 0) {
mapping_set_error(mapping, rc);
ceph_set_error_write(ci);
@@ -862,8 +874,10 @@ static void writepages_finish(struct ceph_osd_request *req)
/* clean all pages */
for (i = 0; i < req->r_num_ops; i++) {
if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) {
- pr_warn("%s incorrect op %d req %p index %d tid %llu\n",
- __func__, req->r_ops[i].op, req, i, req->r_tid);
+ pr_warn_client(cl,
+ "%llx.%llx incorrect op %d req %p index %d tid %llu\n",
+ ceph_vinop(inode), req->r_ops[i].op, req, i,
+ req->r_tid);
break;
}
@@ -890,7 +904,7 @@ static void writepages_finish(struct ceph_osd_request *req)
ceph_put_snap_context(detach_page_private(page));
end_page_writeback(page);
- dout("unlocking %p\n", page);
+ doutc(cl, "unlocking %p\n", page);
if (remove_page)
generic_error_remove_page(inode->i_mapping,
@@ -898,8 +912,9 @@ static void writepages_finish(struct ceph_osd_request *req)
unlock_page(page);
}
- dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
- inode, osd_data->length, rc >= 0 ? num_pages : 0);
+ doutc(cl, "%llx.%llx wrote %llu bytes cleaned %d pages\n",
+ ceph_vinop(inode), osd_data->length,
+ rc >= 0 ? num_pages : 0);
release_pages(osd_data->pages, num_pages);
}
@@ -926,7 +941,8 @@ static int ceph_writepages_start(struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_vino vino = ceph_vino(inode);
pgoff_t index, start_index, end = -1;
struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
@@ -944,15 +960,15 @@ static int ceph_writepages_start(struct address_space *mapping,
fsc->write_congested)
return 0;
- dout("writepages_start %p (mode=%s)\n", inode,
- wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
- (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
+ doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode),
+ wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
+ (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
if (ceph_inode_is_shutdown(inode)) {
if (ci->i_wrbuffer_ref > 0) {
- pr_warn_ratelimited(
- "writepage_start %p %lld forced umount\n",
- inode, ceph_ino(inode));
+ pr_warn_ratelimited_client(cl,
+ "%llx.%llx %lld forced umount\n",
+ ceph_vinop(inode), ceph_ino(inode));
}
mapping_set_error(mapping, -EIO);
return -EIO; /* we're in a forced umount, don't write! */
@@ -976,11 +992,11 @@ retry:
if (!snapc) {
/* hmm, why does writepages get called when there
is no dirty data? */
- dout(" no snap context with dirty data?\n");
+ doutc(cl, " no snap context with dirty data?\n");
goto out;
}
- dout(" oldest snapc is %p seq %lld (%d snaps)\n",
- snapc, snapc->seq, snapc->num_snaps);
+ doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", snapc,
+ snapc->seq, snapc->num_snaps);
should_loop = false;
if (ceph_wbc.head_snapc && snapc != last_snapc) {
@@ -990,13 +1006,13 @@ retry:
end = -1;
if (index > 0)
should_loop = true;
- dout(" cyclic, start at %lu\n", index);
+ doutc(cl, " cyclic, start at %lu\n", index);
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = true;
- dout(" not cyclic, %lu to %lu\n", index, end);
+ doutc(cl, " not cyclic, %lu to %lu\n", index, end);
}
} else if (!ceph_wbc.head_snapc) {
/* Do not respect wbc->range_{start,end}. Dirty pages
@@ -1005,7 +1021,7 @@ retry:
* associated with 'snapc' get written */
if (index > 0)
should_loop = true;
- dout(" non-head snapc, range whole\n");
+ doutc(cl, " non-head snapc, range whole\n");
}
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
@@ -1028,12 +1044,12 @@ retry:
get_more_pages:
nr_folios = filemap_get_folios_tag(mapping, &index,
end, tag, &fbatch);
- dout("pagevec_lookup_range_tag got %d\n", nr_folios);
+ doutc(cl, "pagevec_lookup_range_tag got %d\n", nr_folios);
if (!nr_folios && !locked_pages)
break;
for (i = 0; i < nr_folios && locked_pages < max_pages; i++) {
page = &fbatch.folios[i]->page;
- dout("? %p idx %lu\n", page, page->index);
+ doutc(cl, "? %p idx %lu\n", page, page->index);
if (locked_pages == 0)
lock_page(page); /* first page */
else if (!trylock_page(page))
@@ -1042,15 +1058,15 @@ get_more_pages:
/* only dirty pages, or our accounting breaks */
if (unlikely(!PageDirty(page)) ||
unlikely(page->mapping != mapping)) {
- dout("!dirty or !mapping %p\n", page);
+ doutc(cl, "!dirty or !mapping %p\n", page);
unlock_page(page);
continue;
}
/* only if matching snap context */
pgsnapc = page_snap_context(page);
if (pgsnapc != snapc) {
- dout("page snapc %p %lld != oldest %p %lld\n",
- pgsnapc, pgsnapc->seq, snapc, snapc->seq);
+ doutc(cl, "page snapc %p %lld != oldest %p %lld\n",
+ pgsnapc, pgsnapc->seq, snapc, snapc->seq);
if (!should_loop &&
!ceph_wbc.head_snapc &&
wbc->sync_mode != WB_SYNC_NONE)
@@ -1061,8 +1077,8 @@ get_more_pages:
if (page_offset(page) >= ceph_wbc.i_size) {
struct folio *folio = page_folio(page);
- dout("folio at %lu beyond eof %llu\n",
- folio->index, ceph_wbc.i_size);
+ doutc(cl, "folio at %lu beyond eof %llu\n",
+ folio->index, ceph_wbc.i_size);
if ((ceph_wbc.size_stable ||
folio_pos(folio) >= i_size_read(inode)) &&
folio_clear_dirty_for_io(folio))
@@ -1072,23 +1088,23 @@ get_more_pages:
continue;
}
if (strip_unit_end && (page->index > strip_unit_end)) {
- dout("end of strip unit %p\n", page);
+ doutc(cl, "end of strip unit %p\n", page);
unlock_page(page);
break;
}
if (PageWriteback(page) || PageFsCache(page)) {
if (wbc->sync_mode == WB_SYNC_NONE) {
- dout("%p under writeback\n", page);
+ doutc(cl, "%p under writeback\n", page);
unlock_page(page);
continue;
}
- dout("waiting on writeback %p\n", page);
+ doutc(cl, "waiting on writeback %p\n", page);
wait_on_page_writeback(page);
wait_on_page_fscache(page);
}
if (!clear_page_dirty_for_io(page)) {
- dout("%p !clear_page_dirty_for_io\n", page);
+ doutc(cl, "%p !clear_page_dirty_for_io\n", page);
unlock_page(page);
continue;
}
@@ -1143,8 +1159,8 @@ get_more_pages:
}
/* note position of first page in fbatch */
- dout("%p will write page %p idx %lu\n",
- inode, page, page->index);
+ doutc(cl, "%llx.%llx will write page %p idx %lu\n",
+ ceph_vinop(inode), page, page->index);
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(
@@ -1158,8 +1174,9 @@ get_more_pages:
locked_pages ? GFP_NOWAIT : GFP_NOFS);
if (IS_ERR(pages[locked_pages])) {
if (PTR_ERR(pages[locked_pages]) == -EINVAL)
- pr_err("%s: inode->i_blkbits=%hhu\n",
- __func__, inode->i_blkbits);
+ pr_err_client(cl,
+ "inode->i_blkbits=%hhu\n",
+ inode->i_blkbits);
/* better not fail on first page! */
BUG_ON(locked_pages == 0);
pages[locked_pages] = NULL;
@@ -1193,7 +1210,7 @@ get_more_pages:
if (nr_folios && i == nr_folios &&
locked_pages < max_pages) {
- dout("reached end fbatch, trying for more\n");
+ doutc(cl, "reached end fbatch, trying for more\n");
folio_batch_release(&fbatch);
goto get_more_pages;
}
@@ -1254,8 +1271,8 @@ new_request:
/* Start a new extent */
osd_req_op_extent_dup_last(req, op_idx,
cur_offset - offset);
- dout("writepages got pages at %llu~%llu\n",
- offset, len);
+ doutc(cl, "got pages at %llu~%llu\n", offset,
+ len);
osd_req_op_extent_osd_data_pages(req, op_idx,
data_pages, len, 0,
from_pool, false);
@@ -1288,12 +1305,13 @@ new_request:
if (IS_ENCRYPTED(inode))
len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE);
- dout("writepages got pages at %llu~%llu\n", offset, len);
+ doutc(cl, "got pages at %llu~%llu\n", offset, len);
if (IS_ENCRYPTED(inode) &&
((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK))
- pr_warn("%s: bad encrypted write offset=%lld len=%llu\n",
- __func__, offset, len);
+ pr_warn_client(cl,
+ "bad encrypted write offset=%lld len=%llu\n",
+ offset, len);
osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
0, from_pool, false);
@@ -1327,7 +1345,7 @@ new_request:
pages = NULL;
}
- req->r_mtime = inode->i_mtime;
+ req->r_mtime = inode_get_mtime(inode);
ceph_osdc_start_request(&fsc->client->osdc, req);
req = NULL;
@@ -1345,14 +1363,14 @@ new_request:
done = true;
release_folios:
- dout("folio_batch release on %d folios (%p)\n", (int)fbatch.nr,
- fbatch.nr ? fbatch.folios[0] : NULL);
+ doutc(cl, "folio_batch release on %d folios (%p)\n",
+ (int)fbatch.nr, fbatch.nr ? fbatch.folios[0] : NULL);
folio_batch_release(&fbatch);
}
if (should_loop && !done) {
/* more to do; loop back to beginning of file */
- dout("writepages looping back to beginning of file\n");
+ doutc(cl, "looping back to beginning of file\n");
end = start_index - 1; /* OK even when start_index == 0 */
/* to write dirty pages associated with next snapc,
@@ -1390,7 +1408,8 @@ release_folios:
out:
ceph_osdc_put_request(req);
ceph_put_snap_context(last_snapc);
- dout("writepages dend - startone, rc = %d\n", rc);
+ doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode),
+ rc);
return rc;
}
@@ -1424,11 +1443,12 @@ static struct ceph_snap_context *
ceph_find_incompatible(struct page *page)
{
struct inode *inode = page->mapping->host;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
if (ceph_inode_is_shutdown(inode)) {
- dout(" page %p %llx:%llx is shutdown\n", page,
- ceph_vinop(inode));
+ doutc(cl, " %llx.%llx page %p is shutdown\n",
+ ceph_vinop(inode), page);
return ERR_PTR(-ESTALE);
}
@@ -1449,13 +1469,15 @@ ceph_find_incompatible(struct page *page)
if (snapc->seq > oldest->seq) {
/* not writeable -- return it for the caller to deal with */
ceph_put_snap_context(oldest);
- dout(" page %p snapc %p not current or oldest\n", page, snapc);
+ doutc(cl, " %llx.%llx page %p snapc %p not current or oldest\n",
+ ceph_vinop(inode), page, snapc);
return ceph_get_snap_context(snapc);
}
ceph_put_snap_context(oldest);
/* yay, writeable, do it now (without dropping page lock) */
- dout(" page %p snapc %p not current, but oldest\n", page, snapc);
+ doutc(cl, " %llx.%llx page %p snapc %p not current, but oldest\n",
+ ceph_vinop(inode), page, snapc);
if (clear_page_dirty_for_io(page)) {
int r = writepage_nounlock(page, NULL);
if (r < 0)
@@ -1524,10 +1546,11 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
{
struct folio *folio = page_folio(subpage);
struct inode *inode = file_inode(file);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
bool check_cap = false;
- dout("write_end file %p inode %p folio %p %d~%d (%d)\n", file,
- inode, folio, (int)pos, (int)copied, (int)len);
+ doutc(cl, "%llx.%llx file %p folio %p %d~%d (%d)\n", ceph_vinop(inode),
+ file, folio, (int)pos, (int)copied, (int)len);
if (!folio_test_uptodate(folio)) {
/* just return that nothing was copied on a short copy */
@@ -1587,6 +1610,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct inode *inode = file_inode(vma->vm_file);
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
int want, got, err;
@@ -1598,8 +1622,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
ceph_block_sigs(&oldset);
- dout("filemap_fault %p %llx.%llx %llu trying to get caps\n",
- inode, ceph_vinop(inode), off);
+ doutc(cl, "%llx.%llx %llu trying to get caps\n",
+ ceph_vinop(inode), off);
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
else
@@ -1610,8 +1634,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
if (err < 0)
goto out_restore;
- dout("filemap_fault %p %llu got cap refs on %s\n",
- inode, off, ceph_cap_string(got));
+ doutc(cl, "%llx.%llx %llu got cap refs on %s\n", ceph_vinop(inode),
+ off, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
!ceph_has_inline_data(ci)) {
@@ -1619,8 +1643,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
ceph_add_rw_context(fi, &rw_ctx);
ret = filemap_fault(vmf);
ceph_del_rw_context(fi, &rw_ctx);
- dout("filemap_fault %p %llu drop cap refs %s ret %x\n",
- inode, off, ceph_cap_string(got), ret);
+ doutc(cl, "%llx.%llx %llu drop cap refs %s ret %x\n",
+ ceph_vinop(inode), off, ceph_cap_string(got), ret);
} else
err = -EAGAIN;
@@ -1661,8 +1685,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
out_inline:
filemap_invalidate_unlock_shared(mapping);
- dout("filemap_fault %p %llu read inline data ret %x\n",
- inode, off, ret);
+ doutc(cl, "%llx.%llx %llu read inline data ret %x\n",
+ ceph_vinop(inode), off, ret);
}
out_restore:
ceph_restore_sigs(&oldset);
@@ -1676,6 +1700,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct inode *inode = file_inode(vma->vm_file);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
struct ceph_cap_flush *prealloc_cf;
@@ -1702,8 +1727,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
else
len = offset_in_thp(page, size);
- dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
- inode, ceph_vinop(inode), off, len, size);
+ doutc(cl, "%llx.%llx %llu~%zd getting caps i_size %llu\n",
+ ceph_vinop(inode), off, len, size);
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
else
@@ -1714,8 +1739,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
if (err < 0)
goto out_free;
- dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
- inode, off, len, ceph_cap_string(got));
+ doutc(cl, "%llx.%llx %llu~%zd got cap refs on %s\n", ceph_vinop(inode),
+ off, len, ceph_cap_string(got));
/* Update time before taking page lock */
file_update_time(vma->vm_file);
@@ -1763,8 +1788,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
__mark_inode_dirty(inode, dirty);
}
- dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %x\n",
- inode, off, len, ceph_cap_string(got), ret);
+ doutc(cl, "%llx.%llx %llu~%zd dropping cap refs on %s ret %x\n",
+ ceph_vinop(inode), off, len, ceph_cap_string(got), ret);
ceph_put_cap_refs_async(ci, got);
out_free:
ceph_restore_sigs(&oldset);
@@ -1778,6 +1803,7 @@ out_free:
void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
char *data, size_t len)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct address_space *mapping = inode->i_mapping;
struct page *page;
@@ -1798,8 +1824,8 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
}
}
- dout("fill_inline_data %p %llx.%llx len %zu locked_page %p\n",
- inode, ceph_vinop(inode), len, locked_page);
+ doutc(cl, "%p %llx.%llx len %zu locked_page %p\n", inode,
+ ceph_vinop(inode), len, locked_page);
if (len > 0) {
void *kaddr = kmap_atomic(page);
@@ -1823,7 +1849,8 @@ int ceph_uninline_data(struct file *file)
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_request *req = NULL;
struct ceph_cap_flush *prealloc_cf = NULL;
struct folio *folio = NULL;
@@ -1836,8 +1863,8 @@ int ceph_uninline_data(struct file *file)
inline_version = ci->i_inline_version;
spin_unlock(&ci->i_ceph_lock);
- dout("uninline_data %p %llx.%llx inline_version %llu\n",
- inode, ceph_vinop(inode), inline_version);
+ doutc(cl, "%llx.%llx inline_version %llu\n", ceph_vinop(inode),
+ inline_version);
if (ceph_inode_is_shutdown(inode)) {
err = -EIO;
@@ -1875,7 +1902,7 @@ int ceph_uninline_data(struct file *file)
goto out_unlock;
}
- req->r_mtime = inode->i_mtime;
+ req->r_mtime = inode_get_mtime(inode);
ceph_osdc_start_request(&fsc->client->osdc, req);
err = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_osdc_put_request(req);
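
Independent of the logging rework, the direct inode->i_mtime reads become inode_get_mtime() (and inode->i_atime becomes inode_get_atime() in the cap message hunk further down), the accessor pair that returns the timestamp as a struct timespec64 by value instead of touching the inode fields directly. A minimal usage sketch, with the helper name here being hypothetical:

/* Usage sketch: read the timestamp through the accessor, not the field. */
static void set_request_mtime_sketch(struct ceph_osd_request *req,
				     struct inode *inode)
{
	/* r_mtime is a struct timespec64; the accessor returns one by value */
	req->r_mtime = inode_get_mtime(inode);
}
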
@@ -1917,7 +1944,7 @@ int ceph_uninline_data(struct file *file)
goto out_put_req;
}
- req->r_mtime = inode->i_mtime;
+ req->r_mtime = inode_get_mtime(inode);
ceph_osdc_start_request(&fsc->client->osdc, req);
err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1949,8 +1976,8 @@ out_unlock:
}
out:
ceph_free_cap_flush(prealloc_cf);
- dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
- inode, ceph_vinop(inode), inline_version, err);
+ doutc(cl, "%llx.%llx inline_version %llu = %d\n",
+ ceph_vinop(inode), inline_version, err);
return err;
}
@@ -1977,8 +2004,9 @@ enum {
static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
s64 pool, struct ceph_string *pool_ns)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->netfs.inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(&ci->netfs.inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_request *rd_req = NULL, *wr_req = NULL;
struct rb_node **p, *parent;
struct ceph_pool_perm *perm;
@@ -2013,10 +2041,10 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
goto out;
if (pool_ns)
- dout("__ceph_pool_perm_get pool %lld ns %.*s no perm cached\n",
- pool, (int)pool_ns->len, pool_ns->str);
+ doutc(cl, "pool %lld ns %.*s no perm cached\n", pool,
+ (int)pool_ns->len, pool_ns->str);
else
- dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool);
+ doutc(cl, "pool %lld no perm cached\n", pool);
down_write(&mdsc->pool_perm_rwsem);
p = &mdsc->pool_perm_tree.rb_node;
@@ -2092,7 +2120,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
0, false, true);
ceph_osdc_start_request(&fsc->client->osdc, rd_req);
- wr_req->r_mtime = ci->netfs.inode.i_mtime;
+ wr_req->r_mtime = inode_get_mtime(&ci->netfs.inode);
ceph_osdc_start_request(&fsc->client->osdc, wr_req);
err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
@@ -2141,15 +2169,16 @@ out:
if (!err)
err = have;
if (pool_ns)
- dout("__ceph_pool_perm_get pool %lld ns %.*s result = %d\n",
- pool, (int)pool_ns->len, pool_ns->str, err);
+ doutc(cl, "pool %lld ns %.*s result = %d\n", pool,
+ (int)pool_ns->len, pool_ns->str, err);
else
- dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err);
+ doutc(cl, "pool %lld result = %d\n", pool, err);
return err;
}
int ceph_pool_perm_check(struct inode *inode, int need)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_string *pool_ns;
s64 pool;
@@ -2168,7 +2197,7 @@ int ceph_pool_perm_check(struct inode *inode, int need)
return 0;
}
- if (ceph_test_mount_opt(ceph_inode_to_client(inode),
+ if (ceph_test_mount_opt(ceph_inode_to_fs_client(inode),
NOPOOLPERM))
return 0;
@@ -2179,13 +2208,11 @@ int ceph_pool_perm_check(struct inode *inode, int need)
check:
if (flags & CEPH_I_POOL_PERM) {
if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
- dout("ceph_pool_perm_check pool %lld no read perm\n",
- pool);
+ doutc(cl, "pool %lld no read perm\n", pool);
return -EPERM;
}
if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
- dout("ceph_pool_perm_check pool %lld no write perm\n",
- pool);
+ doutc(cl, "pool %lld no write perm\n", pool);
return -EPERM;
}
return 0;
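
The addr.c hunks above all follow the same conversion: the plain dout() debug macro becomes the client-aware doutc(), taking a struct ceph_client pointer obtained via ceph_inode_to_client(), and each message is prefixed with the inode identity via ceph_vinop(), which supplies the ino/snap pair matching the "%llx.%llx" specifiers. A rough sketch of the pattern follows; the macro body and the client fields used for the prefix are assumptions for illustration, not the exact in-tree definition.

/*
 * Assumed shape of the client-aware debug macro (illustration only; the
 * real definition lives in the ceph debug header). The point is that every
 * debug line identifies which client instance and which inode it refers to,
 * so logs from multiple CephFS mounts on one host can be told apart.
 */
#define doutc_sketch(client, fmt, ...)					\
	dout("[%pU %llu]: " fmt, &(client)->fsid,			\
	     (client)->monc.auth->global_id, ##__VA_ARGS__)

/* Typical converted call site (hypothetical helper, ceph headers assumed): */
static void dump_page_sketch(struct inode *inode, struct page *page)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);

	/* ceph_vinop() expands to the ino/snap pair for the two %llx fields */
	doutc(cl, "%llx.%llx page %p\n", ceph_vinop(inode), page);
}
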
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index de1dee46d3df..930fbd54d2c8 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -15,7 +15,7 @@
void ceph_fscache_register_inode_cookie(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
/* No caching for filesystem? */
if (!fsc->fscache)
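
The one-line cache.c change is the helper rename applied throughout the series: ceph_inode_to_fs_client() is now the accessor that yields the struct ceph_fs_client, which frees the ceph_inode_to_client() name for the lower-level struct ceph_client that doutc() and the pr_*_client() helpers take. A simplified sketch of how the two accessors are assumed to relate (not the exact in-tree definitions, hence the sketch_ names):

/* Sketch under assumptions: simplified forms of the two accessors. */
static inline struct ceph_fs_client *
sketch_inode_to_fs_client(struct inode *inode)
{
	/* the fs-specific client hangs off the superblock private data */
	return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
}

static inline struct ceph_client *
sketch_inode_to_client(struct inode *inode)
{
	/* the libceph client embedded in the fs client */
	return sketch_inode_to_fs_client(inode)->client;
}
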
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 14215ec646f7..2c0b8dc3dd0d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -186,10 +186,10 @@ static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps)
mdsc->caps_avail_count += nr_caps;
}
- dout("%s: caps %d = %d used + %d resv + %d avail\n",
- __func__,
- mdsc->caps_total_count, mdsc->caps_use_count,
- mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ doutc(mdsc->fsc->client,
+ "caps %d = %d used + %d resv + %d avail\n",
+ mdsc->caps_total_count, mdsc->caps_use_count,
+ mdsc->caps_reserve_count, mdsc->caps_avail_count);
BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
mdsc->caps_reserve_count +
mdsc->caps_avail_count);
@@ -202,6 +202,7 @@ static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps)
int ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int i, j;
struct ceph_cap *cap;
int have;
@@ -212,7 +213,7 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *s;
LIST_HEAD(newcaps);
- dout("reserve caps ctx=%p need=%d\n", ctx, need);
+ doutc(cl, "ctx=%p need=%d\n", ctx, need);
/* first reserve any caps that are already allocated */
spin_lock(&mdsc->caps_list_lock);
@@ -272,8 +273,8 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
continue;
}
- pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
- ctx, need, have + alloc);
+ pr_warn_client(cl, "ctx=%p ENOMEM need=%d got=%d\n", ctx, need,
+ have + alloc);
err = -ENOMEM;
break;
}
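
Warnings and errors in these paths get the same treatment as the debug messages: pr_warn() becomes pr_warn_client(), and later hunks use pr_err_client() and pr_warn_ratelimited_client(), so user-visible messages carry the same client tag as doutc(). The prefix shown below is an assumption, mirroring the doutc sketch earlier:

/* Assumed shape of the client-tagged printk wrapper (illustration only). */
#define pr_warn_client_sketch(client, fmt, ...)				\
	pr_warn("[%pU %llu]: " fmt, &(client)->fsid,			\
		(client)->monc.auth->global_id, ##__VA_ARGS__)
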
@@ -298,20 +299,21 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
spin_unlock(&mdsc->caps_list_lock);
- dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
- ctx, mdsc->caps_total_count, mdsc->caps_use_count,
- mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ doutc(cl, "ctx=%p %d = %d used + %d resv + %d avail\n", ctx,
+ mdsc->caps_total_count, mdsc->caps_use_count,
+ mdsc->caps_reserve_count, mdsc->caps_avail_count);
return err;
}
void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx)
{
+ struct ceph_client *cl = mdsc->fsc->client;
bool reclaim = false;
if (!ctx->count)
return;
- dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
+ doutc(cl, "ctx=%p count=%d\n", ctx, ctx->count);
spin_lock(&mdsc->caps_list_lock);
__ceph_unreserve_caps(mdsc, ctx->count);
ctx->count = 0;
@@ -328,6 +330,7 @@ void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_cap *cap = NULL;
/* temporary, until we do something about cap import/export */
@@ -359,9 +362,9 @@ struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
}
spin_lock(&mdsc->caps_list_lock);
- dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
- ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
- mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ doutc(cl, "ctx=%p (%d) %d = %d used + %d resv + %d avail\n", ctx,
+ ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
+ mdsc->caps_reserve_count, mdsc->caps_avail_count);
BUG_ON(!ctx->count);
BUG_ON(ctx->count > mdsc->caps_reserve_count);
BUG_ON(list_empty(&mdsc->caps_list));
@@ -382,10 +385,12 @@ struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
{
+ struct ceph_client *cl = mdsc->fsc->client;
+
spin_lock(&mdsc->caps_list_lock);
- dout("put_cap %p %d = %d used + %d resv + %d avail\n",
- cap, mdsc->caps_total_count, mdsc->caps_use_count,
- mdsc->caps_reserve_count, mdsc->caps_avail_count);
+ doutc(cl, "%p %d = %d used + %d resv + %d avail\n", cap,
+ mdsc->caps_total_count, mdsc->caps_use_count,
+ mdsc->caps_reserve_count, mdsc->caps_avail_count);
mdsc->caps_use_count--;
/*
* Keep some preallocated caps around (ceph_min_count), to
@@ -491,11 +496,13 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mount_options *opt = mdsc->fsc->mount_options;
+
ci->i_hold_caps_max = round_jiffies(jiffies +
opt->caps_wanted_delay_max * HZ);
- dout("__cap_set_timeouts %p %lu\n", &ci->netfs.inode,
- ci->i_hold_caps_max - jiffies);
+ doutc(mdsc->fsc->client, "%p %llx.%llx %lu\n", inode,
+ ceph_vinop(inode), ci->i_hold_caps_max - jiffies);
}
/*
@@ -509,8 +516,11 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->netfs.inode,
- ci->i_ceph_flags, ci->i_hold_caps_max);
+ struct inode *inode = &ci->netfs.inode;
+
+ doutc(mdsc->fsc->client, "%p %llx.%llx flags 0x%lx at %lu\n",
+ inode, ceph_vinop(inode), ci->i_ceph_flags,
+ ci->i_hold_caps_max);
if (!mdsc->stopping) {
spin_lock(&mdsc->cap_delay_lock);
if (!list_empty(&ci->i_cap_delay_list)) {
@@ -533,7 +543,9 @@ no_change:
static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_requeue_front %p\n", &ci->netfs.inode);
+ struct inode *inode = &ci->netfs.inode;
+
+ doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode));
spin_lock(&mdsc->cap_delay_lock);
ci->i_ceph_flags |= CEPH_I_FLUSH;
if (!list_empty(&ci->i_cap_delay_list))
@@ -550,7 +562,9 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_cancel %p\n", &ci->netfs.inode);
+ struct inode *inode = &ci->netfs.inode;
+
+ doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode));
if (list_empty(&ci->i_cap_delay_list))
return;
spin_lock(&mdsc->cap_delay_lock);
@@ -562,6 +576,9 @@ static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
unsigned issued)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
unsigned had = __ceph_caps_issued(ci, NULL);
lockdep_assert_held(&ci->i_ceph_lock);
@@ -586,7 +603,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
if (issued & CEPH_CAP_FILE_SHARED)
atomic_inc(&ci->i_shared_gen);
if (S_ISDIR(ci->netfs.inode.i_mode)) {
- dout(" marking %p NOT complete\n", &ci->netfs.inode);
+ doutc(cl, " marking %p NOT complete\n", inode);
__ceph_dir_clear_complete(ci);
}
}
@@ -635,7 +652,8 @@ void ceph_add_cap(struct inode *inode,
unsigned seq, unsigned mseq, u64 realmino, int flags,
struct ceph_cap **new_cap)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap *cap;
int mds = session->s_mds;
@@ -644,8 +662,9 @@ void ceph_add_cap(struct inode *inode,
lockdep_assert_held(&ci->i_ceph_lock);
- dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
- session->s_mds, cap_id, ceph_cap_string(issued), seq);
+ doutc(cl, "%p %llx.%llx mds%d cap %llx %s seq %d\n", inode,
+ ceph_vinop(inode), session->s_mds, cap_id,
+ ceph_cap_string(issued), seq);
gen = atomic_read(&session->s_cap_gen);
@@ -723,9 +742,9 @@ void ceph_add_cap(struct inode *inode,
actual_wanted = __ceph_caps_wanted(ci);
if ((wanted & ~actual_wanted) ||
(issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
- dout(" issued %s, mds wanted %s, actual %s, queueing\n",
- ceph_cap_string(issued), ceph_cap_string(wanted),
- ceph_cap_string(actual_wanted));
+ doutc(cl, "issued %s, mds wanted %s, actual %s, queueing\n",
+ ceph_cap_string(issued), ceph_cap_string(wanted),
+ ceph_cap_string(actual_wanted));
__cap_delay_requeue(mdsc, ci);
}
@@ -742,9 +761,9 @@ void ceph_add_cap(struct inode *inode,
WARN_ON(ci->i_auth_cap == cap);
}
- dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
- inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
- ceph_cap_string(issued|cap->issued), seq, mds);
+ doutc(cl, "inode %p %llx.%llx cap %p %s now %s seq %d mds%d\n",
+ inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
+ ceph_cap_string(issued|cap->issued), seq, mds);
cap->cap_id = cap_id;
cap->issued = issued;
cap->implemented |= issued;
@@ -766,6 +785,8 @@ void ceph_add_cap(struct inode *inode,
*/
static int __cap_is_valid(struct ceph_cap *cap)
{
+ struct inode *inode = &cap->ci->netfs.inode;
+ struct ceph_client *cl = cap->session->s_mdsc->fsc->client;
unsigned long ttl;
u32 gen;
@@ -773,9 +794,9 @@ static int __cap_is_valid(struct ceph_cap *cap)
ttl = cap->session->s_cap_ttl;
if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
- dout("__cap_is_valid %p cap %p issued %s "
- "but STALE (gen %u vs %u)\n", &cap->ci->netfs.inode,
- cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
+ doutc(cl, "%p %llx.%llx cap %p issued %s but STALE (gen %u vs %u)\n",
+ inode, ceph_vinop(inode), cap,
+ ceph_cap_string(cap->issued), cap->cap_gen, gen);
return 0;
}
@@ -789,6 +810,8 @@ static int __cap_is_valid(struct ceph_cap *cap)
*/
int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int have = ci->i_snap_caps;
struct ceph_cap *cap;
struct rb_node *p;
@@ -799,8 +822,8 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
cap = rb_entry(p, struct ceph_cap, ci_node);
if (!__cap_is_valid(cap))
continue;
- dout("__ceph_caps_issued %p cap %p issued %s\n",
- &ci->netfs.inode, cap, ceph_cap_string(cap->issued));
+ doutc(cl, "%p %llx.%llx cap %p issued %s\n", inode,
+ ceph_vinop(inode), cap, ceph_cap_string(cap->issued));
have |= cap->issued;
if (implemented)
*implemented |= cap->implemented;
@@ -843,16 +866,18 @@ int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
*/
static void __touch_cap(struct ceph_cap *cap)
{
+ struct inode *inode = &cap->ci->netfs.inode;
struct ceph_mds_session *s = cap->session;
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
spin_lock(&s->s_cap_lock);
if (!s->s_cap_iterator) {
- dout("__touch_cap %p cap %p mds%d\n", &cap->ci->netfs.inode, cap,
- s->s_mds);
+ doutc(cl, "%p %llx.%llx cap %p mds%d\n", inode,
+ ceph_vinop(inode), cap, s->s_mds);
list_move_tail(&cap->session_caps, &s->s_caps);
} else {
- dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
- &cap->ci->netfs.inode, cap, s->s_mds);
+ doutc(cl, "%p %llx.%llx cap %p mds%d NOP, iterating over caps\n",
+ inode, ceph_vinop(inode), cap, s->s_mds);
}
spin_unlock(&s->s_cap_lock);
}
@@ -864,15 +889,16 @@ static void __touch_cap(struct ceph_cap *cap)
*/
int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_cap *cap;
struct rb_node *p;
int have = ci->i_snap_caps;
if ((have & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
- " (mask %s)\n", ceph_ino(&ci->netfs.inode),
- ceph_cap_string(have),
- ceph_cap_string(mask));
+ doutc(cl, "mask %p %llx.%llx snap issued %s (mask %s)\n",
+ inode, ceph_vinop(inode), ceph_cap_string(have),
+ ceph_cap_string(mask));
return 1;
}
@@ -881,10 +907,10 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
if (!__cap_is_valid(cap))
continue;
if ((cap->issued & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
- " (mask %s)\n", ceph_ino(&ci->netfs.inode), cap,
- ceph_cap_string(cap->issued),
- ceph_cap_string(mask));
+ doutc(cl, "mask %p %llx.%llx cap %p issued %s (mask %s)\n",
+ inode, ceph_vinop(inode), cap,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(mask));
if (touch)
__touch_cap(cap);
return 1;
@@ -893,10 +919,10 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
/* does a combination of caps satisfy mask? */
have |= cap->issued;
if ((have & mask) == mask) {
- dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
- " (mask %s)\n", ceph_ino(&ci->netfs.inode),
- ceph_cap_string(cap->issued),
- ceph_cap_string(mask));
+ doutc(cl, "mask %p %llx.%llx combo issued %s (mask %s)\n",
+ inode, ceph_vinop(inode),
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(mask));
if (touch) {
struct rb_node *q;
@@ -922,7 +948,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask,
int touch)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
int r;
r = __ceph_caps_issued_mask(ci, mask, touch);
@@ -954,13 +980,14 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
{
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int ret;
spin_lock(&ci->i_ceph_lock);
ret = __ceph_caps_revoking_other(ci, NULL, mask);
spin_unlock(&ci->i_ceph_lock);
- dout("ceph_caps_revoking %p %s = %d\n", inode,
- ceph_cap_string(mask), ret);
+ doutc(cl, "%p %llx.%llx %s = %d\n", inode, ceph_vinop(inode),
+ ceph_cap_string(mask), ret);
return ret;
}
@@ -996,7 +1023,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
struct ceph_mount_options *opt =
- ceph_inode_to_client(&ci->netfs.inode)->mount_options;
+ ceph_inode_to_fs_client(&ci->netfs.inode)->mount_options;
unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
@@ -1107,21 +1134,23 @@ int ceph_is_any_caps(struct inode *inode)
void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
{
struct ceph_mds_session *session = cap->session;
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
struct ceph_inode_info *ci = cap->ci;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc;
int removed = 0;
	/* 'ci' being NULL means the remove has already occurred */
	/* 'ci' being NULL means the remove has already occurred */
if (!ci) {
- dout("%s: cap inode is NULL\n", __func__);
+ doutc(cl, "inode is NULL\n");
return;
}
lockdep_assert_held(&ci->i_ceph_lock);
- dout("__ceph_remove_cap %p from %p\n", cap, &ci->netfs.inode);
+ doutc(cl, "%p from %p %llx.%llx\n", cap, inode, ceph_vinop(inode));
- mdsc = ceph_inode_to_client(&ci->netfs.inode)->mdsc;
+ mdsc = ceph_inode_to_fs_client(&ci->netfs.inode)->mdsc;
/* remove from inode's cap rbtree, and clear auth cap */
rb_erase(&cap->ci_node, &ci->i_caps);
@@ -1132,8 +1161,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
spin_lock(&session->s_cap_lock);
if (session->s_cap_iterator == cap) {
/* not yet, we are iterating over this very cap */
- dout("__ceph_remove_cap delaying %p removal from session %p\n",
- cap, cap->session);
+ doutc(cl, "delaying %p removal from session %p\n", cap,
+ cap->session);
} else {
list_del_init(&cap->session_caps);
session->s_nr_caps--;
@@ -1178,20 +1207,21 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
}
}
-void ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
+void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
+ bool queue_release)
{
struct ceph_inode_info *ci = cap->ci;
struct ceph_fs_client *fsc;
	/* 'ci' being NULL means the remove has already occurred */
if (!ci) {
- dout("%s: cap inode is NULL\n", __func__);
+ doutc(mdsc->fsc->client, "inode is NULL\n");
return;
}
lockdep_assert_held(&ci->i_ceph_lock);
- fsc = ceph_inode_to_client(&ci->netfs.inode);
+ fsc = ceph_inode_to_fs_client(&ci->netfs.inode);
WARN_ON_ONCE(ci->i_auth_cap == cap &&
!list_empty(&ci->i_dirty_item) &&
!fsc->blocklisted &&
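
ceph_remove_cap() now takes the ceph_mds_client explicitly, so the client pointer for doutc() stays reachable even in the early-return case where cap->ci is NULL and nothing can be derived from the inode. Callers already hold the mdsc and pass it down, as the __ceph_remove_caps() hunk below shows; a small call-site sketch (the wrapper name is hypothetical):

/* Call-site sketch: derive mdsc from the inode once, then pass it down. */
static void drop_one_cap_sketch(struct inode *inode, struct ceph_cap *cap)
{
	struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;

	ceph_remove_cap(mdsc, cap, true);	/* queue_release = true */
}
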
@@ -1227,15 +1257,19 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
{
struct ceph_mds_caps *fc;
void *p;
- struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
-
- dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n",
- __func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino,
- ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
- ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
- arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
- arg->size, arg->max_size, arg->xattr_version,
- arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
+ struct ceph_mds_client *mdsc = arg->session->s_mdsc;
+ struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
+
+ doutc(mdsc->fsc->client,
+ "%s %llx %llx caps %s wanted %s dirty %s seq %u/%u"
+ " tid %llu/%llu mseq %u follows %lld size %llu/%llu"
+ " xattr_ver %llu xattr_len %d\n",
+ ceph_cap_op_name(arg->op), arg->cid, arg->ino,
+ ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
+ ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
+ arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
+ arg->size, arg->max_size, arg->xattr_version,
+ arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
msg->hdr.version = cpu_to_le16(12);
msg->hdr.tid = cpu_to_le64(arg->flush_tid);
@@ -1342,6 +1376,8 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
*/
void __ceph_remove_caps(struct ceph_inode_info *ci)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
struct rb_node *p;
/* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)
@@ -1351,7 +1387,7 @@ void __ceph_remove_caps(struct ceph_inode_info *ci)
while (p) {
struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
p = rb_next(p);
- ceph_remove_cap(cap, true);
+ ceph_remove_cap(mdsc, cap, true);
}
spin_unlock(&ci->i_ceph_lock);
}
@@ -1370,6 +1406,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
{
struct ceph_inode_info *ci = cap->ci;
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int held, revoking;
lockdep_assert_held(&ci->i_ceph_lock);
@@ -1378,10 +1415,10 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
revoking = cap->implemented & ~cap->issued;
retain &= ~revoking;
- dout("%s %p cap %p session %p %s -> %s (revoking %s)\n",
- __func__, inode, cap, cap->session,
- ceph_cap_string(held), ceph_cap_string(held & retain),
- ceph_cap_string(revoking));
+ doutc(cl, "%p %llx.%llx cap %p session %p %s -> %s (revoking %s)\n",
+ inode, ceph_vinop(inode), cap, cap->session,
+ ceph_cap_string(held), ceph_cap_string(held & retain),
+ ceph_cap_string(revoking));
BUG_ON((retain & CEPH_CAP_PIN) == 0);
ci->i_ceph_flags &= ~CEPH_I_FLUSH;
@@ -1421,8 +1458,8 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
arg->old_xattr_buf = NULL;
}
- arg->mtime = inode->i_mtime;
- arg->atime = inode->i_atime;
+ arg->mtime = inode_get_mtime(inode);
+ arg->atime = inode_get_atime(inode);
arg->ctime = inode_get_ctime(inode);
arg->btime = ci->i_btime;
arg->change_attr = inode_peek_iversion_raw(inode);
@@ -1497,13 +1534,16 @@ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
{
struct ceph_msg *msg;
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(arg), GFP_NOFS,
false);
if (!msg) {
- pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n",
- ceph_vinop(inode), ceph_cap_string(arg->dirty),
- arg->flush_tid);
+ pr_err_client(cl,
+ "error allocating cap msg: ino (%llx.%llx)"
+ " flushing %s tid %llu, requeuing cap.\n",
+ ceph_vinop(inode), ceph_cap_string(arg->dirty),
+ arg->flush_tid);
spin_lock(&ci->i_ceph_lock);
__cap_delay_requeue(arg->session->s_mdsc, ci);
spin_unlock(&ci->i_ceph_lock);
@@ -1592,11 +1632,13 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
{
struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_cap_snap *capsnap;
u64 oldest_flush_tid = 0;
u64 first_tid = 1, last_tid = 0;
- dout("__flush_snaps %p session %p\n", inode, session);
+ doutc(cl, "%p %llx.%llx session %p\n", inode, ceph_vinop(inode),
+ session);
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
/*
@@ -1611,7 +1653,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
/* only flush each capsnap once */
if (capsnap->cap_flush.tid > 0) {
- dout(" already flushed %p, skipping\n", capsnap);
+ doutc(cl, "already flushed %p, skipping\n", capsnap);
continue;
}
@@ -1643,8 +1685,8 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
int ret;
if (!(cap && cap->session == session)) {
- dout("__flush_snaps %p auth cap %p not mds%d, "
- "stop\n", inode, cap, session->s_mds);
+ doutc(cl, "%p %llx.%llx auth cap %p not mds%d, stop\n",
+ inode, ceph_vinop(inode), cap, session->s_mds);
break;
}
@@ -1665,15 +1707,17 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
refcount_inc(&capsnap->nref);
spin_unlock(&ci->i_ceph_lock);
- dout("__flush_snaps %p capsnap %p tid %llu %s\n",
- inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty));
+ doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n", inode,
+ ceph_vinop(inode), capsnap, cf->tid,
+ ceph_cap_string(capsnap->dirty));
ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
oldest_flush_tid);
if (ret < 0) {
- pr_err("__flush_snaps: error sending cap flushsnap, "
- "ino (%llx.%llx) tid %llu follows %llu\n",
- ceph_vinop(inode), cf->tid, capsnap->follows);
+ pr_err_client(cl, "error sending cap flushsnap, "
+ "ino (%llx.%llx) tid %llu follows %llu\n",
+ ceph_vinop(inode), cf->tid,
+ capsnap->follows);
}
ceph_put_cap_snap(capsnap);
@@ -1685,28 +1729,29 @@ void ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession)
{
struct inode *inode = &ci->netfs.inode;
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_mds_session *session = NULL;
bool need_put = false;
int mds;
- dout("ceph_flush_snaps %p\n", inode);
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
if (psession)
session = *psession;
retry:
spin_lock(&ci->i_ceph_lock);
if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
- dout(" no capsnap needs flush, doing nothing\n");
+ doutc(cl, " no capsnap needs flush, doing nothing\n");
goto out;
}
if (!ci->i_auth_cap) {
- dout(" no auth cap (migrating?), doing nothing\n");
+ doutc(cl, " no auth cap (migrating?), doing nothing\n");
goto out;
}
mds = ci->i_auth_cap->session->s_mds;
if (session && session->s_mds != mds) {
- dout(" oops, wrong session %p mutex\n", session);
+ doutc(cl, " oops, wrong session %p mutex\n", session);
ceph_put_mds_session(session);
session = NULL;
}
@@ -1750,23 +1795,25 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
struct ceph_cap_flush **pcf)
{
struct ceph_mds_client *mdsc =
- ceph_sb_to_client(ci->netfs.inode.i_sb)->mdsc;
+ ceph_sb_to_fs_client(ci->netfs.inode.i_sb)->mdsc;
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int was = ci->i_dirty_caps;
int dirty = 0;
lockdep_assert_held(&ci->i_ceph_lock);
if (!ci->i_auth_cap) {
- pr_warn("__mark_dirty_caps %p %llx mask %s, "
- "but no auth cap (session was closed?)\n",
- inode, ceph_ino(inode), ceph_cap_string(mask));
+ pr_warn_client(cl, "%p %llx.%llx mask %s, "
+ "but no auth cap (session was closed?)\n",
+ inode, ceph_vinop(inode),
+ ceph_cap_string(mask));
return 0;
}
- dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->netfs.inode,
- ceph_cap_string(mask), ceph_cap_string(was),
- ceph_cap_string(was | mask));
+ doutc(cl, "%p %llx.%llx %s dirty %s -> %s\n", inode,
+ ceph_vinop(inode), ceph_cap_string(mask),
+ ceph_cap_string(was), ceph_cap_string(was | mask));
ci->i_dirty_caps |= mask;
if (was == 0) {
struct ceph_mds_session *session = ci->i_auth_cap->session;
@@ -1779,8 +1826,9 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
ci->i_head_snapc = ceph_get_snap_context(
ci->i_snap_realm->cached_context);
}
- dout(" inode %p now dirty snapc %p auth cap %p\n",
- &ci->netfs.inode, ci->i_head_snapc, ci->i_auth_cap);
+ doutc(cl, "%p %llx.%llx now dirty snapc %p auth cap %p\n",
+ inode, ceph_vinop(inode), ci->i_head_snapc,
+ ci->i_auth_cap);
BUG_ON(!list_empty(&ci->i_dirty_item));
spin_lock(&mdsc->cap_dirty_lock);
list_add(&ci->i_dirty_item, &session->s_cap_dirty);
@@ -1873,7 +1921,8 @@ static u64 __mark_caps_flushing(struct inode *inode,
struct ceph_mds_session *session, bool wake,
u64 *oldest_flush_tid)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap_flush *cf = NULL;
int flushing;
@@ -1884,13 +1933,13 @@ static u64 __mark_caps_flushing(struct inode *inode,
BUG_ON(!ci->i_prealloc_cap_flush);
flushing = ci->i_dirty_caps;
- dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
- ceph_cap_string(flushing),
- ceph_cap_string(ci->i_flushing_caps),
- ceph_cap_string(ci->i_flushing_caps | flushing));
+ doutc(cl, "flushing %s, flushing_caps %s -> %s\n",
+ ceph_cap_string(flushing),
+ ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(ci->i_flushing_caps | flushing));
ci->i_flushing_caps |= flushing;
ci->i_dirty_caps = 0;
- dout(" inode %p now !dirty\n", inode);
+ doutc(cl, "%p %llx.%llx now !dirty\n", inode, ceph_vinop(inode));
swap(cf, ci->i_prealloc_cap_flush);
cf->caps = flushing;
@@ -1921,6 +1970,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u32 invalidating_gen = ci->i_rdcache_gen;
@@ -1932,12 +1982,13 @@ static int try_nonblocking_invalidate(struct inode *inode)
if (inode->i_data.nrpages == 0 &&
invalidating_gen == ci->i_rdcache_gen) {
/* success. */
- dout("try_nonblocking_invalidate %p success\n", inode);
+ doutc(cl, "%p %llx.%llx success\n", inode,
+ ceph_vinop(inode));
/* save any racing async invalidate some trouble */
ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
return 0;
}
- dout("try_nonblocking_invalidate %p failed\n", inode);
+ doutc(cl, "%p %llx.%llx failed\n", inode, ceph_vinop(inode));
return -1;
}
@@ -1969,6 +2020,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags)
{
struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_cap *cap;
u64 flush_tid, oldest_flush_tid;
int file_wanted, used, cap_used;
@@ -2043,9 +2095,9 @@ retry:
}
}
- dout("check_caps %llx.%llx file_want %s used %s dirty %s flushing %s"
- " issued %s revoking %s retain %s %s%s%s\n", ceph_vinop(inode),
- ceph_cap_string(file_wanted),
+ doutc(cl, "%p %llx.%llx file_want %s used %s dirty %s "
+ "flushing %s issued %s revoking %s retain %s %s%s%s\n",
+ inode, ceph_vinop(inode), ceph_cap_string(file_wanted),
ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
ceph_cap_string(ci->i_flushing_caps),
ceph_cap_string(issued), ceph_cap_string(revoking),
@@ -2066,10 +2118,10 @@ retry:
(revoking & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
!tried_invalidate) {
- dout("check_caps trying to invalidate on %llx.%llx\n",
- ceph_vinop(inode));
+ doutc(cl, "trying to invalidate on %p %llx.%llx\n",
+ inode, ceph_vinop(inode));
if (try_nonblocking_invalidate(inode) < 0) {
- dout("check_caps queuing invalidate\n");
+ doutc(cl, "queuing invalidate\n");
queue_invalidate = true;
ci->i_rdcache_revoking = ci->i_rdcache_gen;
}
@@ -2097,35 +2149,35 @@ retry:
cap_used &= ~ci->i_auth_cap->issued;
revoking = cap->implemented & ~cap->issued;
- dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
- cap->mds, cap, ceph_cap_string(cap_used),
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->implemented),
- ceph_cap_string(revoking));
+ doutc(cl, " mds%d cap %p used %s issued %s implemented %s revoking %s\n",
+ cap->mds, cap, ceph_cap_string(cap_used),
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->implemented),
+ ceph_cap_string(revoking));
if (cap == ci->i_auth_cap &&
(cap->issued & CEPH_CAP_FILE_WR)) {
/* request larger max_size from MDS? */
if (ci->i_wanted_max_size > ci->i_max_size &&
ci->i_wanted_max_size > ci->i_requested_max_size) {
- dout("requesting new max_size\n");
+ doutc(cl, "requesting new max_size\n");
goto ack;
}
/* approaching file_max? */
if (__ceph_should_report_size(ci)) {
- dout("i_size approaching max_size\n");
+ doutc(cl, "i_size approaching max_size\n");
goto ack;
}
}
/* flush anything dirty? */
if (cap == ci->i_auth_cap) {
if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
- dout("flushing dirty caps\n");
+ doutc(cl, "flushing dirty caps\n");
goto ack;
}
if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
- dout("flushing snap caps\n");
+ doutc(cl, "flushing snap caps\n");
goto ack;
}
}
@@ -2133,7 +2185,7 @@ retry:
/* completed revocation? going down and there are no caps? */
if (revoking) {
if ((revoking & cap_used) == 0) {
- dout("completed revocation of %s\n",
+ doutc(cl, "completed revocation of %s\n",
ceph_cap_string(cap->implemented & ~cap->issued));
goto ack;
}
@@ -2232,7 +2284,7 @@ ack:
*/
static int try_flush_caps(struct inode *inode, u64 *ptid)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
int flushing = 0;
u64 flush_tid = 0, oldest_flush_tid = 0;
@@ -2310,7 +2362,8 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
*/
static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req1 = NULL, *req2 = NULL;
int ret, err = 0;
@@ -2400,8 +2453,9 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
kfree(sessions);
}
- dout("%s %p wait on tid %llu %llu\n", __func__,
- inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
+ doutc(cl, "%p %llx.%llx wait on tid %llu %llu\n", inode,
+ ceph_vinop(inode), req1 ? req1->r_tid : 0ULL,
+ req2 ? req2->r_tid : 0ULL);
if (req1) {
ret = !wait_for_completion_timeout(&req1->r_safe_completion,
ceph_timeout_jiffies(req1->r_timeout));
@@ -2427,11 +2481,13 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
u64 flush_tid;
int ret, err;
int dirty;
- dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
+ doutc(cl, "%p %llx.%llx%s\n", inode, ceph_vinop(inode),
+ datasync ? " datasync" : "");
ret = file_write_and_wait_range(file, start, end);
if (datasync)
@@ -2442,7 +2498,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
dirty = try_flush_caps(inode, &flush_tid);
- dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
+ doutc(cl, "dirty caps are %s\n", ceph_cap_string(dirty));
err = flush_mdlog_and_wait_inode_unsafe_requests(inode);
@@ -2463,7 +2519,8 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (err < 0)
ret = err;
out:
- dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
+ doutc(cl, "%p %llx.%llx%s result=%d\n", inode, ceph_vinop(inode),
+ datasync ? " datasync" : "", ret);
return ret;
}
@@ -2476,12 +2533,13 @@ out:
int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
u64 flush_tid;
int err = 0;
int dirty;
int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync);
- dout("write_inode %p wait=%d\n", inode, wait);
+ doutc(cl, "%p %llx.%llx wait=%d\n", inode, ceph_vinop(inode), wait);
ceph_fscache_unpin_writeback(inode, wbc);
if (wait) {
err = ceph_wait_on_async_create(inode);
@@ -2493,7 +2551,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
caps_are_flushed(inode, flush_tid));
} else {
struct ceph_mds_client *mdsc =
- ceph_sb_to_client(inode->i_sb)->mdsc;
+ ceph_sb_to_fs_client(inode->i_sb)->mdsc;
spin_lock(&ci->i_ceph_lock);
if (__ceph_caps_dirty(ci))
@@ -2511,6 +2569,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
__acquires(ci->i_ceph_lock)
{
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_cap *cap;
struct ceph_cap_flush *cf;
int ret;
@@ -2536,8 +2595,8 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
- pr_err("%p auth cap %p not mds%d ???\n",
- inode, cap, session->s_mds);
+ pr_err_client(cl, "%p auth cap %p not mds%d ???\n",
+ inode, cap, session->s_mds);
break;
}
@@ -2546,8 +2605,9 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
if (!cf->is_capsnap) {
struct cap_msg_args arg;
- dout("kick_flushing_caps %p cap %p tid %llu %s\n",
- inode, cap, cf->tid, ceph_cap_string(cf->caps));
+ doutc(cl, "%p %llx.%llx cap %p tid %llu %s\n",
+ inode, ceph_vinop(inode), cap, cf->tid,
+ ceph_cap_string(cf->caps));
__prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH,
(cf->tid < last_snap_flush ?
CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
@@ -2561,9 +2621,9 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_snap *capsnap =
container_of(cf, struct ceph_cap_snap,
cap_flush);
- dout("kick_flushing_caps %p capsnap %p tid %llu %s\n",
- inode, capsnap, cf->tid,
- ceph_cap_string(capsnap->dirty));
+ doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n",
+ inode, ceph_vinop(inode), capsnap, cf->tid,
+ ceph_cap_string(capsnap->dirty));
refcount_inc(&capsnap->nref);
spin_unlock(&ci->i_ceph_lock);
@@ -2571,11 +2631,10 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
oldest_flush_tid);
if (ret < 0) {
- pr_err("kick_flushing_caps: error sending "
- "cap flushsnap, ino (%llx.%llx) "
- "tid %llu follows %llu\n",
- ceph_vinop(inode), cf->tid,
- capsnap->follows);
+ pr_err_client(cl, "error sending cap flushsnap,"
+ " %p %llx.%llx tid %llu follows %llu\n",
+ inode, ceph_vinop(inode), cf->tid,
+ capsnap->follows);
}
ceph_put_cap_snap(capsnap);
@@ -2588,22 +2647,26 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct ceph_cap *cap;
u64 oldest_flush_tid;
- dout("early_kick_flushing_caps mds%d\n", session->s_mds);
+ doutc(cl, "mds%d\n", session->s_mds);
spin_lock(&mdsc->cap_dirty_lock);
oldest_flush_tid = __get_oldest_flush_tid(mdsc);
spin_unlock(&mdsc->cap_dirty_lock);
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+ struct inode *inode = &ci->netfs.inode;
+
spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
- pr_err("%p auth cap %p not mds%d ???\n",
- &ci->netfs.inode, cap, session->s_mds);
+ pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n",
+ inode, ceph_vinop(inode), cap,
+ session->s_mds);
spin_unlock(&ci->i_ceph_lock);
continue;
}
@@ -2636,24 +2699,28 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct ceph_cap *cap;
u64 oldest_flush_tid;
lockdep_assert_held(&session->s_mutex);
- dout("kick_flushing_caps mds%d\n", session->s_mds);
+ doutc(cl, "mds%d\n", session->s_mds);
spin_lock(&mdsc->cap_dirty_lock);
oldest_flush_tid = __get_oldest_flush_tid(mdsc);
spin_unlock(&mdsc->cap_dirty_lock);
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+ struct inode *inode = &ci->netfs.inode;
+
spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
- pr_err("%p auth cap %p not mds%d ???\n",
- &ci->netfs.inode, cap, session->s_mds);
+ pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n",
+ inode, ceph_vinop(inode), cap,
+ session->s_mds);
spin_unlock(&ci->i_ceph_lock);
continue;
}
@@ -2670,11 +2737,13 @@ void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
{
struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_cap *cap = ci->i_auth_cap;
+ struct inode *inode = &ci->netfs.inode;
lockdep_assert_held(&ci->i_ceph_lock);
- dout("%s %p flushing %s\n", __func__, &ci->netfs.inode,
- ceph_cap_string(ci->i_flushing_caps));
+ doutc(mdsc->fsc->client, "%p %llx.%llx flushing %s\n",
+ inode, ceph_vinop(inode),
+ ceph_cap_string(ci->i_flushing_caps));
if (!list_empty(&ci->i_cap_flush_list)) {
u64 oldest_flush_tid;
@@ -2696,6 +2765,9 @@ void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
bool snap_rwsem_locked)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
lockdep_assert_held(&ci->i_ceph_lock);
if (got & CEPH_CAP_PIN)
@@ -2716,10 +2788,10 @@ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
}
if (got & CEPH_CAP_FILE_BUFFER) {
if (ci->i_wb_ref == 0)
- ihold(&ci->netfs.inode);
+ ihold(inode);
ci->i_wb_ref++;
- dout("%s %p wb %d -> %d (?)\n", __func__,
- &ci->netfs.inode, ci->i_wb_ref-1, ci->i_wb_ref);
+ doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode,
+ ceph_vinop(inode), ci->i_wb_ref-1, ci->i_wb_ref);
}
}
@@ -2746,20 +2818,23 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
loff_t endoff, int flags, int *got)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int ret = 0;
int have, implemented;
bool snap_rwsem_locked = false;
- dout("get_cap_refs %p need %s want %s\n", inode,
- ceph_cap_string(need), ceph_cap_string(want));
+ doutc(cl, "%p %llx.%llx need %s want %s\n", inode,
+ ceph_vinop(inode), ceph_cap_string(need),
+ ceph_cap_string(want));
again:
spin_lock(&ci->i_ceph_lock);
if ((flags & CHECK_FILELOCK) &&
(ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) {
- dout("try_get_cap_refs %p error filelock\n", inode);
+ doutc(cl, "%p %llx.%llx error filelock\n", inode,
+ ceph_vinop(inode));
ret = -EIO;
goto out_unlock;
}
@@ -2779,8 +2854,8 @@ again:
if (have & need & CEPH_CAP_FILE_WR) {
if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
- dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
- inode, endoff, ci->i_max_size);
+ doutc(cl, "%p %llx.%llx endoff %llu > maxsize %llu\n",
+ inode, ceph_vinop(inode), endoff, ci->i_max_size);
if (endoff > ci->i_requested_max_size)
ret = ci->i_auth_cap ? -EFBIG : -EUCLEAN;
goto out_unlock;
@@ -2790,7 +2865,8 @@ again:
* can get a final snapshot value for size+mtime.
*/
if (__ceph_have_pending_cap_snap(ci)) {
- dout("get_cap_refs %p cap_snap_pending\n", inode);
+ doutc(cl, "%p %llx.%llx cap_snap_pending\n", inode,
+ ceph_vinop(inode));
goto out_unlock;
}
}
@@ -2808,9 +2884,9 @@ again:
int not = want & ~(have & need);
int revoking = implemented & ~have;
int exclude = revoking & not;
- dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
- inode, ceph_cap_string(have), ceph_cap_string(not),
- ceph_cap_string(revoking));
+ doutc(cl, "%p %llx.%llx have %s but not %s (revoking %s)\n",
+ inode, ceph_vinop(inode), ceph_cap_string(have),
+ ceph_cap_string(not), ceph_cap_string(revoking));
if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) {
if (!snap_rwsem_locked &&
!ci->i_head_snapc &&
@@ -2850,28 +2926,31 @@ again:
spin_unlock(&s->s_cap_lock);
}
if (session_readonly) {
- dout("get_cap_refs %p need %s but mds%d readonly\n",
- inode, ceph_cap_string(need), ci->i_auth_cap->mds);
+ doutc(cl, "%p %llx.%llx need %s but mds%d readonly\n",
+ inode, ceph_vinop(inode), ceph_cap_string(need),
+ ci->i_auth_cap->mds);
ret = -EROFS;
goto out_unlock;
}
if (ceph_inode_is_shutdown(inode)) {
- dout("get_cap_refs %p inode is shutdown\n", inode);
+ doutc(cl, "%p %llx.%llx inode is shutdown\n",
+ inode, ceph_vinop(inode));
ret = -ESTALE;
goto out_unlock;
}
mds_wanted = __ceph_caps_mds_wanted(ci, false);
if (need & ~mds_wanted) {
- dout("get_cap_refs %p need %s > mds_wanted %s\n",
- inode, ceph_cap_string(need),
- ceph_cap_string(mds_wanted));
+ doutc(cl, "%p %llx.%llx need %s > mds_wanted %s\n",
+ inode, ceph_vinop(inode), ceph_cap_string(need),
+ ceph_cap_string(mds_wanted));
ret = -EUCLEAN;
goto out_unlock;
}
- dout("get_cap_refs %p have %s need %s\n", inode,
- ceph_cap_string(have), ceph_cap_string(need));
+ doutc(cl, "%p %llx.%llx have %s need %s\n", inode,
+ ceph_vinop(inode), ceph_cap_string(have),
+ ceph_cap_string(need));
}
out_unlock:
@@ -2886,8 +2965,8 @@ out_unlock:
else if (ret == 1)
ceph_update_cap_hit(&mdsc->metric);
- dout("get_cap_refs %p ret %d got %s\n", inode,
- ret, ceph_cap_string(*got));
+ doutc(cl, "%p %llx.%llx ret %d got %s\n", inode,
+ ceph_vinop(inode), ret, ceph_cap_string(*got));
return ret;
}
@@ -2899,13 +2978,14 @@ out_unlock:
static void check_max_size(struct inode *inode, loff_t endoff)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int check = 0;
/* do we need to explicitly request a larger max_size? */
spin_lock(&ci->i_ceph_lock);
if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) {
- dout("write %p at large endoff %llu, req max_size\n",
- inode, endoff);
+ doutc(cl, "write %p %llx.%llx at large endoff %llu, req max_size\n",
+ inode, ceph_vinop(inode), endoff);
ci->i_wanted_max_size = endoff;
}
/* duplicate ceph_check_caps()'s logic */
@@ -2964,7 +3044,7 @@ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need,
int want, loff_t endoff, int *got)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
int ret, _got, flags;
ret = ceph_pool_perm_check(inode, need);
@@ -3115,10 +3195,12 @@ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
if (!capsnap->need_flush &&
!capsnap->writing && !capsnap->dirty_pages) {
- dout("dropping cap_snap %p follows %llu\n",
- capsnap, capsnap->follows);
+ doutc(cl, "%p follows %llu\n", capsnap, capsnap->follows);
BUG_ON(capsnap->cap_flush.tid > 0);
ceph_put_snap_context(capsnap->context);
if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps))
@@ -3150,6 +3232,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
enum put_cap_refs_mode mode)
{
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int last = 0, put = 0, flushsnaps = 0, wake = 0;
bool check_flushsnaps = false;
@@ -3172,8 +3255,8 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
put++;
check_flushsnaps = true;
}
- dout("put_cap_refs %p wb %d -> %d (?)\n",
- inode, ci->i_wb_ref+1, ci->i_wb_ref);
+ doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode,
+ ceph_vinop(inode), ci->i_wb_ref+1, ci->i_wb_ref);
}
if (had & CEPH_CAP_FILE_WR) {
if (--ci->i_wr_ref == 0) {
@@ -3213,8 +3296,8 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
}
spin_unlock(&ci->i_ceph_lock);
- dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
- last ? " last" : "", put ? " put" : "");
+ doutc(cl, "%p %llx.%llx had %s%s%s\n", inode, ceph_vinop(inode),
+ ceph_cap_string(had), last ? " last" : "", put ? " put" : "");
switch (mode) {
case PUT_CAP_REFS_SYNC:
@@ -3264,6 +3347,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc)
{
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_cap_snap *capsnap = NULL, *iter;
int put = 0;
bool last = false;
@@ -3287,11 +3371,10 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
ceph_put_snap_context(ci->i_head_snapc);
ci->i_head_snapc = NULL;
}
- dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
- inode,
- ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
- ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
- last ? " LAST" : "");
+ doutc(cl, "on %p %llx.%llx head %d/%d -> %d/%d %s\n",
+ inode, ceph_vinop(inode), ci->i_wrbuffer_ref+nr,
+ ci->i_wrbuffer_ref_head+nr, ci->i_wrbuffer_ref,
+ ci->i_wrbuffer_ref_head, last ? " LAST" : "");
} else {
list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
if (iter->context == snapc) {
@@ -3321,13 +3404,12 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
}
}
}
- dout("put_wrbuffer_cap_refs on %p cap_snap %p "
- " snap %lld %d/%d -> %d/%d %s%s\n",
- inode, capsnap, capsnap->context->seq,
- ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
- ci->i_wrbuffer_ref, capsnap->dirty_pages,
- last ? " (wrbuffer last)" : "",
- complete_capsnap ? " (complete capsnap)" : "");
+ doutc(cl, "%p %llx.%llx cap_snap %p snap %lld %d/%d -> %d/%d %s%s\n",
+ inode, ceph_vinop(inode), capsnap, capsnap->context->seq,
+ ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
+ ci->i_wrbuffer_ref, capsnap->dirty_pages,
+ last ? " (wrbuffer last)" : "",
+ complete_capsnap ? " (complete capsnap)" : "");
}
unlock:
@@ -3350,9 +3432,10 @@ unlock:
*/
static void invalidate_aliases(struct inode *inode)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct dentry *dn, *prev = NULL;
- dout("invalidate_aliases inode %p\n", inode);
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
d_prune_aliases(inode);
/*
* For non-directory inode, d_find_alias() only returns
@@ -3411,6 +3494,7 @@ static void handle_cap_grant(struct inode *inode,
__releases(ci->i_ceph_lock)
__releases(session->s_mdsc->snap_rwsem)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
int seq = le32_to_cpu(grant->seq);
int newcaps = le32_to_cpu(grant->caps);
@@ -3434,10 +3518,11 @@ static void handle_cap_grant(struct inode *inode,
if (IS_ENCRYPTED(inode) && size)
size = extra_info->fscrypt_file_size;
- dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
- inode, cap, session->s_mds, seq, ceph_cap_string(newcaps));
- dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
- i_size_read(inode));
+ doutc(cl, "%p %llx.%llx cap %p mds%d seq %d %s\n", inode,
+ ceph_vinop(inode), cap, session->s_mds, seq,
+ ceph_cap_string(newcaps));
+ doutc(cl, " size %llu max_size %llu, i_size %llu\n", size,
+ max_size, i_size_read(inode));
/*
@@ -3497,15 +3582,17 @@ static void handle_cap_grant(struct inode *inode,
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
ci->i_btime = extra_info->btime;
- dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
- from_kuid(&init_user_ns, inode->i_uid),
- from_kgid(&init_user_ns, inode->i_gid));
+ doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode,
+ ceph_vinop(inode), inode->i_mode,
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kgid(&init_user_ns, inode->i_gid));
#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
if (ci->fscrypt_auth_len != extra_info->fscrypt_auth_len ||
memcmp(ci->fscrypt_auth, extra_info->fscrypt_auth,
ci->fscrypt_auth_len))
- pr_warn_ratelimited("%s: cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n",
- __func__, ci->fscrypt_auth_len,
+ pr_warn_ratelimited_client(cl,
+ "cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n",
+ ci->fscrypt_auth_len,
extra_info->fscrypt_auth_len);
#endif
}
@@ -3523,8 +3610,8 @@ static void handle_cap_grant(struct inode *inode,
u64 version = le64_to_cpu(grant->xattr_version);
if (version > ci->i_xattrs.version) {
- dout(" got new xattrs v%llu on %p len %d\n",
- version, inode, len);
+ doutc(cl, " got new xattrs v%llu on %p %llx.%llx len %d\n",
+ version, inode, ceph_vinop(inode), len);
if (ci->i_xattrs.blob)
ceph_buffer_put(ci->i_xattrs.blob);
ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
@@ -3575,8 +3662,8 @@ static void handle_cap_grant(struct inode *inode,
if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) {
if (max_size != ci->i_max_size) {
- dout("max_size %lld -> %llu\n",
- ci->i_max_size, max_size);
+ doutc(cl, "max_size %lld -> %llu\n", ci->i_max_size,
+ max_size);
ci->i_max_size = max_size;
if (max_size >= ci->i_wanted_max_size) {
ci->i_wanted_max_size = 0; /* reset */
@@ -3590,10 +3677,9 @@ static void handle_cap_grant(struct inode *inode,
wanted = __ceph_caps_wanted(ci);
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
- dout(" my wanted = %s, used = %s, dirty %s\n",
- ceph_cap_string(wanted),
- ceph_cap_string(used),
- ceph_cap_string(dirty));
+ doutc(cl, " my wanted = %s, used = %s, dirty %s\n",
+ ceph_cap_string(wanted), ceph_cap_string(used),
+ ceph_cap_string(dirty));
if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) &&
(wanted & ~(cap->mds_wanted | newcaps))) {
@@ -3614,10 +3700,9 @@ static void handle_cap_grant(struct inode *inode,
if (cap->issued & ~newcaps) {
int revoking = cap->issued & ~newcaps;
- dout("revocation: %s -> %s (revoking %s)\n",
- ceph_cap_string(cap->issued),
- ceph_cap_string(newcaps),
- ceph_cap_string(revoking));
+ doutc(cl, "revocation: %s -> %s (revoking %s)\n",
+ ceph_cap_string(cap->issued), ceph_cap_string(newcaps),
+ ceph_cap_string(revoking));
if (S_ISREG(inode->i_mode) &&
(revoking & used & CEPH_CAP_FILE_BUFFER))
writeback = true; /* initiate writeback; will delay ack */
@@ -3635,11 +3720,12 @@ static void handle_cap_grant(struct inode *inode,
cap->issued = newcaps;
cap->implemented |= newcaps;
} else if (cap->issued == newcaps) {
- dout("caps unchanged: %s -> %s\n",
- ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
+ doutc(cl, "caps unchanged: %s -> %s\n",
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(newcaps));
} else {
- dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
- ceph_cap_string(newcaps));
+ doutc(cl, "grant: %s -> %s\n", ceph_cap_string(cap->issued),
+ ceph_cap_string(newcaps));
/* non-auth MDS is revoking the newly granted caps? */
if (cap == ci->i_auth_cap &&
__ceph_caps_revoking_other(ci, cap, newcaps))
@@ -3727,7 +3813,8 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
__releases(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_cap_flush *cf, *tmp_cf;
LIST_HEAD(to_remove);
unsigned seq = le32_to_cpu(m->seq);
@@ -3764,11 +3851,11 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
}
}
- dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
- " flushing %s -> %s\n",
- inode, session->s_mds, seq, ceph_cap_string(dirty),
- ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
- ceph_cap_string(ci->i_flushing_caps & ~cleaned));
+ doutc(cl, "%p %llx.%llx mds%d seq %d on %s cleaned %s, flushing %s -> %s\n",
+ inode, ceph_vinop(inode), session->s_mds, seq,
+ ceph_cap_string(dirty), ceph_cap_string(cleaned),
+ ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(ci->i_flushing_caps & ~cleaned));
if (list_empty(&to_remove) && !cleaned)
goto out;
@@ -3784,18 +3871,21 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
if (list_empty(&ci->i_cap_flush_list)) {
list_del_init(&ci->i_flushing_item);
if (!list_empty(&session->s_cap_flushing)) {
- dout(" mds%d still flushing cap on %p\n",
- session->s_mds,
- &list_first_entry(&session->s_cap_flushing,
- struct ceph_inode_info,
- i_flushing_item)->netfs.inode);
+ struct inode *inode =
+ &list_first_entry(&session->s_cap_flushing,
+ struct ceph_inode_info,
+ i_flushing_item)->netfs.inode;
+ doutc(cl, " mds%d still flushing cap on %p %llx.%llx\n",
+ session->s_mds, inode, ceph_vinop(inode));
}
}
mdsc->num_cap_flushing--;
- dout(" inode %p now !flushing\n", inode);
+ doutc(cl, " %p %llx.%llx now !flushing\n", inode,
+ ceph_vinop(inode));
if (ci->i_dirty_caps == 0) {
- dout(" inode %p now clean\n", inode);
+ doutc(cl, " %p %llx.%llx now clean\n", inode,
+ ceph_vinop(inode));
BUG_ON(!list_empty(&ci->i_dirty_item));
drop = true;
if (ci->i_wr_ref == 0 &&
@@ -3833,12 +3923,14 @@ void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
bool *wake_ci, bool *wake_mdsc)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
bool ret;
lockdep_assert_held(&ci->i_ceph_lock);
- dout("removing capsnap %p, inode %p ci %p\n", capsnap, inode, ci);
+ doutc(cl, "removing capsnap %p, %p %llx.%llx ci %p\n", capsnap,
+ inode, ceph_vinop(inode), ci);
list_del_init(&capsnap->ci_item);
ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush);
@@ -3877,29 +3969,31 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
struct ceph_mds_session *session)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
u64 follows = le64_to_cpu(m->snap_follows);
struct ceph_cap_snap *capsnap = NULL, *iter;
bool wake_ci = false;
bool wake_mdsc = false;
- dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
- inode, ci, session->s_mds, follows);
+ doutc(cl, "%p %llx.%llx ci %p mds%d follows %lld\n", inode,
+ ceph_vinop(inode), ci, session->s_mds, follows);
spin_lock(&ci->i_ceph_lock);
list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
if (iter->follows == follows) {
if (iter->cap_flush.tid != flush_tid) {
- dout(" cap_snap %p follows %lld tid %lld !="
- " %lld\n", iter, follows,
- flush_tid, iter->cap_flush.tid);
+ doutc(cl, " cap_snap %p follows %lld "
+ "tid %lld != %lld\n", iter,
+ follows, flush_tid,
+ iter->cap_flush.tid);
break;
}
capsnap = iter;
break;
} else {
- dout(" skipping cap_snap %p follows %lld\n",
- iter, iter->follows);
+ doutc(cl, " skipping cap_snap %p follows %lld\n",
+ iter, iter->follows);
}
}
if (capsnap)
@@ -3928,6 +4022,7 @@ static bool handle_cap_trunc(struct inode *inode,
struct cap_extra_info *extra_info)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int mds = session->s_mds;
int seq = le32_to_cpu(trunc->seq);
u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
@@ -3950,8 +4045,8 @@ static bool handle_cap_trunc(struct inode *inode,
if (IS_ENCRYPTED(inode) && size)
size = extra_info->fscrypt_file_size;
- dout("%s inode %p mds%d seq %d to %lld truncate seq %d\n",
- __func__, inode, mds, seq, truncate_size, truncate_seq);
+ doutc(cl, "%p %llx.%llx mds%d seq %d to %lld truncate seq %d\n",
+ inode, ceph_vinop(inode), mds, seq, truncate_size, truncate_seq);
queue_trunc = ceph_fill_file_size(inode, issued,
truncate_seq, truncate_size, size);
return queue_trunc;
@@ -3969,7 +4064,8 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
struct ceph_mds_cap_peer *ph,
struct ceph_mds_session *session)
{
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_session *tsession = NULL;
struct ceph_cap *cap, *tcap, *new_cap = NULL;
struct ceph_inode_info *ci = ceph_inode(inode);
@@ -3989,8 +4085,8 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
target = -1;
}
- dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
- inode, ci, mds, mseq, target);
+ doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d target %d\n",
+ inode, ceph_vinop(inode), ci, mds, mseq, target);
retry:
down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
@@ -3999,7 +4095,7 @@ retry:
goto out_unlock;
if (target < 0) {
- ceph_remove_cap(cap, false);
+ ceph_remove_cap(mdsc, cap, false);
goto out_unlock;
}
@@ -4010,12 +4106,13 @@ retry:
issued = cap->issued;
if (issued != cap->implemented)
- pr_err_ratelimited("handle_cap_export: issued != implemented: "
- "ino (%llx.%llx) mds%d seq %d mseq %d "
- "issued %s implemented %s\n",
- ceph_vinop(inode), mds, cap->seq, cap->mseq,
- ceph_cap_string(issued),
- ceph_cap_string(cap->implemented));
+ pr_err_ratelimited_client(cl, "issued != implemented: "
+ "%p %llx.%llx mds%d seq %d mseq %d"
+ " issued %s implemented %s\n",
+ inode, ceph_vinop(inode), mds,
+ cap->seq, cap->mseq,
+ ceph_cap_string(issued),
+ ceph_cap_string(cap->implemented));
tcap = __get_cap_for_mds(ci, target);
@@ -4023,7 +4120,8 @@ retry:
/* already have caps from the target */
if (tcap->cap_id == t_cap_id &&
ceph_seq_cmp(tcap->seq, t_seq) < 0) {
- dout(" updating import cap %p mds%d\n", tcap, target);
+ doutc(cl, " updating import cap %p mds%d\n", tcap,
+ target);
tcap->cap_id = t_cap_id;
tcap->seq = t_seq - 1;
tcap->issue_seq = t_seq - 1;
@@ -4034,7 +4132,7 @@ retry:
change_auth_cap_ses(ci, tcap->session);
}
}
- ceph_remove_cap(cap, false);
+ ceph_remove_cap(mdsc, cap, false);
goto out_unlock;
} else if (tsession) {
/* add placeholder for the export target */
@@ -4051,7 +4149,7 @@ retry:
spin_unlock(&mdsc->cap_dirty_lock);
}
- ceph_remove_cap(cap, false);
+ ceph_remove_cap(mdsc, cap, false);
goto out_unlock;
}
@@ -4104,6 +4202,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
struct ceph_cap **target_cap, int *old_issued)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_cap *cap, *ocap, *new_cap = NULL;
int mds = session->s_mds;
int issued;
@@ -4124,8 +4223,8 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
peer = -1;
}
- dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
- inode, ci, mds, mseq, peer);
+ doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d peer %d\n",
+ inode, ceph_vinop(inode), ci, mds, mseq, peer);
retry:
cap = __get_cap_for_mds(ci, mds);
if (!cap) {
@@ -4151,20 +4250,20 @@ retry:
ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
if (ocap && ocap->cap_id == p_cap_id) {
- dout(" remove export cap %p mds%d flags %d\n",
- ocap, peer, ph->flags);
+ doutc(cl, " remove export cap %p mds%d flags %d\n",
+ ocap, peer, ph->flags);
if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
(ocap->seq != le32_to_cpu(ph->seq) ||
ocap->mseq != le32_to_cpu(ph->mseq))) {
- pr_err_ratelimited("handle_cap_import: "
- "mismatched seq/mseq: ino (%llx.%llx) "
- "mds%d seq %d mseq %d importer mds%d "
- "has peer seq %d mseq %d\n",
- ceph_vinop(inode), peer, ocap->seq,
- ocap->mseq, mds, le32_to_cpu(ph->seq),
+ pr_err_ratelimited_client(cl, "mismatched seq/mseq: "
+ "%p %llx.%llx mds%d seq %d mseq %d"
+ " importer mds%d has peer seq %d mseq %d\n",
+ inode, ceph_vinop(inode), peer,
+ ocap->seq, ocap->mseq, mds,
+ le32_to_cpu(ph->seq),
le32_to_cpu(ph->mseq));
}
- ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
+ ceph_remove_cap(mdsc, ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
}
*old_issued = issued;
@@ -4227,6 +4326,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_msg *msg)
{
struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct inode *inode;
struct ceph_inode_info *ci;
struct ceph_cap *cap;
@@ -4245,7 +4345,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
bool close_sessions = false;
bool do_cap_release = false;
- dout("handle_caps from mds%d\n", session->s_mds);
+ doutc(cl, "from mds%d\n", session->s_mds);
if (!ceph_inc_mds_stopping_blocker(mdsc, session))
return;
@@ -4347,15 +4447,15 @@ void ceph_handle_caps(struct ceph_mds_session *session,
/* lookup ino */
inode = ceph_find_inode(mdsc->fsc->sb, vino);
- dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
- vino.snap, inode);
+ doutc(cl, " op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op),
+ vino.ino, vino.snap, inode);
mutex_lock(&session->s_mutex);
- dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
- (unsigned)seq);
+ doutc(cl, " mds%d seq %lld cap seq %u\n", session->s_mds,
+ session->s_seq, (unsigned)seq);
if (!inode) {
- dout(" i don't have ino %llx\n", vino.ino);
+ doutc(cl, " i don't have ino %llx\n", vino.ino);
switch (op) {
case CEPH_CAP_OP_IMPORT:
@@ -4410,9 +4510,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds);
if (!cap) {
- dout(" no cap on %p ino %llx.%llx from mds%d\n",
- inode, ceph_ino(inode), ceph_snap(inode),
- session->s_mds);
+ doutc(cl, " no cap on %p ino %llx.%llx from mds%d\n",
+ inode, ceph_ino(inode), ceph_snap(inode),
+ session->s_mds);
spin_unlock(&ci->i_ceph_lock);
switch (op) {
case CEPH_CAP_OP_REVOKE:
@@ -4450,8 +4550,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
default:
spin_unlock(&ci->i_ceph_lock);
- pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
- ceph_cap_op_name(op));
+ pr_err_client(cl, "unknown cap op %d %s\n", op,
+ ceph_cap_op_name(op));
}
done:
@@ -4492,7 +4592,7 @@ flush_cap_releases:
goto done;
bad:
- pr_err("ceph_handle_caps: corrupt message\n");
+ pr_err_client(cl, "corrupt message\n");
ceph_msg_dump(msg);
goto out;
}
@@ -4506,6 +4606,7 @@ bad:
*/
unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct inode *inode;
struct ceph_inode_info *ci;
struct ceph_mount_options *opt = mdsc->fsc->mount_options;
@@ -4513,14 +4614,14 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
unsigned long loop_start = jiffies;
unsigned long delay = 0;
- dout("check_delayed_caps\n");
+ doutc(cl, "begin\n");
spin_lock(&mdsc->cap_delay_lock);
while (!list_empty(&mdsc->cap_delay_list)) {
ci = list_first_entry(&mdsc->cap_delay_list,
struct ceph_inode_info,
i_cap_delay_list);
if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) {
- dout("%s caps added recently. Exiting loop", __func__);
+ doutc(cl, "caps added recently. Exiting loop");
delay = ci->i_hold_caps_max;
break;
}
@@ -4532,13 +4633,15 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
inode = igrab(&ci->netfs.inode);
if (inode) {
spin_unlock(&mdsc->cap_delay_lock);
- dout("check_delayed_caps on %p\n", inode);
+ doutc(cl, "on %p %llx.%llx\n", inode,
+ ceph_vinop(inode));
ceph_check_caps(ci, 0);
iput(inode);
spin_lock(&mdsc->cap_delay_lock);
}
}
spin_unlock(&mdsc->cap_delay_lock);
+ doutc(cl, "done\n");
return delay;
}
@@ -4549,17 +4652,18 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
static void flush_dirty_session_caps(struct ceph_mds_session *s)
{
struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct inode *inode;
- dout("flush_dirty_caps\n");
+ doutc(cl, "begin\n");
spin_lock(&mdsc->cap_dirty_lock);
while (!list_empty(&s->s_cap_dirty)) {
ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info,
i_dirty_item);
inode = &ci->netfs.inode;
ihold(inode);
- dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode));
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
spin_unlock(&mdsc->cap_dirty_lock);
ceph_wait_on_async_create(inode);
ceph_check_caps(ci, CHECK_CAPS_FLUSH);
@@ -4567,7 +4671,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
spin_lock(&mdsc->cap_dirty_lock);
}
spin_unlock(&mdsc->cap_dirty_lock);
- dout("flush_dirty_caps done\n");
+ doutc(cl, "done\n");
}
void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
@@ -4672,7 +4776,7 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
if (__ceph_caps_dirty(ci)) {
struct ceph_mds_client *mdsc =
- ceph_inode_to_client(inode)->mdsc;
+ ceph_inode_to_fs_client(inode)->mdsc;
__cap_delay_requeue_front(mdsc, ci);
}
}
@@ -4692,6 +4796,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
int mds, int drop, int unless, int force)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_cap *cap;
struct ceph_mds_request_release *rel = *p;
int used, dirty;
@@ -4701,9 +4806,9 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
- dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
- inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
- ceph_cap_string(unless));
+ doutc(cl, "%p %llx.%llx mds%d used|dirty %s drop %s unless %s\n",
+ inode, ceph_vinop(inode), mds, ceph_cap_string(used|dirty),
+ ceph_cap_string(drop), ceph_cap_string(unless));
/* only drop unused, clean caps */
drop &= ~(used | dirty);
@@ -4725,12 +4830,13 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
if (force || (cap->issued & drop)) {
if (cap->issued & drop) {
int wanted = __ceph_caps_wanted(ci);
- dout("encode_inode_release %p cap %p "
- "%s -> %s, wanted %s -> %s\n", inode, cap,
- ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & ~drop),
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(wanted));
+ doutc(cl, "%p %llx.%llx cap %p %s -> %s, "
+ "wanted %s -> %s\n", inode,
+ ceph_vinop(inode), cap,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->issued & ~drop),
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(wanted));
cap->issued &= ~drop;
cap->implemented &= ~drop;
@@ -4739,9 +4845,9 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
!(wanted & CEPH_CAP_ANY_FILE_WR))
ci->i_requested_max_size = 0;
} else {
- dout("encode_inode_release %p cap %p %s"
- " (force)\n", inode, cap,
- ceph_cap_string(cap->issued));
+ doutc(cl, "%p %llx.%llx cap %p %s (force)\n",
+ inode, ceph_vinop(inode), cap,
+ ceph_cap_string(cap->issued));
}
rel->ino = cpu_to_le64(ceph_ino(inode));
@@ -4756,8 +4862,9 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
*p += sizeof(*rel);
ret = 1;
} else {
- dout("encode_inode_release %p cap %p %s (noop)\n",
- inode, cap, ceph_cap_string(cap->issued));
+ doutc(cl, "%p %llx.%llx cap %p %s (noop)\n",
+ inode, ceph_vinop(inode), cap,
+ ceph_cap_string(cap->issued));
}
}
spin_unlock(&ci->i_ceph_lock);
@@ -4783,6 +4890,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
struct dentry *parent = NULL;
struct ceph_mds_request_release *rel = *p;
struct ceph_dentry_info *di = ceph_dentry(dentry);
+ struct ceph_client *cl;
int force = 0;
int ret;
@@ -4804,10 +4912,11 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
dput(parent);
+ cl = ceph_inode_to_client(dir);
spin_lock(&dentry->d_lock);
if (ret && di->lease_session && di->lease_session->s_mds == mds) {
- dout("encode_dentry_release %p mds%d seq %d\n",
- dentry, mds, (int)di->lease_seq);
+ doutc(cl, "%p mds%d seq %d\n", dentry, mds,
+ (int)di->lease_seq);
rel->dname_seq = cpu_to_le32(di->lease_seq);
__ceph_mdsc_drop_dentry_lease(dentry);
spin_unlock(&dentry->d_lock);
@@ -4833,12 +4942,14 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_cap_snap *capsnap;
int capsnap_release = 0;
lockdep_assert_held(&ci->i_ceph_lock);
- dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode);
+ doutc(cl, "removing capsnaps, ci is %p, %p %llx.%llx\n",
+ ci, inode, ceph_vinop(inode));
while (!list_empty(&ci->i_cap_snaps)) {
capsnap = list_first_entry(&ci->i_cap_snaps,
@@ -4855,8 +4966,9 @@ static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode)
int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_client *cl = fsc->client;
struct ceph_inode_info *ci = ceph_inode(inode);
bool is_auth;
bool dirty_dropped = false;
@@ -4864,8 +4976,8 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali
lockdep_assert_held(&ci->i_ceph_lock);
- dout("removing cap %p, ci is %p, inode is %p\n",
- cap, ci, &ci->netfs.inode);
+ doutc(cl, "removing cap %p, ci is %p, %p %llx.%llx\n",
+ cap, ci, inode, ceph_vinop(inode));
is_auth = (cap == ci->i_auth_cap);
__ceph_remove_cap(cap, false);
@@ -4892,19 +5004,19 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali
}
if (!list_empty(&ci->i_dirty_item)) {
- pr_warn_ratelimited(
- " dropping dirty %s state for %p %lld\n",
+ pr_warn_ratelimited_client(cl,
+ " dropping dirty %s state for %p %llx.%llx\n",
ceph_cap_string(ci->i_dirty_caps),
- inode, ceph_ino(inode));
+ inode, ceph_vinop(inode));
ci->i_dirty_caps = 0;
list_del_init(&ci->i_dirty_item);
dirty_dropped = true;
}
if (!list_empty(&ci->i_flushing_item)) {
- pr_warn_ratelimited(
- " dropping dirty+flushing %s state for %p %lld\n",
+ pr_warn_ratelimited_client(cl,
+ " dropping dirty+flushing %s state for %p %llx.%llx\n",
ceph_cap_string(ci->i_flushing_caps),
- inode, ceph_ino(inode));
+ inode, ceph_vinop(inode));
ci->i_flushing_caps = 0;
list_del_init(&ci->i_flushing_item);
mdsc->num_cap_flushing--;
@@ -4927,8 +5039,9 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali
if (atomic_read(&ci->i_filelock_ref) > 0) {
/* make further file lock syscall return -EIO */
ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
- pr_warn_ratelimited(" dropping file locks for %p %lld\n",
- inode, ceph_ino(inode));
+ pr_warn_ratelimited_client(cl,
+ " dropping file locks for %p %llx.%llx\n",
+ inode, ceph_vinop(inode));
}
if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
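
A minimal sketch (not part of the patch) of the pattern the caps.c hunks above apply: debug lines gain a per-client context and identify inodes by vino rather than a bare pointer. It assumes the fs/ceph "super.h" helpers that the diff itself uses (doutc, ceph_inode_to_client, ceph_vinop):

#include "super.h"	/* fs/ceph private header providing the helpers below */

static void example_cap_debug(struct inode *inode)
{
	/* per-client debug context, so the log line carries the client identity */
	struct ceph_client *cl = ceph_inode_to_client(inode);

	/* identify the inode by vino (ino.snap) in addition to its pointer */
	doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
}
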
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index 5b5112c78462..3b3c4d8d401e 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -113,7 +113,7 @@ static int ceph_crypt_set_context(struct inode *inode, const void *ctx,
cia.fscrypt_auth = cfa;
- ret = __ceph_setattr(inode, &attr, &cia);
+ ret = __ceph_setattr(&nop_mnt_idmap, inode, &attr, &cia);
if (ret == 0)
inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED);
kfree(cia.fscrypt_auth);
@@ -129,10 +129,11 @@ static bool ceph_crypt_empty_dir(struct inode *inode)
static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb)
{
- return ceph_sb_to_client(sb)->fsc_dummy_enc_policy.policy;
+ return ceph_sb_to_fs_client(sb)->fsc_dummy_enc_policy.policy;
}
static struct fscrypt_operations ceph_fscrypt_ops = {
+ .needs_bounce_pages = 1,
.get_context = ceph_crypt_get_context,
.set_context = ceph_crypt_set_context,
.get_dummy_policy = ceph_get_dummy_policy,
@@ -211,6 +212,7 @@ void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req,
static struct inode *parse_longname(const struct inode *parent,
const char *name, int *name_len)
{
+ struct ceph_client *cl = ceph_inode_to_client(parent);
struct inode *dir = NULL;
struct ceph_vino vino = { .snap = CEPH_NOSNAP };
char *inode_number;
@@ -222,12 +224,12 @@ static struct inode *parse_longname(const struct inode *parent,
name++;
name_end = strrchr(name, '_');
if (!name_end) {
- dout("Failed to parse long snapshot name: %s\n", name);
+ doutc(cl, "failed to parse long snapshot name: %s\n", name);
return ERR_PTR(-EIO);
}
*name_len = (name_end - name);
if (*name_len <= 0) {
- pr_err("Failed to parse long snapshot name\n");
+ pr_err_client(cl, "failed to parse long snapshot name\n");
return ERR_PTR(-EIO);
}
@@ -239,7 +241,7 @@ static struct inode *parse_longname(const struct inode *parent,
return ERR_PTR(-ENOMEM);
ret = kstrtou64(inode_number, 10, &vino.ino);
if (ret) {
- dout("Failed to parse inode number: %s\n", name);
+ doutc(cl, "failed to parse inode number: %s\n", name);
dir = ERR_PTR(ret);
goto out;
}
@@ -250,7 +252,7 @@ static struct inode *parse_longname(const struct inode *parent,
/* This can happen if we're not mounting cephfs on the root */
dir = ceph_get_inode(parent->i_sb, vino, NULL);
if (IS_ERR(dir))
- dout("Can't find inode %s (%s)\n", inode_number, name);
+ doutc(cl, "can't find inode %s (%s)\n", inode_number, name);
}
out:
@@ -261,6 +263,7 @@ out:
int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
char *buf)
{
+ struct ceph_client *cl = ceph_inode_to_client(parent);
struct inode *dir = parent;
struct qstr iname;
u32 len;
@@ -329,7 +332,7 @@ int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
/* base64 encode the encrypted name */
elen = ceph_base64_encode(cryptbuf, len, buf);
- dout("base64-encoded ciphertext name = %.*s\n", elen, buf);
+ doutc(cl, "base64-encoded ciphertext name = %.*s\n", elen, buf);
/* To understand the 240 limit, see CEPH_NOHASH_NAME_MAX comments */
WARN_ON(elen > 240);
@@ -504,7 +507,10 @@ int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode,
struct page *page, unsigned int len,
unsigned int offs, u64 lblk_num)
{
- dout("%s: len %u offs %u blk %llu\n", __func__, len, offs, lblk_num);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
+ doutc(cl, "%p %llx.%llx len %u offs %u blk %llu\n", inode,
+ ceph_vinop(inode), len, offs, lblk_num);
return fscrypt_decrypt_block_inplace(inode, page, len, offs, lblk_num);
}
@@ -513,7 +519,10 @@ int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode,
unsigned int offs, u64 lblk_num,
gfp_t gfp_flags)
{
- dout("%s: len %u offs %u blk %llu\n", __func__, len, offs, lblk_num);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
+
+ doutc(cl, "%p %llx.%llx len %u offs %u blk %llu\n", inode,
+ ceph_vinop(inode), len, offs, lblk_num);
return fscrypt_encrypt_block_inplace(inode, page, len, offs, lblk_num,
gfp_flags);
}
@@ -582,6 +591,7 @@ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
u64 off, struct ceph_sparse_extent *map,
u32 ext_cnt)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int i, ret = 0;
struct ceph_inode_info *ci = ceph_inode(inode);
u64 objno, objoff;
@@ -589,7 +599,8 @@ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
/* Nothing to do for empty array */
if (ext_cnt == 0) {
- dout("%s: empty array, ret 0\n", __func__);
+ doutc(cl, "%p %llx.%llx empty array, ret 0\n", inode,
+ ceph_vinop(inode));
return 0;
}
@@ -603,14 +614,17 @@ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
int fret;
if ((ext->off | ext->len) & ~CEPH_FSCRYPT_BLOCK_MASK) {
- pr_warn("%s: bad encrypted sparse extent idx %d off %llx len %llx\n",
- __func__, i, ext->off, ext->len);
+ pr_warn_client(cl,
+ "%p %llx.%llx bad encrypted sparse extent "
+ "idx %d off %llx len %llx\n",
+ inode, ceph_vinop(inode), i, ext->off,
+ ext->len);
return -EIO;
}
fret = ceph_fscrypt_decrypt_pages(inode, &page[pgidx],
off + pgsoff, ext->len);
- dout("%s: [%d] 0x%llx~0x%llx fret %d\n", __func__, i,
- ext->off, ext->len, fret);
+ doutc(cl, "%p %llx.%llx [%d] 0x%llx~0x%llx fret %d\n", inode,
+ ceph_vinop(inode), i, ext->off, ext->len, fret);
if (fret < 0) {
if (ret == 0)
ret = fret;
@@ -618,7 +632,7 @@ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
}
ret = pgsoff + fret;
}
- dout("%s: ret %d\n", __func__, ret);
+ doutc(cl, "ret %d\n", ret);
return ret;
}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 3904333fa6c3..24c08078f5aa 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -81,7 +81,7 @@ static int mdsc_show(struct seq_file *s, void *p)
if (req->r_inode) {
seq_printf(s, " #%llx", ceph_ino(req->r_inode));
} else if (req->r_dentry) {
- path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+ path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
&pathbase, 0);
if (IS_ERR(path))
path = NULL;
@@ -100,7 +100,7 @@ static int mdsc_show(struct seq_file *s, void *p)
}
if (req->r_old_dentry) {
- path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
+ path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &pathlen,
&pathbase, 0);
if (IS_ERR(path))
path = NULL;
@@ -398,7 +398,7 @@ DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
{
- dout("ceph_fs_debugfs_cleanup\n");
+ doutc(fsc->client, "begin\n");
debugfs_remove(fsc->debugfs_bdi);
debugfs_remove(fsc->debugfs_congestion_kb);
debugfs_remove(fsc->debugfs_mdsmap);
@@ -407,13 +407,14 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
debugfs_remove(fsc->debugfs_status);
debugfs_remove(fsc->debugfs_mdsc);
debugfs_remove_recursive(fsc->debugfs_metrics_dir);
+ doutc(fsc->client, "done\n");
}
void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
{
char name[100];
- dout("ceph_fs_debugfs_init\n");
+ doutc(fsc->client, "begin\n");
fsc->debugfs_congestion_kb =
debugfs_create_file("writeback_congestion_kb",
0600,
@@ -469,6 +470,7 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
&metrics_size_fops);
debugfs_create_file("caps", 0400, fsc->debugfs_metrics_dir, fsc,
&metrics_caps_fops);
+ doutc(fsc->client, "done\n");
}
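
Where no inode is at hand, the hunks above derive the same client context from the fs client or MDS client instead. A small sketch of that variant, under the same header assumptions as the previous example:

static void example_mdsc_debug(struct ceph_mds_client *mdsc)
{
	struct ceph_client *cl = mdsc->fsc->client;	/* or fsc->client when an fsc is available */

	doutc(cl, "begin\n");
	/* ... work elided ... */
	doutc(cl, "done\n");
}
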
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 854cbdd66661..91709934c8b1 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -109,7 +109,9 @@ static int fpos_cmp(loff_t l, loff_t r)
* regardless of what dir changes take place on the
* server.
*/
-static int note_last_dentry(struct ceph_dir_file_info *dfi, const char *name,
+static int note_last_dentry(struct ceph_fs_client *fsc,
+ struct ceph_dir_file_info *dfi,
+ const char *name,
int len, unsigned next_offset)
{
char *buf = kmalloc(len+1, GFP_KERNEL);
@@ -120,7 +122,7 @@ static int note_last_dentry(struct ceph_dir_file_info *dfi, const char *name,
memcpy(dfi->last_name, name, len);
dfi->last_name[len] = 0;
dfi->next_offset = next_offset;
- dout("note_last_dentry '%s'\n", dfi->last_name);
+ doutc(fsc->client, "'%s'\n", dfi->last_name);
return 0;
}
@@ -130,6 +132,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
struct ceph_readdir_cache_control *cache_ctl)
{
struct inode *dir = d_inode(parent);
+ struct ceph_client *cl = ceph_inode_to_client(dir);
struct dentry *dentry;
unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
loff_t ptr_pos = idx * sizeof(struct dentry *);
@@ -142,7 +145,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
ceph_readdir_cache_release(cache_ctl);
cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
if (!cache_ctl->page) {
- dout(" page %lu not found\n", ptr_pgoff);
+ doutc(cl, " page %lu not found\n", ptr_pgoff);
return ERR_PTR(-EAGAIN);
}
/* reading/filling the cache are serialized by
@@ -185,13 +188,16 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
struct ceph_dir_file_info *dfi = file->private_data;
struct dentry *parent = file->f_path.dentry;
struct inode *dir = d_inode(parent);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(dir);
+ struct ceph_client *cl = ceph_inode_to_client(dir);
struct dentry *dentry, *last = NULL;
struct ceph_dentry_info *di;
struct ceph_readdir_cache_control cache_ctl = {};
u64 idx = 0;
int err = 0;
- dout("__dcache_readdir %p v%u at %llx\n", dir, (unsigned)shared_gen, ctx->pos);
+ doutc(cl, "%p %llx.%llx v%u at %llx\n", dir, ceph_vinop(dir),
+ (unsigned)shared_gen, ctx->pos);
/* search start position */
if (ctx->pos > 2) {
@@ -221,7 +227,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
dput(dentry);
}
- dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
+ doutc(cl, "%p %llx.%llx cache idx %llu\n", dir,
+ ceph_vinop(dir), idx);
}
@@ -257,8 +264,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
spin_unlock(&dentry->d_lock);
if (emit_dentry) {
- dout(" %llx dentry %p %pd %p\n", di->offset,
- dentry, dentry, d_inode(dentry));
+ doutc(cl, " %llx dentry %p %pd %p\n", di->offset,
+ dentry, dentry, d_inode(dentry));
ctx->pos = di->offset;
if (!dir_emit(ctx, dentry->d_name.name,
dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
@@ -281,7 +288,8 @@ out:
if (last) {
int ret;
di = ceph_dentry(last);
- ret = note_last_dentry(dfi, last->d_name.name, last->d_name.len,
+ ret = note_last_dentry(fsc, dfi, last->d_name.name,
+ last->d_name.len,
fpos_off(di->offset) + 1);
if (ret < 0)
err = ret;
@@ -310,20 +318,23 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
struct ceph_dir_file_info *dfi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_client *cl = fsc->client;
int i;
int err;
unsigned frag = -1;
struct ceph_mds_reply_info_parsed *rinfo;
- dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
+ doutc(cl, "%p %llx.%llx file %p pos %llx\n", inode,
+ ceph_vinop(inode), file, ctx->pos);
if (dfi->file_info.flags & CEPH_F_ATEND)
return 0;
/* always start with . and .. */
if (ctx->pos == 0) {
- dout("readdir off 0 -> '.'\n");
+ doutc(cl, "%p %llx.%llx off 0 -> '.'\n", inode,
+ ceph_vinop(inode));
if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
inode->i_mode >> 12))
return 0;
@@ -337,7 +348,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
ino = ceph_present_inode(dentry->d_parent->d_inode);
spin_unlock(&dentry->d_lock);
- dout("readdir off 1 -> '..'\n");
+ doutc(cl, "%p %llx.%llx off 1 -> '..'\n", inode,
+ ceph_vinop(inode));
if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
return 0;
ctx->pos = 2;
@@ -391,8 +403,8 @@ more:
frag = fpos_frag(ctx->pos);
}
- dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
- ceph_vinop(inode), frag, dfi->last_name);
+ doutc(cl, "fetching %p %llx.%llx frag %x offset '%s'\n",
+ inode, ceph_vinop(inode), frag, dfi->last_name);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -446,12 +458,12 @@ more:
ceph_mdsc_put_request(req);
return err;
}
- dout("readdir got and parsed readdir result=%d on "
- "frag %x, end=%d, complete=%d, hash_order=%d\n",
- err, frag,
- (int)req->r_reply_info.dir_end,
- (int)req->r_reply_info.dir_complete,
- (int)req->r_reply_info.hash_order);
+ doutc(cl, "%p %llx.%llx got and parsed readdir result=%d"
+ "on frag %x, end=%d, complete=%d, hash_order=%d\n",
+ inode, ceph_vinop(inode), err, frag,
+ (int)req->r_reply_info.dir_end,
+ (int)req->r_reply_info.dir_complete,
+ (int)req->r_reply_info.hash_order);
rinfo = &req->r_reply_info;
if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
@@ -481,7 +493,8 @@ more:
dfi->dir_ordered_count = req->r_dir_ordered_cnt;
}
} else {
- dout("readdir !did_prepopulate\n");
+ doutc(cl, "%p %llx.%llx !did_prepopulate\n", inode,
+ ceph_vinop(inode));
/* disable readdir cache */
dfi->readdir_cache_idx = -1;
/* preclude from marking dir complete */
@@ -494,8 +507,8 @@ more:
rinfo->dir_entries + (rinfo->dir_nr-1);
unsigned next_offset = req->r_reply_info.dir_end ?
2 : (fpos_off(rde->offset) + 1);
- err = note_last_dentry(dfi, rde->name, rde->name_len,
- next_offset);
+ err = note_last_dentry(fsc, dfi, rde->name,
+ rde->name_len, next_offset);
if (err) {
ceph_mdsc_put_request(dfi->last_readdir);
dfi->last_readdir = NULL;
@@ -508,9 +521,9 @@ more:
}
rinfo = &dfi->last_readdir->r_reply_info;
- dout("readdir frag %x num %d pos %llx chunk first %llx\n",
- dfi->frag, rinfo->dir_nr, ctx->pos,
- rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
+ doutc(cl, "%p %llx.%llx frag %x num %d pos %llx chunk first %llx\n",
+ inode, ceph_vinop(inode), dfi->frag, rinfo->dir_nr, ctx->pos,
+ rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
i = 0;
/* search start position */
@@ -530,8 +543,9 @@ more:
struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
if (rde->offset < ctx->pos) {
- pr_warn("%s: rde->offset 0x%llx ctx->pos 0x%llx\n",
- __func__, rde->offset, ctx->pos);
+ pr_warn_client(cl,
+ "%p %llx.%llx rde->offset 0x%llx ctx->pos 0x%llx\n",
+ inode, ceph_vinop(inode), rde->offset, ctx->pos);
return -EIO;
}
@@ -539,9 +553,9 @@ more:
return -EIO;
ctx->pos = rde->offset;
- dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
- i, rinfo->dir_nr, ctx->pos,
- rde->name_len, rde->name, &rde->inode.in);
+ doutc(cl, "%p %llx.%llx (%d/%d) -> %llx '%.*s' %p\n", inode,
+ ceph_vinop(inode), i, rinfo->dir_nr, ctx->pos,
+ rde->name_len, rde->name, &rde->inode.in);
if (!dir_emit(ctx, rde->name, rde->name_len,
ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
@@ -552,7 +566,7 @@ more:
* doesn't have enough memory, etc. So for next readdir
* it will continue.
*/
- dout("filldir stopping us...\n");
+ doutc(cl, "filldir stopping us...\n");
return 0;
}
@@ -583,7 +597,8 @@ more:
kfree(dfi->last_name);
dfi->last_name = NULL;
}
- dout("readdir next frag is %x\n", frag);
+ doutc(cl, "%p %llx.%llx next frag is %x\n", inode,
+ ceph_vinop(inode), frag);
goto more;
}
dfi->file_info.flags |= CEPH_F_ATEND;
@@ -598,20 +613,23 @@ more:
spin_lock(&ci->i_ceph_lock);
if (dfi->dir_ordered_count ==
atomic64_read(&ci->i_ordered_count)) {
- dout(" marking %p complete and ordered\n", inode);
+ doutc(cl, " marking %p %llx.%llx complete and ordered\n",
+ inode, ceph_vinop(inode));
/* use i_size to track number of entries in
* readdir cache */
BUG_ON(dfi->readdir_cache_idx < 0);
i_size_write(inode, dfi->readdir_cache_idx *
sizeof(struct dentry*));
} else {
- dout(" marking %p complete\n", inode);
+ doutc(cl, " marking %llx.%llx complete\n",
+ ceph_vinop(inode));
}
__ceph_dir_set_complete(ci, dfi->dir_release_count,
dfi->dir_ordered_count);
spin_unlock(&ci->i_ceph_lock);
}
- dout("readdir %p file %p done.\n", inode, file);
+ doutc(cl, "%p %llx.%llx file %p done.\n", inode, ceph_vinop(inode),
+ file);
return 0;
}
@@ -657,6 +675,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct ceph_dir_file_info *dfi = file->private_data;
struct inode *inode = file->f_mapping->host;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
loff_t retval;
inode_lock(inode);
@@ -676,7 +695,8 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
if (offset >= 0) {
if (need_reset_readdir(dfi, offset)) {
- dout("dir_llseek dropping %p content\n", file);
+ doutc(cl, "%p %llx.%llx dropping %p content\n",
+ inode, ceph_vinop(inode), file);
reset_readdir(dfi);
} else if (is_hash_order(offset) && offset > file->f_pos) {
/* for hash offset, we don't know if a forward seek
@@ -703,8 +723,9 @@ out:
struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
struct dentry *dentry)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb);
struct inode *parent = d_inode(dentry->d_parent); /* we hold i_rwsem */
+ struct ceph_client *cl = ceph_inode_to_client(parent);
/* .snap dir? */
if (ceph_snap(parent) == CEPH_NOSNAP &&
@@ -713,8 +734,9 @@ struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
struct inode *inode = ceph_get_snapdir(parent);
res = d_splice_alias(inode, dentry);
- dout("ENOENT on snapdir %p '%pd', linking to snapdir %p. Spliced dentry %p\n",
- dentry, dentry, inode, res);
+ doutc(cl, "ENOENT on snapdir %p '%pd', linking to "
+ "snapdir %p %llx.%llx. Spliced dentry %p\n",
+ dentry, dentry, inode, ceph_vinop(inode), res);
if (res)
dentry = res;
}
@@ -735,12 +757,15 @@ struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
struct dentry *dentry, int err)
{
+ struct ceph_client *cl = req->r_mdsc->fsc->client;
+
if (err == -ENOENT) {
/* no trace? */
err = 0;
if (!req->r_reply_info.head->is_dentry) {
- dout("ENOENT and no trace, dentry %p inode %p\n",
- dentry, d_inode(dentry));
+ doutc(cl,
+ "ENOENT and no trace, dentry %p inode %llx.%llx\n",
+ dentry, ceph_vinop(d_inode(dentry)));
if (d_really_is_positive(dentry)) {
d_drop(dentry);
err = -ENOENT;
@@ -771,15 +796,16 @@ static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dir->i_sb);
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_request *req;
int op;
int mask;
int err;
- dout("lookup %p dentry %p '%pd'\n",
- dir, dentry, dentry);
+ doutc(cl, "%p %llx.%llx/'%pd' dentry %p\n", dir, ceph_vinop(dir),
+ dentry, dentry);
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
@@ -802,7 +828,8 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
struct ceph_dentry_info *di = ceph_dentry(dentry);
spin_lock(&ci->i_ceph_lock);
- dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags);
+ doutc(cl, " dir %llx.%llx flags are 0x%lx\n",
+ ceph_vinop(dir), ci->i_ceph_flags);
if (strncmp(dentry->d_name.name,
fsc->mount_options->snapdir_name,
dentry->d_name.len) &&
@@ -812,7 +839,8 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
__ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, 1)) {
__ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
spin_unlock(&ci->i_ceph_lock);
- dout(" dir %p complete, -ENOENT\n", dir);
+ doutc(cl, " dir %llx.%llx complete, -ENOENT\n",
+ ceph_vinop(dir));
d_add(dentry, NULL);
di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
return NULL;
@@ -850,7 +878,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
}
dentry = ceph_finish_lookup(req, dentry, err);
ceph_mdsc_put_request(req); /* will dput(dentry) */
- dout("lookup result=%p\n", dentry);
+ doutc(cl, "result=%p\n", dentry);
return dentry;
}
@@ -885,6 +913,7 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t rdev)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
int err;
@@ -901,8 +930,8 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir,
goto out;
}
- dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
- dir, dentry, mode, rdev);
+ doutc(cl, "%p %llx.%llx/'%pd' dentry %p mode 0%ho rdev %d\n",
+ dir, ceph_vinop(dir), dentry, dentry, mode, rdev);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
if (IS_ERR(req)) {
err = PTR_ERR(req);
@@ -924,6 +953,7 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir,
req->r_parent = dir;
ihold(dir);
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ req->r_mnt_idmap = mnt_idmap_get(idmap);
req->r_args.mknod.mode = cpu_to_le32(mode);
req->r_args.mknod.rdev = cpu_to_le32(rdev);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
@@ -993,6 +1023,7 @@ static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *dest)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
umode_t mode = S_IFLNK | 0777;
@@ -1010,7 +1041,8 @@ static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir,
goto out;
}
- dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
+ doutc(cl, "%p %llx.%llx/'%pd' to '%s'\n", dir, ceph_vinop(dir), dentry,
+ dest);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
if (IS_ERR(req)) {
err = PTR_ERR(req);
@@ -1040,6 +1072,7 @@ static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir,
}
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ req->r_mnt_idmap = mnt_idmap_get(idmap);
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
@@ -1064,6 +1097,7 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
int err;
@@ -1076,10 +1110,11 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* mkdir .snap/foo is a MKSNAP */
op = CEPH_MDS_OP_MKSNAP;
- dout("mksnap dir %p snap '%pd' dn %p\n", dir,
- dentry, dentry);
+ doutc(cl, "mksnap %llx.%llx/'%pd' dentry %p\n",
+ ceph_vinop(dir), dentry, dentry);
} else if (ceph_snap(dir) == CEPH_NOSNAP) {
- dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
+ doutc(cl, "mkdir %llx.%llx/'%pd' dentry %p mode 0%ho\n",
+ ceph_vinop(dir), dentry, dentry, mode);
op = CEPH_MDS_OP_MKDIR;
} else {
err = -EROFS;
@@ -1117,6 +1152,8 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
req->r_parent = dir;
ihold(dir);
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ if (op == CEPH_MDS_OP_MKDIR)
+ req->r_mnt_idmap = mnt_idmap_get(idmap);
req->r_args.mkdir.mode = cpu_to_le32(mode);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
CEPH_CAP_XATTR_EXCL;
@@ -1144,6 +1181,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
int err;
@@ -1161,8 +1199,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
if (err)
return err;
- dout("link in dir %p %llx.%llx old_dentry %p:'%pd' dentry %p:'%pd'\n",
- dir, ceph_vinop(dir), old_dentry, old_dentry, dentry, dentry);
+ doutc(cl, "%p %llx.%llx/'%pd' to '%pd'\n", dir, ceph_vinop(dir),
+ old_dentry, dentry);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
if (IS_ERR(req)) {
d_drop(dentry);
@@ -1199,14 +1237,16 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
struct dentry *dentry = req->r_dentry;
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_dentry_info *di = ceph_dentry(dentry);
int result = req->r_err ? req->r_err :
le32_to_cpu(req->r_reply_info.head->result);
if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
- pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
- __func__, dentry, dentry);
+ pr_warn_client(cl,
+ "dentry %p:%pd async unlink bit is not set\n",
+ dentry, dentry);
spin_lock(&fsc->async_unlink_conflict_lock);
hash_del_rcu(&di->hnode);
@@ -1226,7 +1266,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
if (result) {
int pathlen = 0;
u64 base = 0;
- char *path = ceph_mdsc_build_path(dentry, &pathlen,
+ char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen,
&base, 0);
/* mark error on parent + clear complete */
@@ -1240,8 +1280,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
/* mark inode itself for an error (since metadata is bogus) */
mapping_set_error(req->r_old_inode->i_mapping, result);
- pr_warn("async unlink failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
+ pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n",
+ base, IS_ERR(path) ? "<<bad>>" : path, result);
ceph_mdsc_free_path(path, pathlen);
}
out:
@@ -1290,7 +1330,8 @@ static int get_caps_for_async_unlink(struct inode *dir, struct dentry *dentry)
*/
static int ceph_unlink(struct inode *dir, struct dentry *dentry)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dir->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = d_inode(dentry);
struct ceph_mds_request *req;
@@ -1300,11 +1341,12 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* rmdir .snap/foo is RMSNAP */
- dout("rmsnap dir %p '%pd' dn %p\n", dir, dentry, dentry);
+ doutc(cl, "rmsnap %llx.%llx/'%pd' dn\n", ceph_vinop(dir),
+ dentry);
op = CEPH_MDS_OP_RMSNAP;
} else if (ceph_snap(dir) == CEPH_NOSNAP) {
- dout("unlink/rmdir dir %p dn %p inode %p\n",
- dir, dentry, inode);
+ doutc(cl, "unlink/rmdir %llx.%llx/'%pd' inode %llx.%llx\n",
+ ceph_vinop(dir), dentry, ceph_vinop(inode));
op = d_is_dir(dentry) ?
CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
} else
@@ -1327,9 +1369,9 @@ retry:
(req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
struct ceph_dentry_info *di = ceph_dentry(dentry);
- dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir),
- dentry->d_name.len, dentry->d_name.name,
- ceph_cap_string(req->r_dir_caps));
+ doutc(cl, "async unlink on %llx.%llx/'%pd' caps=%s",
+ ceph_vinop(dir), dentry,
+ ceph_cap_string(req->r_dir_caps));
set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
req->r_callback = ceph_async_unlink_cb;
req->r_old_inode = d_inode(dentry);
@@ -1384,6 +1426,7 @@ static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct dentry *new_dentry, unsigned int flags)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
int op = CEPH_MDS_OP_RENAME;
int err;
@@ -1413,8 +1456,9 @@ static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (err)
return err;
- dout("rename dir %p dentry %p to dir %p dentry %p\n",
- old_dir, old_dentry, new_dir, new_dentry);
+ doutc(cl, "%llx.%llx/'%pd' to %llx.%llx/'%pd'\n",
+ ceph_vinop(old_dir), old_dentry, ceph_vinop(new_dir),
+ new_dentry);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1459,9 +1503,10 @@ static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir,
void __ceph_dentry_lease_touch(struct ceph_dentry_info *di)
{
struct dentry *dn = di->dentry;
- struct ceph_mds_client *mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dn->d_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
- dout("dentry_lease_touch %p %p '%pd'\n", di, dn, dn);
+ doutc(cl, "%p %p '%pd'\n", di, dn, dn);
di->flags |= CEPH_DENTRY_LEASE_LIST;
if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
@@ -1469,7 +1514,6 @@ void __ceph_dentry_lease_touch(struct ceph_dentry_info *di)
return;
}
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_list_lock);
list_move_tail(&di->lease_list, &mdsc->dentry_leases);
spin_unlock(&mdsc->dentry_list_lock);
@@ -1493,10 +1537,10 @@ static void __dentry_dir_lease_touch(struct ceph_mds_client* mdsc,
void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di)
{
struct dentry *dn = di->dentry;
- struct ceph_mds_client *mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dn->d_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
- dout("dentry_dir_lease_touch %p %p '%pd' (offset 0x%llx)\n",
- di, dn, dn, di->offset);
+ doutc(cl, "%p %p '%pd' (offset 0x%llx)\n", di, dn, dn, di->offset);
if (!list_empty(&di->lease_list)) {
if (di->flags & CEPH_DENTRY_LEASE_LIST) {
@@ -1516,7 +1560,6 @@ void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di)
return;
}
- mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_list_lock);
__dentry_dir_lease_touch(mdsc, di);
spin_unlock(&mdsc->dentry_list_lock);
@@ -1530,7 +1573,7 @@ static void __dentry_lease_unlist(struct ceph_dentry_info *di)
if (list_empty(&di->lease_list))
return;
- mdsc = ceph_sb_to_client(di->dentry->d_sb)->mdsc;
+ mdsc = ceph_sb_to_fs_client(di->dentry->d_sb)->mdsc;
spin_lock(&mdsc->dentry_list_lock);
list_del_init(&di->lease_list);
spin_unlock(&mdsc->dentry_list_lock);
@@ -1757,6 +1800,8 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags)
{
struct ceph_dentry_info *di;
struct ceph_mds_session *session = NULL;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dentry->d_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
u32 seq = 0;
int valid = 0;
@@ -1789,7 +1834,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags)
CEPH_MDS_LEASE_RENEW, seq);
ceph_put_mds_session(session);
}
- dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
+ doutc(cl, "dentry %p = %d\n", dentry, valid);
return valid;
}
@@ -1832,6 +1877,7 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry,
struct ceph_mds_client *mdsc)
{
struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_client *cl = mdsc->fsc->client;
int valid;
int shared_gen;
@@ -1853,8 +1899,9 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry,
valid = 0;
spin_unlock(&dentry->d_lock);
}
- dout("dir_lease_is_valid dir %p v%u dentry %p = %d\n",
- dir, (unsigned)atomic_read(&ci->i_shared_gen), dentry, valid);
+ doutc(cl, "dir %p %llx.%llx v%u dentry %p '%pd' = %d\n", dir,
+ ceph_vinop(dir), (unsigned)atomic_read(&ci->i_shared_gen),
+ dentry, dentry, valid);
return valid;
}
@@ -1863,10 +1910,11 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry,
*/
static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dentry->d_sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
int valid = 0;
struct dentry *parent;
struct inode *dir, *inode;
- struct ceph_mds_client *mdsc;
valid = fscrypt_d_revalidate(dentry, flags);
if (valid <= 0)
@@ -1884,16 +1932,16 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
inode = d_inode(dentry);
}
- dout("d_revalidate %p '%pd' inode %p offset 0x%llx nokey %d\n", dentry,
- dentry, inode, ceph_dentry(dentry)->offset,
- !!(dentry->d_flags & DCACHE_NOKEY_NAME));
+ doutc(cl, "%p '%pd' inode %p offset 0x%llx nokey %d\n",
+ dentry, dentry, inode, ceph_dentry(dentry)->offset,
+ !!(dentry->d_flags & DCACHE_NOKEY_NAME));
- mdsc = ceph_sb_to_client(dir->i_sb)->mdsc;
+ mdsc = ceph_sb_to_fs_client(dir->i_sb)->mdsc;
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
- dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
- dentry, inode);
+ doutc(cl, "%p '%pd' inode %p is SNAPPED\n", dentry,
+ dentry, inode);
valid = 1;
} else if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
valid = 1;
@@ -1948,14 +1996,14 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
break;
}
ceph_mdsc_put_request(req);
- dout("d_revalidate %p lookup result=%d\n",
- dentry, err);
+ doutc(cl, "%p '%pd', lookup result=%d\n", dentry,
+ dentry, err);
}
} else {
percpu_counter_inc(&mdsc->metric.d_lease_hit);
}
- dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
+ doutc(cl, "%p '%pd' %s\n", dentry, dentry, valid ? "valid" : "invalid");
if (!valid)
ceph_dir_clear_complete(dir);
@@ -1995,9 +2043,9 @@ static int ceph_d_delete(const struct dentry *dentry)
static void ceph_d_release(struct dentry *dentry)
{
struct ceph_dentry_info *di = ceph_dentry(dentry);
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb);
- dout("d_release %p\n", dentry);
+ doutc(fsc->client, "dentry %p '%pd'\n", dentry, dentry);
atomic64_dec(&fsc->mdsc->metric.total_dentries);
@@ -2018,10 +2066,12 @@ static void ceph_d_release(struct dentry *dentry)
*/
static void ceph_d_prune(struct dentry *dentry)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dentry->d_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *dir_ci;
struct ceph_dentry_info *di;
- dout("ceph_d_prune %pd %p\n", dentry, dentry);
+ doutc(cl, "dentry %p '%pd'\n", dentry, dentry);
/* do we have a valid parent? */
if (IS_ROOT(dentry))
@@ -2064,7 +2114,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
int left;
const int bufsize = 1024;
- if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
+ if (!ceph_test_mount_opt(ceph_sb_to_fs_client(inode->i_sb), DIRSTAT))
return -EISDIR;
if (!dfi->dir_info) {
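
Several of the error paths above also build a printable path for the warning text. A hypothetical helper sketching that pattern; ceph_mdsc_build_path, ceph_mdsc_free_path and pr_warn_client are taken from the hunks in this diff, everything else is illustrative:

static void example_report_failure(struct ceph_mds_client *mdsc,
				   struct dentry *dentry, int result)
{
	struct ceph_client *cl = mdsc->fsc->client;
	int pathlen = 0;
	u64 base = 0;
	char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &base, 0);

	/* the callers above print "<<bad>>" when path building failed */
	pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n",
		       base, IS_ERR(path) ? "<<bad>>" : path, result);
	ceph_mdsc_free_path(path, pathlen);
}
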
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 8559990a59a5..726af69d4d62 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -36,6 +36,7 @@ struct ceph_nfs_snapfh {
static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len,
struct inode *parent_inode)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
static const int snap_handle_length =
sizeof(struct ceph_nfs_snapfh) >> 2;
struct ceph_nfs_snapfh *sfh = (void *)rawfh;
@@ -79,13 +80,14 @@ static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len,
*max_len = snap_handle_length;
ret = FILEID_BTRFS_WITH_PARENT;
out:
- dout("encode_snapfh %llx.%llx ret=%d\n", ceph_vinop(inode), ret);
+ doutc(cl, "%p %llx.%llx ret=%d\n", inode, ceph_vinop(inode), ret);
return ret;
}
static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
struct inode *parent_inode)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
static const int handle_length =
sizeof(struct ceph_nfs_fh) >> 2;
static const int connected_handle_length =
@@ -105,15 +107,15 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
if (parent_inode) {
struct ceph_nfs_confh *cfh = (void *)rawfh;
- dout("encode_fh %llx with parent %llx\n",
- ceph_ino(inode), ceph_ino(parent_inode));
+ doutc(cl, "%p %llx.%llx with parent %p %llx.%llx\n", inode,
+ ceph_vinop(inode), parent_inode, ceph_vinop(parent_inode));
cfh->ino = ceph_ino(inode);
cfh->parent_ino = ceph_ino(parent_inode);
*max_len = connected_handle_length;
type = FILEID_INO32_GEN_PARENT;
} else {
struct ceph_nfs_fh *fh = (void *)rawfh;
- dout("encode_fh %llx\n", ceph_ino(inode));
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
fh->ino = ceph_ino(inode);
*max_len = handle_length;
type = FILEID_INO32_GEN;
@@ -123,7 +125,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
static struct inode *__lookup_inode(struct super_block *sb, u64 ino)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb)->mdsc;
struct inode *inode;
struct ceph_vino vino;
int err;
@@ -205,7 +207,8 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb,
struct ceph_nfs_snapfh *sfh,
bool want_parent)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct inode *inode;
struct ceph_vino vino;
@@ -278,11 +281,10 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb,
ceph_mdsc_put_request(req);
if (want_parent) {
- dout("snapfh_to_parent %llx.%llx\n err=%d\n",
- vino.ino, vino.snap, err);
+ doutc(cl, "%llx.%llx\n err=%d\n", vino.ino, vino.snap, err);
} else {
- dout("snapfh_to_dentry %llx.%llx parent %llx hash %x err=%d",
- vino.ino, vino.snap, sfh->parent_ino, sfh->hash, err);
+ doutc(cl, "%llx.%llx parent %llx hash %x err=%d", vino.ino,
+ vino.snap, sfh->parent_ino, sfh->hash, err);
}
if (IS_ERR(inode))
return ERR_CAST(inode);
@@ -297,6 +299,7 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
struct fid *fid,
int fh_len, int fh_type)
{
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_nfs_fh *fh = (void *)fid->raw;
if (fh_type == FILEID_BTRFS_WITH_PARENT) {
@@ -310,14 +313,14 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
if (fh_len < sizeof(*fh) / 4)
return NULL;
- dout("fh_to_dentry %llx\n", fh->ino);
+ doutc(fsc->client, "%llx\n", fh->ino);
return __fh_to_dentry(sb, fh->ino);
}
static struct dentry *__get_parent(struct super_block *sb,
struct dentry *child, u64 ino)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb)->mdsc;
struct ceph_mds_request *req;
struct inode *inode;
int mask;
@@ -363,6 +366,7 @@ static struct dentry *__get_parent(struct super_block *sb,
static struct dentry *ceph_get_parent(struct dentry *child)
{
struct inode *inode = d_inode(child);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct dentry *dn;
if (ceph_snap(inode) != CEPH_NOSNAP) {
@@ -402,8 +406,8 @@ static struct dentry *ceph_get_parent(struct dentry *child)
dn = __get_parent(child->d_sb, child, 0);
}
out:
- dout("get_parent %p ino %llx.%llx err=%ld\n",
- child, ceph_vinop(inode), (long)PTR_ERR_OR_ZERO(dn));
+ doutc(cl, "child %p %p %llx.%llx err=%ld\n", child, inode,
+ ceph_vinop(inode), (long)PTR_ERR_OR_ZERO(dn));
return dn;
}
@@ -414,6 +418,7 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
struct fid *fid,
int fh_len, int fh_type)
{
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_nfs_confh *cfh = (void *)fid->raw;
struct dentry *dentry;
@@ -427,7 +432,7 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
if (fh_len < sizeof(*cfh) / 4)
return NULL;
- dout("fh_to_parent %llx\n", cfh->parent_ino);
+ doutc(fsc->client, "%llx\n", cfh->parent_ino);
dentry = __get_parent(sb, NULL, cfh->ino);
if (unlikely(dentry == ERR_PTR(-ENOENT)))
dentry = __fh_to_dentry(sb, cfh->parent_ino);
@@ -439,7 +444,7 @@ static int __get_snap_name(struct dentry *parent, char *name,
{
struct inode *inode = d_inode(child);
struct inode *dir = d_inode(parent);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_mds_request *req = NULL;
char *last_name = NULL;
unsigned next_offset = 2;
@@ -526,8 +531,8 @@ out:
if (req)
ceph_mdsc_put_request(req);
kfree(last_name);
- dout("get_snap_name %p ino %llx.%llx err=%d\n",
- child, ceph_vinop(inode), err);
+ doutc(fsc->client, "child dentry %p %p %llx.%llx err=%d\n", child,
+ inode, ceph_vinop(inode), err);
return err;
}
@@ -544,7 +549,7 @@ static int ceph_get_name(struct dentry *parent, char *name,
if (ceph_snap(inode) != CEPH_NOSNAP)
return __get_snap_name(parent, name, child);
- mdsc = ceph_inode_to_client(inode)->mdsc;
+ mdsc = ceph_inode_to_fs_client(inode)->mdsc;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME,
USE_ANY_MDS);
if (IS_ERR(req))
@@ -588,9 +593,9 @@ static int ceph_get_name(struct dentry *parent, char *name,
ceph_fname_free_buffer(dir, &oname);
}
out:
- dout("get_name %p ino %llx.%llx err %d %s%s\n",
- child, ceph_vinop(inode), err,
- err ? "" : "name ", err ? "" : name);
+ doutc(mdsc->fsc->client, "child dentry %p %p %llx.%llx err %d %s%s\n",
+ child, inode, ceph_vinop(inode), err, err ? "" : "name ",
+ err ? "" : name);
ceph_mdsc_put_request(req);
return err;
}
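Every dout()/pr_warn()/pr_err() conversion in export.c above, and in file.c below, follows one mechanical recipe: fetch a struct ceph_client (from the inode, superblock, fs_client or mdsc, whichever is already in scope) and pass it as the first argument so the message carries that client's identity instead of being anonymous. A minimal user-space analogue of the macro shape (dout_demo/doutc_demo and the "client%llu" prefix format are illustrative; the real macros live in the ceph debug headers):

#include <stdio.h>

struct client_demo {
	unsigned long long global_id;
};

/* old shape: nothing identifies which mounted client logged the line */
#define dout_demo(fmt, ...) \
	fprintf(stderr, "ceph: " fmt, ##__VA_ARGS__)

/* new shape: the client handle is explicit, so its id prefixes the line
 * (##__VA_ARGS__ is the gcc/clang extension the kernel itself relies on) */
#define doutc_demo(cl, fmt, ...) \
	fprintf(stderr, "ceph: [client%llu] %s: " fmt, \
		(cl)->global_id, __func__, ##__VA_ARGS__)

static void open_file(struct client_demo *cl, const char *name, int flags)
{
	dout_demo("file '%s' flags %d\n", name, flags);	     /* anonymous  */
	doutc_demo(cl, "file '%s' flags %d\n", name, flags); /* attributed */
}

int main(void)
{
	struct client_demo a = { .global_id = 4215 };
	struct client_demo b = { .global_id = 4301 };

	open_file(&a, "foo", 0);	/* ceph: [client4215] open_file: ... */
	open_file(&b, "bar", 0);	/* ceph: [client4301] open_file: ... */
	return 0;
}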
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index b5f8038065d7..3b5aae29e944 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -19,8 +19,9 @@
#include "io.h"
#include "metric.h"
-static __le32 ceph_flags_sys2wire(u32 flags)
+static __le32 ceph_flags_sys2wire(struct ceph_mds_client *mdsc, u32 flags)
{
+ struct ceph_client *cl = mdsc->fsc->client;
u32 wire_flags = 0;
switch (flags & O_ACCMODE) {
@@ -48,7 +49,7 @@ static __le32 ceph_flags_sys2wire(u32 flags)
#undef ceph_sys2wire
if (flags)
- dout("unused open flags: %x\n", flags);
+ doutc(cl, "unused open flags: %x\n", flags);
return cpu_to_le32(wire_flags);
}
@@ -189,7 +190,7 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode)
if (IS_ERR(req))
goto out;
req->r_fmode = ceph_flags_to_mode(flags);
- req->r_args.open.flags = ceph_flags_sys2wire(flags);
+ req->r_args.open.flags = ceph_flags_sys2wire(mdsc, flags);
req->r_args.open.mode = cpu_to_le32(create_mode);
out:
return req;
@@ -200,12 +201,13 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mount_options *opt =
- ceph_inode_to_client(&ci->netfs.inode)->mount_options;
+ ceph_inode_to_fs_client(&ci->netfs.inode)->mount_options;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_file_info *fi;
int ret;
- dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
- inode->i_mode, isdir ? "dir" : "regular");
+ doutc(cl, "%p %llx.%llx %p 0%o (%s)\n", inode, ceph_vinop(inode),
+ file, inode->i_mode, isdir ? "dir" : "regular");
BUG_ON(inode->i_fop->release != ceph_release);
if (isdir) {
@@ -234,7 +236,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
spin_lock_init(&fi->rw_contexts_lock);
INIT_LIST_HEAD(&fi->rw_contexts);
- fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
+ fi->filp_gen = READ_ONCE(ceph_inode_to_fs_client(inode)->filp_gen);
if ((file->f_mode & FMODE_WRITE) && ceph_has_inline_data(ci)) {
ret = ceph_uninline_data(file);
@@ -259,6 +261,7 @@ error:
*/
static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int ret = 0;
switch (inode->i_mode & S_IFMT) {
@@ -271,13 +274,13 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
break;
case S_IFLNK:
- dout("init_file %p %p 0%o (symlink)\n", inode, file,
- inode->i_mode);
+ doutc(cl, "%p %llx.%llx %p 0%o (symlink)\n", inode,
+ ceph_vinop(inode), file, inode->i_mode);
break;
default:
- dout("init_file %p %p 0%o (special)\n", inode, file,
- inode->i_mode);
+ doutc(cl, "%p %llx.%llx %p 0%o (special)\n", inode,
+ ceph_vinop(inode), file, inode->i_mode);
/*
* we need to drop the open ref now, since we don't
* have .release set to ceph_release.
@@ -296,6 +299,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
int ceph_renew_caps(struct inode *inode, int fmode)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req;
int err, flags, wanted;
@@ -307,8 +311,9 @@ int ceph_renew_caps(struct inode *inode, int fmode)
(!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
int issued = __ceph_caps_issued(ci, NULL);
spin_unlock(&ci->i_ceph_lock);
- dout("renew caps %p want %s issued %s updating mds_wanted\n",
- inode, ceph_cap_string(wanted), ceph_cap_string(issued));
+ doutc(cl, "%p %llx.%llx want %s issued %s updating mds_wanted\n",
+ inode, ceph_vinop(inode), ceph_cap_string(wanted),
+ ceph_cap_string(issued));
ceph_check_caps(ci, 0);
return 0;
}
@@ -339,7 +344,8 @@ int ceph_renew_caps(struct inode *inode, int fmode)
err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
out:
- dout("renew caps %p open result=%d\n", inode, err);
+ doutc(cl, "%p %llx.%llx open result=%d\n", inode, ceph_vinop(inode),
+ err);
return err < 0 ? err : 0;
}
@@ -352,7 +358,8 @@ out:
int ceph_open(struct inode *inode, struct file *file)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct ceph_file_info *fi = file->private_data;
@@ -360,7 +367,7 @@ int ceph_open(struct inode *inode, struct file *file)
int flags, fmode, wanted;
if (fi) {
- dout("open file %p is already opened\n", file);
+ doutc(cl, "file %p is already opened\n", file);
return 0;
}
@@ -374,8 +381,8 @@ int ceph_open(struct inode *inode, struct file *file)
return err;
}
- dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
- ceph_vinop(inode), file, flags, file->f_flags);
+ doutc(cl, "%p %llx.%llx file %p flags %d (%d)\n", inode,
+ ceph_vinop(inode), file, flags, file->f_flags);
fmode = ceph_flags_to_mode(flags);
wanted = ceph_caps_for_mode(fmode);
@@ -399,9 +406,9 @@ int ceph_open(struct inode *inode, struct file *file)
int mds_wanted = __ceph_caps_mds_wanted(ci, true);
int issued = __ceph_caps_issued(ci, NULL);
- dout("open %p fmode %d want %s issued %s using existing\n",
- inode, fmode, ceph_cap_string(wanted),
- ceph_cap_string(issued));
+ doutc(cl, "open %p fmode %d want %s issued %s using existing\n",
+ inode, fmode, ceph_cap_string(wanted),
+ ceph_cap_string(issued));
__ceph_touch_fmode(ci, mdsc, fmode);
spin_unlock(&ci->i_ceph_lock);
@@ -421,7 +428,7 @@ int ceph_open(struct inode *inode, struct file *file)
spin_unlock(&ci->i_ceph_lock);
- dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
+ doutc(cl, "open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
req = prepare_open_request(inode->i_sb, flags, 0);
if (IS_ERR(req)) {
err = PTR_ERR(req);
@@ -435,7 +442,7 @@ int ceph_open(struct inode *inode, struct file *file)
if (!err)
err = ceph_init_file(inode, file, req->r_fmode);
ceph_mdsc_put_request(req);
- dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
+ doutc(cl, "open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
out:
return err;
}
@@ -515,6 +522,7 @@ no_async:
static void restore_deleg_ino(struct inode *dir, u64 ino)
{
+ struct ceph_client *cl = ceph_inode_to_client(dir);
struct ceph_inode_info *ci = ceph_inode(dir);
struct ceph_mds_session *s = NULL;
@@ -525,7 +533,8 @@ static void restore_deleg_ino(struct inode *dir, u64 ino)
if (s) {
int err = ceph_restore_deleg_ino(s, ino);
if (err)
- pr_warn("ceph: unable to restore delegated ino 0x%llx to session: %d\n",
+ pr_warn_client(cl,
+ "unable to restore delegated ino 0x%llx to session: %d\n",
ino, err);
ceph_put_mds_session(s);
}
@@ -557,6 +566,7 @@ static void wake_async_create_waiters(struct inode *inode,
static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct dentry *dentry = req->r_dentry;
struct inode *dinode = d_inode(dentry);
struct inode *tinode = req->r_target_inode;
@@ -574,10 +584,11 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
if (result) {
int pathlen = 0;
u64 base = 0;
- char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+ char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
&base, 0);
- pr_warn("async create failure path=(%llx)%s result=%d!\n",
+ pr_warn_client(cl,
+ "async create failure path=(%llx)%s result=%d!\n",
base, IS_ERR(path) ? "<<bad>>" : path, result);
ceph_mdsc_free_path(path, pathlen);
@@ -596,14 +607,15 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
u64 ino = ceph_vino(tinode).ino;
if (req->r_deleg_ino != ino)
- pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
- __func__, req->r_err, req->r_deleg_ino, ino);
+ pr_warn_client(cl,
+ "inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
+ req->r_err, req->r_deleg_ino, ino);
mapping_set_error(tinode->i_mapping, result);
wake_async_create_waiters(tinode, req->r_session);
} else if (!result) {
- pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__,
- req->r_deleg_ino);
+ pr_warn_client(cl, "no req->r_target_inode for 0x%llx\n",
+ req->r_deleg_ino);
}
out:
ceph_mdsc_release_dir_caps(req);
@@ -625,6 +637,7 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
struct timespec64 now;
struct ceph_string *pool_ns;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_vino vino = { .ino = req->r_deleg_ino,
.snap = CEPH_NOSNAP };
@@ -655,7 +668,9 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
in.truncate_seq = cpu_to_le32(1);
in.truncate_size = cpu_to_le64(-1ULL);
in.xattr_version = cpu_to_le64(1);
- in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid()));
+ in.uid = cpu_to_le32(from_kuid(&init_user_ns,
+ mapped_fsuid(req->r_mnt_idmap,
+ &init_user_ns)));
if (dir->i_mode & S_ISGID) {
in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid));
@@ -663,7 +678,9 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
if (S_ISDIR(mode))
mode |= S_ISGID;
} else {
- in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid()));
+ in.gid = cpu_to_le32(from_kgid(&init_user_ns,
+ mapped_fsgid(req->r_mnt_idmap,
+ &init_user_ns)));
}
in.mode = cpu_to_le32((u32)mode);
@@ -683,7 +700,7 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
req->r_fmode, NULL);
up_read(&mdsc->snap_rwsem);
if (ret) {
- dout("%s failed to fill inode: %d\n", __func__, ret);
+ doutc(cl, "failed to fill inode: %d\n", ret);
ceph_dir_clear_complete(dir);
if (!d_unhashed(dentry))
d_drop(dentry);
@@ -691,8 +708,8 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
} else {
struct dentry *dn;
- dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
- vino.ino, ceph_ino(dir), dentry->d_name.name);
+ doutc(cl, "d_adding new inode 0x%llx to 0x%llx/%s\n",
+ vino.ino, ceph_ino(dir), dentry->d_name.name);
ceph_dir_clear_ordered(dir);
ceph_init_inode_acls(inode, as_ctx);
if (inode->i_state & I_NEW) {
@@ -730,7 +747,9 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned flags, umode_t mode)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct mnt_idmap *idmap = file_mnt_idmap(file);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dir->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct inode *new_inode = NULL;
@@ -740,9 +759,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
int mask;
int err;
- dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
- dir, dentry, dentry,
- d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
+ doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n",
+ dir, ceph_vinop(dir), dentry, dentry,
+ d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
if (dentry->d_name.len > NAME_MAX)
return -ENAMETOOLONG;
@@ -788,6 +807,8 @@ retry:
mask |= CEPH_CAP_XATTR_SHARED;
req->r_args.open.mask = cpu_to_le32(mask);
req->r_parent = dir;
+ if (req->r_op == CEPH_MDS_OP_CREATE)
+ req->r_mnt_idmap = mnt_idmap_get(idmap);
ihold(dir);
if (IS_ENCRYPTED(dir)) {
set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
@@ -880,17 +901,18 @@ retry:
goto out_req;
if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) {
/* make vfs retry on splice, ENOENT, or symlink */
- dout("atomic_open finish_no_open on dn %p\n", dn);
+ doutc(cl, "finish_no_open on dn %p\n", dn);
err = finish_no_open(file, dn);
} else {
if (IS_ENCRYPTED(dir) &&
!fscrypt_has_permitted_context(dir, d_inode(dentry))) {
- pr_warn("Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n",
+ pr_warn_client(cl,
+ "Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n",
ceph_vinop(dir), ceph_vinop(d_inode(dentry)));
goto out_req;
}
- dout("atomic_open finish_open on dn %p\n", dn);
+ doutc(cl, "finish_open on dn %p\n", dn);
if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
struct inode *newino = d_inode(dentry);
@@ -905,17 +927,19 @@ out_req:
iput(new_inode);
out_ctx:
ceph_release_acl_sec_ctx(&as_ctx);
- dout("atomic_open result=%d\n", err);
+ doutc(cl, "result=%d\n", err);
return err;
}
int ceph_release(struct inode *inode, struct file *file)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
if (S_ISDIR(inode->i_mode)) {
struct ceph_dir_file_info *dfi = file->private_data;
- dout("release inode %p dir file %p\n", inode, file);
+ doutc(cl, "%p %llx.%llx dir file %p\n", inode,
+ ceph_vinop(inode), file);
WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
ceph_put_fmode(ci, dfi->file_info.fmode, 1);
@@ -927,7 +951,8 @@ int ceph_release(struct inode *inode, struct file *file)
kmem_cache_free(ceph_dir_file_cachep, dfi);
} else {
struct ceph_file_info *fi = file->private_data;
- dout("release inode %p regular file %p\n", inode, file);
+ doutc(cl, "%p %llx.%llx regular file %p\n", inode,
+ ceph_vinop(inode), file);
WARN_ON(!list_empty(&fi->rw_contexts));
ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
@@ -962,7 +987,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
u64 *last_objver)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_client *osdc = &fsc->client->osdc;
ssize_t ret;
u64 off = *ki_pos;
@@ -971,7 +997,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
u64 objver = 0;
- dout("sync_read on inode %p %llx~%llx\n", inode, *ki_pos, len);
+ doutc(cl, "on inode %p %llx.%llx %llx~%llx\n", inode,
+ ceph_vinop(inode), *ki_pos, len);
if (ceph_inode_is_shutdown(inode))
return -EIO;
@@ -1005,8 +1032,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
/* determine new offset/length if encrypted */
ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len);
- dout("sync_read orig %llu~%llu reading %llu~%llu",
- off, len, read_off, read_len);
+ doutc(cl, "orig %llu~%llu reading %llu~%llu", off, len,
+ read_off, read_len);
req = ceph_osdc_new_request(osdc, &ci->i_layout,
ci->i_vino, read_off, &read_len, 0, 1,
@@ -1059,8 +1086,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
objver = req->r_version;
i_size = i_size_read(inode);
- dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
- off, len, ret, i_size, (more ? " MORE" : ""));
+ doutc(cl, "%llu~%llu got %zd i_size %llu%s\n", off, len,
+ ret, i_size, (more ? " MORE" : ""));
/* Fix it to go to end of extent map */
if (sparse && ret >= 0)
@@ -1101,8 +1128,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
int zlen = min(len - ret, i_size - off - ret);
int zoff = page_off + ret;
- dout("sync_read zero gap %llu~%llu\n",
- off + ret, off + ret + zlen);
+ doutc(cl, "zero gap %llu~%llu\n", off + ret,
+ off + ret + zlen);
ceph_zero_page_vector_range(zoff, zlen, pages);
ret += zlen;
}
@@ -1151,7 +1178,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
if (last_objver)
*last_objver = objver;
}
- dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
+ doutc(cl, "result %zd retry_op %d\n", ret, *retry_op);
return ret;
}
@@ -1160,9 +1187,11 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
- dout("sync_read on file %p %llx~%zx %s\n", file, iocb->ki_pos,
- iov_iter_count(to), (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+ doutc(cl, "on file %p %llx~%zx %s\n", file, iocb->ki_pos,
+ iov_iter_count(to),
+ (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
return __ceph_sync_read(inode, &iocb->ki_pos, to, retry_op, NULL);
}
@@ -1190,6 +1219,7 @@ static void ceph_aio_retry_work(struct work_struct *work);
static void ceph_aio_complete(struct inode *inode,
struct ceph_aio_request *aio_req)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
int ret;
@@ -1203,7 +1233,7 @@ static void ceph_aio_complete(struct inode *inode,
if (!ret)
ret = aio_req->total_len;
- dout("ceph_aio_complete %p rc %d\n", inode, ret);
+ doutc(cl, "%p %llx.%llx rc %d\n", inode, ceph_vinop(inode), ret);
if (ret >= 0 && aio_req->write) {
int dirty;
@@ -1242,11 +1272,13 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
unsigned int len = osd_data->bvec_pos.iter.bi_size;
bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
BUG_ON(!osd_data->num_bvecs);
- dout("ceph_aio_complete_req %p rc %d bytes %u\n", inode, rc, len);
+ doutc(cl, "req %p inode %p %llx.%llx, rc %d bytes %u\n", req,
+ inode, ceph_vinop(inode), rc, len);
if (rc == -EOLDSNAPC) {
struct ceph_aio_work *aio_work;
@@ -1256,7 +1288,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
if (aio_work) {
INIT_WORK(&aio_work->work, ceph_aio_retry_work);
aio_work->req = req;
- queue_work(ceph_inode_to_client(inode)->inode_wq,
+ queue_work(ceph_inode_to_fs_client(inode)->inode_wq,
&aio_work->work);
return;
}
@@ -1386,7 +1418,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_client_metric *metric = &fsc->mdsc->metric;
struct ceph_vino vino;
struct ceph_osd_request *req;
@@ -1405,9 +1438,9 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
- dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n",
- (write ? "write" : "read"), file, pos, (unsigned)count,
- snapc, snapc ? snapc->seq : 0);
+ doutc(cl, "sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n",
+ (write ? "write" : "read"), file, pos, (unsigned)count,
+ snapc, snapc ? snapc->seq : 0);
if (write) {
int ret2;
@@ -1418,7 +1451,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
pos >> PAGE_SHIFT,
(pos + count - 1) >> PAGE_SHIFT);
if (ret2 < 0)
- dout("invalidate_inode_pages2_range returned %d\n", ret2);
+ doutc(cl, "invalidate_inode_pages2_range returned %d\n",
+ ret2);
flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
} else {
@@ -1610,7 +1644,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_osd_request *req;
struct page **pages;
@@ -1625,8 +1660,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
- dout("sync_write on file %p %lld~%u snapc %p seq %lld\n",
- file, pos, (unsigned)count, snapc, snapc->seq);
+ doutc(cl, "on file %p %lld~%u snapc %p seq %lld\n", file, pos,
+ (unsigned)count, snapc, snapc->seq);
ret = filemap_write_and_wait_range(inode->i_mapping,
pos, pos + count - 1);
@@ -1670,9 +1705,9 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
last = (pos + len) != (write_pos + write_len);
rmw = first || last;
- dout("sync_write ino %llx %lld~%llu adjusted %lld~%llu -- %srmw\n",
- ci->i_vino.ino, pos, len, write_pos, write_len,
- rmw ? "" : "no ");
+ doutc(cl, "ino %llx %lld~%llu adjusted %lld~%llu -- %srmw\n",
+ ci->i_vino.ino, pos, len, write_pos, write_len,
+ rmw ? "" : "no ");
/*
* The data is emplaced into the page as it would be if it were
@@ -1881,7 +1916,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
left -= ret;
}
if (ret < 0) {
- dout("sync_write write failed with %d\n", ret);
+ doutc(cl, "write failed with %d\n", ret);
ceph_release_page_vector(pages, num_pages);
break;
}
@@ -1891,7 +1926,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
write_pos, write_len,
GFP_KERNEL);
if (ret < 0) {
- dout("encryption failed with %d\n", ret);
+ doutc(cl, "encryption failed with %d\n", ret);
ceph_release_page_vector(pages, num_pages);
break;
}
@@ -1910,7 +1945,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
break;
}
- dout("sync_write write op %lld~%llu\n", write_pos, write_len);
+ doutc(cl, "write op %lld~%llu\n", write_pos, write_len);
osd_req_op_extent_osd_data_pages(req, rmw ? 1 : 0, pages, write_len,
offset_in_page(write_pos), false,
true);
@@ -1941,7 +1976,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
req->r_end_latency, len, ret);
ceph_osdc_put_request(req);
if (ret != 0) {
- dout("sync_write osd write returned %d\n", ret);
+ doutc(cl, "osd write returned %d\n", ret);
/* Version changed! Must re-do the rmw cycle */
if ((assert_ver && (ret == -ERANGE || ret == -EOVERFLOW)) ||
(!assert_ver && ret == -EEXIST)) {
@@ -1971,13 +2006,13 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
pos >> PAGE_SHIFT,
(pos + len - 1) >> PAGE_SHIFT);
if (ret < 0) {
- dout("invalidate_inode_pages2_range returned %d\n",
- ret);
+ doutc(cl, "invalidate_inode_pages2_range returned %d\n",
+ ret);
ret = 0;
}
pos += len;
written += len;
- dout("sync_write written %d\n", written);
+ doutc(cl, "written %d\n", written);
if (pos > i_size_read(inode)) {
check_caps = ceph_inode_set_size(inode, pos);
if (check_caps)
@@ -1991,7 +2026,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
ret = written;
iocb->ki_pos = pos;
}
- dout("sync_write returning %d\n", ret);
+ doutc(cl, "returning %d\n", ret);
return ret;
}
@@ -2010,13 +2045,14 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
ssize_t ret;
int want = 0, got = 0;
int retry_op = 0, read = 0;
again:
- dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
- inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
+ doutc(cl, "%llu~%u trying to get caps on %p %llx.%llx\n",
+ iocb->ki_pos, (unsigned)len, inode, ceph_vinop(inode));
if (ceph_inode_is_shutdown(inode))
return -ESTALE;
@@ -2044,9 +2080,9 @@ again:
(iocb->ki_flags & IOCB_DIRECT) ||
(fi->flags & CEPH_F_SYNC)) {
- dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n",
- inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
- ceph_cap_string(got));
+ doutc(cl, "sync %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
+ ceph_cap_string(got));
if (!ceph_has_inline_data(ci)) {
if (!retry_op &&
@@ -2064,16 +2100,16 @@ again:
}
} else {
CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
- dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
- inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
- ceph_cap_string(got));
+ doutc(cl, "async %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
+ ceph_cap_string(got));
ceph_add_rw_context(fi, &rw_ctx);
ret = generic_file_read_iter(iocb, to);
ceph_del_rw_context(fi, &rw_ctx);
}
- dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
- inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
+ doutc(cl, "%p %llx.%llx dropping cap refs on %s = %d\n",
+ inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
ceph_put_cap_refs(ci, got);
if (direct_lock)
@@ -2133,8 +2169,8 @@ again:
/* hit EOF or hole? */
if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
ret < len) {
- dout("sync_read hit hole, ppos %lld < size %lld"
- ", reading more\n", iocb->ki_pos, i_size);
+ doutc(cl, "hit hole, ppos %lld < size %lld, reading more\n",
+ iocb->ki_pos, i_size);
read += ret;
len -= ret;
@@ -2228,7 +2264,8 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_cap_flush *prealloc_cf;
ssize_t count, written = 0;
@@ -2296,8 +2333,9 @@ retry_snap:
if (err)
goto out;
- dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
- inode, ceph_vinop(inode), pos, count, i_size_read(inode));
+ doutc(cl, "%p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
+ inode, ceph_vinop(inode), pos, count,
+ i_size_read(inode));
if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
want |= CEPH_CAP_FILE_BUFFER;
if (fi->fmode & CEPH_FILE_MODE_LAZY)
@@ -2313,8 +2351,8 @@ retry_snap:
inode_inc_iversion_raw(inode);
- dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
- inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
+ doutc(cl, "%p %llx.%llx %llu~%zd got cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) ||
@@ -2374,14 +2412,14 @@ retry_snap:
ceph_check_caps(ci, CHECK_CAPS_FLUSH);
}
- dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)count,
- ceph_cap_string(got));
+ doutc(cl, "%p %llx.%llx %llu~%u dropping cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, (unsigned)count,
+ ceph_cap_string(got));
ceph_put_cap_refs(ci, got);
if (written == -EOLDSNAPC) {
- dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n",
- inode, ceph_vinop(inode), pos, (unsigned)count);
+ doutc(cl, "%p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n",
+ inode, ceph_vinop(inode), pos, (unsigned)count);
goto retry_snap;
}
@@ -2462,7 +2500,7 @@ static int ceph_zero_partial_object(struct inode *inode,
loff_t offset, loff_t *length)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_osd_request *req;
int ret = 0;
loff_t zero = 0;
@@ -2489,7 +2527,7 @@ static int ceph_zero_partial_object(struct inode *inode,
goto out;
}
- req->r_mtime = inode->i_mtime;
+ req->r_mtime = inode_get_mtime(inode);
ceph_osdc_start_request(&fsc->client->osdc, req);
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
if (ret == -ENOENT)
@@ -2553,14 +2591,15 @@ static long ceph_fallocate(struct file *file, int mode,
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap_flush *prealloc_cf;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int want, got = 0;
int dirty;
int ret = 0;
loff_t endoff = 0;
loff_t size;
- dout("%s %p %llx.%llx mode %x, offset %llu length %llu\n", __func__,
- inode, ceph_vinop(inode), mode, offset, length);
+ doutc(cl, "%p %llx.%llx mode %x, offset %llu length %llu\n",
+ inode, ceph_vinop(inode), mode, offset, length);
if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
@@ -2689,6 +2728,7 @@ static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got,
static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
loff_t src_off, loff_t dst_off, size_t len)
{
+ struct ceph_client *cl = ceph_inode_to_client(src_inode);
loff_t size, endoff;
size = i_size_read(src_inode);
@@ -2699,8 +2739,8 @@ static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
* inode.
*/
if (src_off + len > size) {
- dout("Copy beyond EOF (%llu + %zu > %llu)\n",
- src_off, len, size);
+ doutc(cl, "Copy beyond EOF (%llu + %zu > %llu)\n", src_off,
+ len, size);
return -EOPNOTSUPP;
}
size = i_size_read(dst_inode);
@@ -2776,6 +2816,7 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off
u64 src_objnum, src_objoff, dst_objnum, dst_objoff;
u32 src_objlen, dst_objlen;
u32 object_size = src_ci->i_layout.object_size;
+ struct ceph_client *cl = fsc->client;
int ret;
src_oloc.pool = src_ci->i_layout.pool_id;
@@ -2817,9 +2858,10 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off
if (ret) {
if (ret == -EOPNOTSUPP) {
fsc->have_copy_from2 = false;
- pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
+ pr_notice_client(cl,
+ "OSDs don't support copy-from2; disabling copy offload\n");
}
- dout("ceph_osdc_copy_from returned %d\n", ret);
+ doutc(cl, "returned %d\n", ret);
if (!bytes)
bytes = ret;
goto out;
@@ -2845,7 +2887,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
struct ceph_inode_info *src_ci = ceph_inode(src_inode);
struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
struct ceph_cap_flush *prealloc_cf;
- struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode);
+ struct ceph_fs_client *src_fsc = ceph_inode_to_fs_client(src_inode);
+ struct ceph_client *cl = src_fsc->client;
loff_t size;
ssize_t ret = -EIO, bytes;
u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
@@ -2853,7 +2896,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
int src_got = 0, dst_got = 0, err, dirty;
if (src_inode->i_sb != dst_inode->i_sb) {
- struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode);
+ struct ceph_fs_client *dst_fsc = ceph_inode_to_fs_client(dst_inode);
if (ceph_fsid_compare(&src_fsc->client->fsid,
&dst_fsc->client->fsid)) {
@@ -2888,7 +2931,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
(src_ci->i_layout.stripe_count != 1) ||
(dst_ci->i_layout.stripe_count != 1) ||
(src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) {
- dout("Invalid src/dst files layout\n");
+ doutc(cl, "Invalid src/dst files layout\n");
return -EOPNOTSUPP;
}
@@ -2906,12 +2949,12 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
/* Start by sync'ing the source and destination files */
ret = file_write_and_wait_range(src_file, src_off, (src_off + len));
if (ret < 0) {
- dout("failed to write src file (%zd)\n", ret);
+ doutc(cl, "failed to write src file (%zd)\n", ret);
goto out;
}
ret = file_write_and_wait_range(dst_file, dst_off, (dst_off + len));
if (ret < 0) {
- dout("failed to write dst file (%zd)\n", ret);
+ doutc(cl, "failed to write dst file (%zd)\n", ret);
goto out;
}
@@ -2923,7 +2966,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
err = get_rd_wr_caps(src_file, &src_got,
dst_file, (dst_off + len), &dst_got);
if (err < 0) {
- dout("get_rd_wr_caps returned %d\n", err);
+ doutc(cl, "get_rd_wr_caps returned %d\n", err);
ret = -EOPNOTSUPP;
goto out;
}
@@ -2938,7 +2981,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
dst_off >> PAGE_SHIFT,
(dst_off + len) >> PAGE_SHIFT);
if (ret < 0) {
- dout("Failed to invalidate inode pages (%zd)\n", ret);
+ doutc(cl, "Failed to invalidate inode pages (%zd)\n",
+ ret);
ret = 0; /* XXX */
}
ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
@@ -2959,7 +3003,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
* starting at the src_off
*/
if (src_objoff) {
- dout("Initial partial copy of %u bytes\n", src_objlen);
+ doutc(cl, "Initial partial copy of %u bytes\n", src_objlen);
/*
* we need to temporarily drop all caps as we'll be calling
@@ -2970,7 +3014,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
&dst_off, src_objlen, flags);
/* Abort on short copies or on error */
if (ret < (long)src_objlen) {
- dout("Failed partial copy (%zd)\n", ret);
+ doutc(cl, "Failed partial copy (%zd)\n", ret);
goto out;
}
len -= ret;
@@ -2992,7 +3036,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
ret = bytes;
goto out_caps;
}
- dout("Copied %zu bytes out of %zu\n", bytes, len);
+ doutc(cl, "Copied %zu bytes out of %zu\n", bytes, len);
len -= bytes;
ret += bytes;
@@ -3020,13 +3064,13 @@ out_caps:
* there were errors in remote object copies (len >= object_size).
*/
if (len && (len < src_ci->i_layout.object_size)) {
- dout("Final partial copy of %zu bytes\n", len);
+ doutc(cl, "Final partial copy of %zu bytes\n", len);
bytes = do_splice_direct(src_file, &src_off, dst_file,
&dst_off, len, flags);
if (bytes > 0)
ret += bytes;
else
- dout("Failed partial copy (%zd)\n", bytes);
+ doutc(cl, "Failed partial copy (%zd)\n", bytes);
}
out:
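Besides the logging conversion, the file.c hunks above teach the async-create path about idmapped mounts: ceph_atomic_open() now pins the file's mnt_idmap into req->r_mnt_idmap for CEPH_MDS_OP_CREATE, and ceph_finish_async_create() records mapped_fsuid()/mapped_fsgid() instead of the raw current_fsuid()/current_fsgid(), so the owner written for the locally created inode reflects the mount's ID mapping. Conceptually that mapping is a range translation; a toy version of the idea (the extent layout, helper names and mapping direction here are simplified assumptions, not the kernel's mnt_idmap API):

#include <stdio.h>

struct idmap_extent {
	unsigned int first_caller;	/* first id as the caller sees it */
	unsigned int first_fs;		/* matching id recorded on the fs */
	unsigned int count;
};

/* Translate a caller-side id through the mount's mapping table;
 * ids outside every extent are passed through unchanged here. */
static unsigned int map_id(const struct idmap_extent *map, int n,
			   unsigned int id)
{
	for (int i = 0; i < n; i++) {
		if (id >= map[i].first_caller &&
		    id - map[i].first_caller < map[i].count)
			return map[i].first_fs + (id - map[i].first_caller);
	}
	return id;
}

int main(void)
{
	/* e.g. caller uids 1000..1999 are stored as 100000..100999 */
	const struct idmap_extent map[] = { { 1000, 100000, 1000 } };

	printf("caller uid 1000  -> fs uid %u\n", map_id(map, 1, 1000));
	printf("caller uid 65534 -> fs uid %u\n", map_id(map, 1, 65534));
	return 0;
}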
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index b79100f720b3..0679240f06db 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -129,6 +129,8 @@ void ceph_as_ctx_to_req(struct ceph_mds_request *req,
struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
struct inode *newino)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct inode *inode;
if (ceph_vino_is_reserved(vino))
@@ -145,12 +147,13 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
}
if (!inode) {
- dout("No inode found for %llx.%llx\n", vino.ino, vino.snap);
+ doutc(cl, "no inode found for %llx.%llx\n", vino.ino, vino.snap);
return ERR_PTR(-ENOMEM);
}
- dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
- ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
+ doutc(cl, "on %llx=%llx.%llx got %p new %d\n",
+ ceph_present_inode(inode), ceph_vinop(inode), inode,
+ !!(inode->i_state & I_NEW));
return inode;
}
@@ -159,6 +162,7 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
*/
struct inode *ceph_get_snapdir(struct inode *parent)
{
+ struct ceph_client *cl = ceph_inode_to_client(parent);
struct ceph_vino vino = {
.ino = ceph_ino(parent),
.snap = CEPH_SNAPDIR,
@@ -171,23 +175,23 @@ struct inode *ceph_get_snapdir(struct inode *parent)
return inode;
if (!S_ISDIR(parent->i_mode)) {
- pr_warn_once("bad snapdir parent type (mode=0%o)\n",
- parent->i_mode);
+ pr_warn_once_client(cl, "bad snapdir parent type (mode=0%o)\n",
+ parent->i_mode);
goto err;
}
if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) {
- pr_warn_once("bad snapdir inode type (mode=0%o)\n",
- inode->i_mode);
+ pr_warn_once_client(cl, "bad snapdir inode type (mode=0%o)\n",
+ inode->i_mode);
goto err;
}
inode->i_mode = parent->i_mode;
inode->i_uid = parent->i_uid;
inode->i_gid = parent->i_gid;
- inode->i_mtime = parent->i_mtime;
+ inode_set_mtime_to_ts(inode, inode_get_mtime(parent));
inode_set_ctime_to_ts(inode, inode_get_ctime(parent));
- inode->i_atime = parent->i_atime;
+ inode_set_atime_to_ts(inode, inode_get_atime(parent));
ci->i_rbytes = 0;
ci->i_btime = ceph_inode(parent)->i_btime;
@@ -203,7 +207,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
inode->i_flags |= S_ENCRYPTED;
ci->fscrypt_auth_len = pci->fscrypt_auth_len;
} else {
- dout("Failed to alloc snapdir fscrypt_auth\n");
+ doutc(cl, "Failed to alloc snapdir fscrypt_auth\n");
ret = -ENOMEM;
goto err;
}
@@ -249,6 +253,8 @@ const struct inode_operations ceph_file_iops = {
static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
u32 f)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct rb_node **p;
struct rb_node *parent = NULL;
struct ceph_inode_frag *frag;
@@ -279,8 +285,7 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
rb_link_node(&frag->node, parent, p);
rb_insert_color(&frag->node, &ci->i_fragtree);
- dout("get_or_create_frag added %llx.%llx frag %x\n",
- ceph_vinop(&ci->netfs.inode), f);
+ doutc(cl, "added %p %llx.%llx frag %x\n", inode, ceph_vinop(inode), f);
return frag;
}
@@ -313,6 +318,7 @@ struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
struct ceph_inode_frag *pfrag, int *found)
{
+ struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
u32 t = ceph_frag_make(0, 0);
struct ceph_inode_frag *frag;
unsigned nway, i;
@@ -336,8 +342,8 @@ static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
/* choose child */
nway = 1 << frag->split_by;
- dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
- frag->split_by, nway);
+ doutc(cl, "frag(%x) %x splits by %d (%d ways)\n", v, t,
+ frag->split_by, nway);
for (i = 0; i < nway; i++) {
n = ceph_frag_make_child(t, frag->split_by, i);
if (ceph_frag_contains_value(n, v)) {
@@ -347,7 +353,7 @@ static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
}
BUG_ON(i == nway);
}
- dout("choose_frag(%x) = %x\n", v, t);
+ doutc(cl, "frag(%x) = %x\n", v, t);
return t;
}
@@ -371,6 +377,7 @@ static int ceph_fill_dirfrag(struct inode *inode,
struct ceph_mds_reply_dirfrag *dirinfo)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_frag *frag;
u32 id = le32_to_cpu(dirinfo->frag);
int mds = le32_to_cpu(dirinfo->auth);
@@ -395,14 +402,14 @@ static int ceph_fill_dirfrag(struct inode *inode,
goto out;
if (frag->split_by == 0) {
/* tree leaf, remove */
- dout("fill_dirfrag removed %llx.%llx frag %x"
- " (no ref)\n", ceph_vinop(inode), id);
+ doutc(cl, "removed %p %llx.%llx frag %x (no ref)\n",
+ inode, ceph_vinop(inode), id);
rb_erase(&frag->node, &ci->i_fragtree);
kfree(frag);
} else {
/* tree branch, keep and clear */
- dout("fill_dirfrag cleared %llx.%llx frag %x"
- " referral\n", ceph_vinop(inode), id);
+ doutc(cl, "cleared %p %llx.%llx frag %x referral\n",
+ inode, ceph_vinop(inode), id);
frag->mds = -1;
frag->ndist = 0;
}
@@ -415,8 +422,9 @@ static int ceph_fill_dirfrag(struct inode *inode,
if (IS_ERR(frag)) {
/* this is not the end of the world; we can continue
with bad/inaccurate delegation info */
- pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
- ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
+ pr_err_client(cl, "ENOMEM on mds ref %p %llx.%llx fg %x\n",
+ inode, ceph_vinop(inode),
+ le32_to_cpu(dirinfo->frag));
err = -ENOMEM;
goto out;
}
@@ -425,8 +433,8 @@ static int ceph_fill_dirfrag(struct inode *inode,
frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
for (i = 0; i < frag->ndist; i++)
frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
- dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
- ceph_vinop(inode), frag->frag, frag->ndist);
+ doutc(cl, "%p %llx.%llx frag %x ndist=%d\n", inode,
+ ceph_vinop(inode), frag->frag, frag->ndist);
out:
mutex_unlock(&ci->i_fragtree_mutex);
@@ -454,6 +462,7 @@ static int ceph_fill_fragtree(struct inode *inode,
struct ceph_frag_tree_head *fragtree,
struct ceph_mds_reply_dirfrag *dirinfo)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_inode_frag *frag, *prev_frag = NULL;
struct rb_node *rb_node;
@@ -489,15 +498,15 @@ static int ceph_fill_fragtree(struct inode *inode,
frag_tree_split_cmp, NULL);
}
- dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
rb_node = rb_first(&ci->i_fragtree);
for (i = 0; i < nsplits; i++) {
id = le32_to_cpu(fragtree->splits[i].frag);
split_by = le32_to_cpu(fragtree->splits[i].by);
if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
- pr_err("fill_fragtree %llx.%llx invalid split %d/%u, "
- "frag %x split by %d\n", ceph_vinop(inode),
- i, nsplits, id, split_by);
+ pr_err_client(cl, "%p %llx.%llx invalid split %d/%u, "
+ "frag %x split by %d\n", inode,
+ ceph_vinop(inode), i, nsplits, id, split_by);
continue;
}
frag = NULL;
@@ -529,7 +538,7 @@ static int ceph_fill_fragtree(struct inode *inode,
if (frag->split_by == 0)
ci->i_fragtree_nsplits++;
frag->split_by = split_by;
- dout(" frag %x split by %d\n", frag->frag, frag->split_by);
+ doutc(cl, " frag %x split by %d\n", frag->frag, frag->split_by);
prev_frag = frag;
}
while (rb_node) {
@@ -554,6 +563,7 @@ out_unlock:
*/
struct inode *ceph_alloc_inode(struct super_block *sb)
{
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
struct ceph_inode_info *ci;
int i;
@@ -561,7 +571,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
if (!ci)
return NULL;
- dout("alloc_inode %p\n", &ci->netfs.inode);
+ doutc(fsc->client, "%p\n", &ci->netfs.inode);
/* Set parameters for the netfs library */
netfs_inode_init(&ci->netfs, &ceph_netfs_ops);
@@ -675,10 +685,11 @@ void ceph_evict_inode(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_frag *frag;
struct rb_node *n;
- dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+ doutc(cl, "%p ino %llx.%llx\n", inode, ceph_vinop(inode));
percpu_counter_dec(&mdsc->metric.total_inodes);
@@ -701,8 +712,8 @@ void ceph_evict_inode(struct inode *inode)
*/
if (ci->i_snap_realm) {
if (ceph_snap(inode) == CEPH_NOSNAP) {
- dout(" dropping residual ref to snap realm %p\n",
- ci->i_snap_realm);
+ doutc(cl, " dropping residual ref to snap realm %p\n",
+ ci->i_snap_realm);
ceph_change_snap_realm(inode, NULL);
} else {
ceph_put_snapid_map(mdsc, ci->i_snapid_map);
@@ -743,15 +754,16 @@ static inline blkcnt_t calc_inode_blocks(u64 size)
int ceph_fill_file_size(struct inode *inode, int issued,
u32 truncate_seq, u64 truncate_size, u64 size)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
int queue_trunc = 0;
loff_t isize = i_size_read(inode);
if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
(truncate_seq == ci->i_truncate_seq && size > isize)) {
- dout("size %lld -> %llu\n", isize, size);
+ doutc(cl, "size %lld -> %llu\n", isize, size);
if (size > 0 && S_ISDIR(inode->i_mode)) {
- pr_err("fill_file_size non-zero size for directory\n");
+ pr_err_client(cl, "non-zero size for directory\n");
size = 0;
}
i_size_write(inode, size);
@@ -764,8 +776,8 @@ int ceph_fill_file_size(struct inode *inode, int issued,
ceph_fscache_update(inode);
ci->i_reported_size = size;
if (truncate_seq != ci->i_truncate_seq) {
- dout("%s truncate_seq %u -> %u\n", __func__,
- ci->i_truncate_seq, truncate_seq);
+ doutc(cl, "truncate_seq %u -> %u\n",
+ ci->i_truncate_seq, truncate_seq);
ci->i_truncate_seq = truncate_seq;
/* the MDS should have revoked these caps */
@@ -794,14 +806,15 @@ int ceph_fill_file_size(struct inode *inode, int issued,
* anyway.
*/
if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0) {
- dout("%s truncate_size %lld -> %llu, encrypted %d\n", __func__,
- ci->i_truncate_size, truncate_size, !!IS_ENCRYPTED(inode));
+ doutc(cl, "truncate_size %lld -> %llu, encrypted %d\n",
+ ci->i_truncate_size, truncate_size,
+ !!IS_ENCRYPTED(inode));
ci->i_truncate_size = truncate_size;
if (IS_ENCRYPTED(inode)) {
- dout("%s truncate_pagecache_size %lld -> %llu\n",
- __func__, ci->i_truncate_pagecache_size, size);
+ doutc(cl, "truncate_pagecache_size %lld -> %llu\n",
+ ci->i_truncate_pagecache_size, size);
ci->i_truncate_pagecache_size = size;
} else {
ci->i_truncate_pagecache_size = truncate_size;
@@ -814,6 +827,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
u64 time_warp_seq, struct timespec64 *ctime,
struct timespec64 *mtime, struct timespec64 *atime)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct timespec64 ictime = inode_get_ctime(inode);
int warn = 0;
@@ -825,7 +839,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
CEPH_CAP_XATTR_EXCL)) {
if (ci->i_version == 0 ||
timespec64_compare(ctime, &ictime) > 0) {
- dout("ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n",
+ doutc(cl, "ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n",
ictime.tv_sec, ictime.tv_nsec,
ctime->tv_sec, ctime->tv_nsec);
inode_set_ctime_to_ts(inode, *ctime);
@@ -833,30 +847,32 @@ void ceph_fill_file_time(struct inode *inode, int issued,
if (ci->i_version == 0 ||
ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
/* the MDS did a utimes() */
- dout("mtime %lld.%09ld -> %lld.%09ld "
- "tw %d -> %d\n",
- inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
+ doutc(cl, "mtime %lld.%09ld -> %lld.%09ld tw %d -> %d\n",
+ inode_get_mtime_sec(inode),
+ inode_get_mtime_nsec(inode),
mtime->tv_sec, mtime->tv_nsec,
ci->i_time_warp_seq, (int)time_warp_seq);
- inode->i_mtime = *mtime;
- inode->i_atime = *atime;
+ inode_set_mtime_to_ts(inode, *mtime);
+ inode_set_atime_to_ts(inode, *atime);
ci->i_time_warp_seq = time_warp_seq;
} else if (time_warp_seq == ci->i_time_warp_seq) {
+ struct timespec64 ts;
+
/* nobody did utimes(); take the max */
- if (timespec64_compare(mtime, &inode->i_mtime) > 0) {
- dout("mtime %lld.%09ld -> %lld.%09ld inc\n",
- inode->i_mtime.tv_sec,
- inode->i_mtime.tv_nsec,
+ ts = inode_get_mtime(inode);
+ if (timespec64_compare(mtime, &ts) > 0) {
+ doutc(cl, "mtime %lld.%09ld -> %lld.%09ld inc\n",
+ ts.tv_sec, ts.tv_nsec,
mtime->tv_sec, mtime->tv_nsec);
- inode->i_mtime = *mtime;
+ inode_set_mtime_to_ts(inode, *mtime);
}
- if (timespec64_compare(atime, &inode->i_atime) > 0) {
- dout("atime %lld.%09ld -> %lld.%09ld inc\n",
- inode->i_atime.tv_sec,
- inode->i_atime.tv_nsec,
+ ts = inode_get_atime(inode);
+ if (timespec64_compare(atime, &ts) > 0) {
+ doutc(cl, "atime %lld.%09ld -> %lld.%09ld inc\n",
+ ts.tv_sec, ts.tv_nsec,
atime->tv_sec, atime->tv_nsec);
- inode->i_atime = *atime;
+ inode_set_atime_to_ts(inode, *atime);
}
} else if (issued & CEPH_CAP_FILE_EXCL) {
/* we did a utimes(); ignore mds values */
@@ -867,21 +883,24 @@ void ceph_fill_file_time(struct inode *inode, int issued,
/* we have no write|excl caps; whatever the MDS says is true */
if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
inode_set_ctime_to_ts(inode, *ctime);
- inode->i_mtime = *mtime;
- inode->i_atime = *atime;
+ inode_set_mtime_to_ts(inode, *mtime);
+ inode_set_atime_to_ts(inode, *atime);
ci->i_time_warp_seq = time_warp_seq;
} else {
warn = 1;
}
}
if (warn) /* time_warp_seq shouldn't go backwards */
- dout("%p mds time_warp_seq %llu < %u\n",
- inode, time_warp_seq, ci->i_time_warp_seq);
+ doutc(cl, "%p mds time_warp_seq %llu < %u\n", inode,
+ time_warp_seq, ci->i_time_warp_seq);
}
#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
-static int decode_encrypted_symlink(const char *encsym, int enclen, u8 **decsym)
+static int decode_encrypted_symlink(struct ceph_mds_client *mdsc,
+ const char *encsym,
+ int enclen, u8 **decsym)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int declen;
u8 *sym;
@@ -891,8 +910,9 @@ static int decode_encrypted_symlink(const char *encsym, int enclen, u8 **decsym)
declen = ceph_base64_decode(encsym, enclen, sym);
if (declen < 0) {
- pr_err("%s: can't decode symlink (%d). Content: %.*s\n",
- __func__, declen, enclen, encsym);
+ pr_err_client(cl,
+ "can't decode symlink (%d). Content: %.*s\n",
+ declen, enclen, encsym);
kfree(sym);
return -EIO;
}
@@ -901,7 +921,9 @@ static int decode_encrypted_symlink(const char *encsym, int enclen, u8 **decsym)
return declen;
}
#else
-static int decode_encrypted_symlink(const char *encsym, int symlen, u8 **decsym)
+static int decode_encrypted_symlink(struct ceph_mds_client *mdsc,
+ const char *encsym,
+ int symlen, u8 **decsym)
{
return -EOPNOTSUPP;
}
@@ -918,6 +940,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
struct ceph_cap_reservation *caps_reservation)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_reply_inode *info = iinfo->in;
struct ceph_inode_info *ci = ceph_inode(inode);
int issued, new_issued, info_caps;
@@ -936,25 +959,26 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
lockdep_assert_held(&mdsc->snap_rwsem);
- dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__,
- inode, ceph_vinop(inode), le64_to_cpu(info->version),
- ci->i_version);
+ doutc(cl, "%p ino %llx.%llx v %llu had %llu\n", inode, ceph_vinop(inode),
+ le64_to_cpu(info->version), ci->i_version);
/* Once I_NEW is cleared, we can't change type or dev numbers */
if (inode->i_state & I_NEW) {
inode->i_mode = mode;
} else {
if (inode_wrong_type(inode, mode)) {
- pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
- ceph_vinop(inode), inode->i_mode, mode);
+ pr_warn_once_client(cl,
+ "inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
+ ceph_vinop(inode), inode->i_mode, mode);
return -ESTALE;
}
if ((S_ISCHR(mode) || S_ISBLK(mode)) && inode->i_rdev != rdev) {
- pr_warn_once("dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n",
- ceph_vinop(inode), MAJOR(inode->i_rdev),
- MINOR(inode->i_rdev), MAJOR(rdev),
- MINOR(rdev));
+ pr_warn_once_client(cl,
+ "dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n",
+ ceph_vinop(inode), MAJOR(inode->i_rdev),
+ MINOR(inode->i_rdev), MAJOR(rdev),
+ MINOR(rdev));
return -ESTALE;
}
}
@@ -976,8 +1000,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
if (iinfo->xattr_len > 4) {
xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
if (!xattr_blob)
- pr_err("%s ENOMEM xattr blob %d bytes\n", __func__,
- iinfo->xattr_len);
+ pr_err_client(cl, "ENOMEM xattr blob %d bytes\n",
+ iinfo->xattr_len);
}
if (iinfo->pool_ns_len > 0)
@@ -1031,9 +1055,10 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
inode->i_mode = mode;
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
- dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
- from_kuid(&init_user_ns, inode->i_uid),
- from_kgid(&init_user_ns, inode->i_gid));
+ doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode,
+ ceph_vinop(inode), inode->i_mode,
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kgid(&init_user_ns, inode->i_gid));
ceph_decode_timespec64(&ci->i_btime, &iinfo->btime);
ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime);
}
@@ -1089,7 +1114,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
if (size == round_up(fsize, CEPH_FSCRYPT_BLOCK_SIZE)) {
size = fsize;
} else {
- pr_warn("fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n",
+ pr_warn_client(cl,
+ "fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n",
info->size, size);
}
}
@@ -1101,8 +1127,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
/* only update max_size on auth cap */
if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
ci->i_max_size != le64_to_cpu(info->max_size)) {
- dout("max_size %lld -> %llu\n", ci->i_max_size,
- le64_to_cpu(info->max_size));
+ doutc(cl, "max_size %lld -> %llu\n",
+ ci->i_max_size, le64_to_cpu(info->max_size));
ci->i_max_size = le64_to_cpu(info->max_size);
}
}
@@ -1165,15 +1191,17 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
if (IS_ENCRYPTED(inode)) {
if (symlen != i_size_read(inode))
- pr_err("%s %llx.%llx BAD symlink size %lld\n",
- __func__, ceph_vinop(inode),
+ pr_err_client(cl,
+ "%p %llx.%llx BAD symlink size %lld\n",
+ inode, ceph_vinop(inode),
i_size_read(inode));
- err = decode_encrypted_symlink(iinfo->symlink,
+ err = decode_encrypted_symlink(mdsc, iinfo->symlink,
symlen, (u8 **)&sym);
if (err < 0) {
- pr_err("%s decoding encrypted symlink failed: %d\n",
- __func__, err);
+ pr_err_client(cl,
+ "decoding encrypted symlink failed: %d\n",
+ err);
goto out;
}
symlen = err;
@@ -1181,8 +1209,9 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
inode->i_blocks = calc_inode_blocks(symlen);
} else {
if (symlen != i_size_read(inode)) {
- pr_err("%s %llx.%llx BAD symlink size %lld\n",
- __func__, ceph_vinop(inode),
+ pr_err_client(cl,
+ "%p %llx.%llx BAD symlink size %lld\n",
+ inode, ceph_vinop(inode),
i_size_read(inode));
i_size_write(inode, symlen);
inode->i_blocks = calc_inode_blocks(symlen);
@@ -1217,8 +1246,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
inode->i_fop = &ceph_dir_fops;
break;
default:
- pr_err("%s %llx.%llx BAD mode 0%o\n", __func__,
- ceph_vinop(inode), inode->i_mode);
+ pr_err_client(cl, "%p %llx.%llx BAD mode 0%o\n", inode,
+ ceph_vinop(inode), inode->i_mode);
}
/* were we issued a capability? */
@@ -1239,7 +1268,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
(info_caps & CEPH_CAP_FILE_SHARED) &&
(issued & CEPH_CAP_FILE_EXCL) == 0 &&
!__ceph_dir_is_complete(ci)) {
- dout(" marking %p complete (empty)\n", inode);
+ doutc(cl, " marking %p complete (empty)\n",
+ inode);
i_size_write(inode, 0);
__ceph_dir_set_complete(ci,
atomic64_read(&ci->i_release_count),
@@ -1248,8 +1278,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
wake = true;
} else {
- dout(" %p got snap_caps %s\n", inode,
- ceph_cap_string(info_caps));
+ doutc(cl, " %p got snap_caps %s\n", inode,
+ ceph_cap_string(info_caps));
ci->i_snap_caps |= info_caps;
}
}
@@ -1265,8 +1295,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
if (cap_fmode >= 0) {
if (!info_caps)
- pr_warn("mds issued no caps on %llx.%llx\n",
- ceph_vinop(inode));
+ pr_warn_client(cl, "mds issued no caps on %llx.%llx\n",
+ ceph_vinop(inode));
__ceph_touch_fmode(ci, mdsc, cap_fmode);
}
@@ -1312,14 +1342,14 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
unsigned long from_time,
struct ceph_mds_session **old_lease_session)
{
+ struct ceph_client *cl = ceph_inode_to_client(dir);
struct ceph_dentry_info *di = ceph_dentry(dentry);
unsigned mask = le16_to_cpu(lease->mask);
long unsigned duration = le32_to_cpu(lease->duration_ms);
long unsigned ttl = from_time + (duration * HZ) / 1000;
long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
- dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
- dentry, duration, ttl);
+ doutc(cl, "%p duration %lu ms ttl %lu\n", dentry, duration, ttl);
/* only track leases on regular dentries */
if (ceph_snap(dir) != CEPH_NOSNAP)
@@ -1420,6 +1450,7 @@ out_unlock:
*/
static int splice_dentry(struct dentry **pdn, struct inode *in)
{
+ struct ceph_client *cl = ceph_inode_to_client(in);
struct dentry *dn = *pdn;
struct dentry *realdn;
@@ -1451,23 +1482,21 @@ static int splice_dentry(struct dentry **pdn, struct inode *in)
d_drop(dn);
realdn = d_splice_alias(in, dn);
if (IS_ERR(realdn)) {
- pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
- PTR_ERR(realdn), dn, in, ceph_vinop(in));
+ pr_err_client(cl, "error %ld %p inode %p ino %llx.%llx\n",
+ PTR_ERR(realdn), dn, in, ceph_vinop(in));
return PTR_ERR(realdn);
}
if (realdn) {
- dout("dn %p (%d) spliced with %p (%d) "
- "inode %p ino %llx.%llx\n",
- dn, d_count(dn),
- realdn, d_count(realdn),
- d_inode(realdn), ceph_vinop(d_inode(realdn)));
+ doutc(cl, "dn %p (%d) spliced with %p (%d) inode %p ino %llx.%llx\n",
+ dn, d_count(dn), realdn, d_count(realdn),
+ d_inode(realdn), ceph_vinop(d_inode(realdn)));
dput(dn);
*pdn = realdn;
} else {
BUG_ON(!ceph_dentry(dn));
- dout("dn %p attached to %p ino %llx.%llx\n",
- dn, d_inode(dn), ceph_vinop(d_inode(dn)));
+ doutc(cl, "dn %p attached to %p ino %llx.%llx\n", dn,
+ d_inode(dn), ceph_vinop(d_inode(dn)));
}
return 0;
}
@@ -1489,14 +1518,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
struct inode *in = NULL;
struct ceph_vino tvino, dvino;
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
+ struct ceph_client *cl = fsc->client;
int err = 0;
- dout("fill_trace %p is_dentry %d is_target %d\n", req,
- rinfo->head->is_dentry, rinfo->head->is_target);
+ doutc(cl, "%p is_dentry %d is_target %d\n", req,
+ rinfo->head->is_dentry, rinfo->head->is_target);
if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
- dout("fill_trace reply is empty!\n");
+ doutc(cl, "reply is empty!\n");
if (rinfo->head->result == 0 && req->r_parent)
ceph_invalidate_dir_request(req);
return 0;
@@ -1553,13 +1583,13 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
retry_lookup:
dn = d_lookup(parent, &dname);
- dout("d_lookup on parent=%p name=%.*s got %p\n",
- parent, dname.len, dname.name, dn);
+ doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n",
+ parent, dname.len, dname.name, dn);
if (!dn) {
dn = d_alloc(parent, &dname);
- dout("d_alloc %p '%.*s' = %p\n", parent,
- dname.len, dname.name, dn);
+ doutc(cl, "d_alloc %p '%.*s' = %p\n", parent,
+ dname.len, dname.name, dn);
if (!dn) {
dput(parent);
ceph_fname_free_buffer(dir, &oname);
@@ -1575,8 +1605,8 @@ retry_lookup:
} else if (d_really_is_positive(dn) &&
(ceph_ino(d_inode(dn)) != tvino.ino ||
ceph_snap(d_inode(dn)) != tvino.snap)) {
- dout(" dn %p points to wrong inode %p\n",
- dn, d_inode(dn));
+ doutc(cl, " dn %p points to wrong inode %p\n",
+ dn, d_inode(dn));
ceph_dir_clear_ordered(dir);
d_delete(dn);
dput(dn);
@@ -1601,8 +1631,8 @@ retry_lookup:
rinfo->head->result == 0) ? req->r_fmode : -1,
&req->r_caps_reservation);
if (err < 0) {
- pr_err("ceph_fill_inode badness %p %llx.%llx\n",
- in, ceph_vinop(in));
+ pr_err_client(cl, "badness %p %llx.%llx\n", in,
+ ceph_vinop(in));
req->r_target_inode = NULL;
if (in->i_state & I_NEW)
discard_new_inode(in);
@@ -1652,36 +1682,32 @@ retry_lookup:
have_lease = have_dir_cap ||
le32_to_cpu(rinfo->dlease->duration_ms);
if (!have_lease)
- dout("fill_trace no dentry lease or dir cap\n");
+ doutc(cl, "no dentry lease or dir cap\n");
/* rename? */
if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
struct inode *olddir = req->r_old_dentry_dir;
BUG_ON(!olddir);
- dout(" src %p '%pd' dst %p '%pd'\n",
- req->r_old_dentry,
- req->r_old_dentry,
- dn, dn);
- dout("fill_trace doing d_move %p -> %p\n",
- req->r_old_dentry, dn);
+ doutc(cl, " src %p '%pd' dst %p '%pd'\n",
+ req->r_old_dentry, req->r_old_dentry, dn, dn);
+ doutc(cl, "doing d_move %p -> %p\n", req->r_old_dentry, dn);
/* d_move screws up sibling dentries' offsets */
ceph_dir_clear_ordered(dir);
ceph_dir_clear_ordered(olddir);
d_move(req->r_old_dentry, dn);
- dout(" src %p '%pd' dst %p '%pd'\n",
- req->r_old_dentry,
- req->r_old_dentry,
- dn, dn);
+ doutc(cl, " src %p '%pd' dst %p '%pd'\n",
+ req->r_old_dentry, req->r_old_dentry, dn, dn);
/* ensure target dentry is invalidated, despite
rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(dn);
- dout("dn %p gets new offset %lld\n", req->r_old_dentry,
- ceph_dentry(req->r_old_dentry)->offset);
+ doutc(cl, "dn %p gets new offset %lld\n",
+ req->r_old_dentry,
+ ceph_dentry(req->r_old_dentry)->offset);
/* swap r_dentry and r_old_dentry in case that
* splice_dentry() gets called later. This is safe
@@ -1693,9 +1719,9 @@ retry_lookup:
/* null dentry? */
if (!rinfo->head->is_target) {
- dout("fill_trace null dentry\n");
+ doutc(cl, "null dentry\n");
if (d_really_is_positive(dn)) {
- dout("d_delete %p\n", dn);
+ doutc(cl, "d_delete %p\n", dn);
ceph_dir_clear_ordered(dir);
d_delete(dn);
} else if (have_lease) {
@@ -1719,9 +1745,9 @@ retry_lookup:
goto done;
dn = req->r_dentry; /* may have spliced */
} else if (d_really_is_positive(dn) && d_inode(dn) != in) {
- dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
- dn, d_inode(dn), ceph_vinop(d_inode(dn)),
- ceph_vinop(in));
+ doutc(cl, " %p links to %p %llx.%llx, not %llx.%llx\n",
+ dn, d_inode(dn), ceph_vinop(d_inode(dn)),
+ ceph_vinop(in));
d_invalidate(dn);
have_lease = false;
}
@@ -1731,7 +1757,7 @@ retry_lookup:
rinfo->dlease, session,
req->r_request_started);
}
- dout(" final dn %p\n", dn);
+ doutc(cl, " final dn %p\n", dn);
} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
req->r_op == CEPH_MDS_OP_MKSNAP) &&
test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
@@ -1742,7 +1768,8 @@ retry_lookup:
BUG_ON(!dir);
BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
BUG_ON(!req->r_dentry);
- dout(" linking snapped dir %p to dn %p\n", in, req->r_dentry);
+ doutc(cl, " linking snapped dir %p to dn %p\n", in,
+ req->r_dentry);
ceph_dir_clear_ordered(dir);
ihold(in);
err = splice_dentry(&req->r_dentry, in);
@@ -1764,7 +1791,7 @@ retry_lookup:
&dvino, ptvino);
}
done:
- dout("fill_trace done err=%d\n", err);
+ doutc(cl, "done err=%d\n", err);
return err;
}
@@ -1775,6 +1802,7 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
struct ceph_mds_session *session)
{
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
int i, err = 0;
for (i = 0; i < rinfo->dir_nr; i++) {
@@ -1789,14 +1817,14 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
in = ceph_get_inode(req->r_dentry->d_sb, vino, NULL);
if (IS_ERR(in)) {
err = PTR_ERR(in);
- dout("new_inode badness got %d\n", err);
+ doutc(cl, "badness got %d\n", err);
continue;
}
rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
-1, &req->r_caps_reservation);
if (rc < 0) {
- pr_err("ceph_fill_inode badness on %p got %d\n",
- in, rc);
+ pr_err_client(cl, "inode badness on %p got %d\n", in,
+ rc);
err = rc;
if (in->i_state & I_NEW) {
ihold(in);
@@ -1825,6 +1853,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
struct ceph_readdir_cache_control *ctl,
struct ceph_mds_request *req)
{
+ struct ceph_client *cl = ceph_inode_to_client(dir);
struct ceph_inode_info *ci = ceph_inode(dir);
unsigned nsize = PAGE_SIZE / sizeof(struct dentry*);
unsigned idx = ctl->index % nsize;
@@ -1850,11 +1879,11 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) {
- dout("readdir cache dn %p idx %d\n", dn, ctl->index);
+ doutc(cl, "dn %p idx %d\n", dn, ctl->index);
ctl->dentries[idx] = dn;
ctl->index++;
} else {
- dout("disable readdir cache\n");
+ doutc(cl, "disable readdir cache\n");
ctl->index = -1;
}
return 0;
@@ -1867,6 +1896,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct inode *inode = d_inode(parent);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
struct qstr dname;
struct dentry *dn;
struct inode *in;
@@ -1894,19 +1924,18 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
if (rinfo->dir_dir &&
le32_to_cpu(rinfo->dir_dir->frag) != frag) {
- dout("readdir_prepopulate got new frag %x -> %x\n",
- frag, le32_to_cpu(rinfo->dir_dir->frag));
+ doutc(cl, "got new frag %x -> %x\n", frag,
+ le32_to_cpu(rinfo->dir_dir->frag));
frag = le32_to_cpu(rinfo->dir_dir->frag);
if (!rinfo->hash_order)
req->r_readdir_offset = 2;
}
if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
- dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
- rinfo->dir_nr, parent);
+ doutc(cl, "%d items under SNAPDIR dn %p\n",
+ rinfo->dir_nr, parent);
} else {
- dout("readdir_prepopulate %d items under dn %p\n",
- rinfo->dir_nr, parent);
+ doutc(cl, "%d items under dn %p\n", rinfo->dir_nr, parent);
if (rinfo->dir_dir)
ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
@@ -1950,15 +1979,15 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
retry_lookup:
dn = d_lookup(parent, &dname);
- dout("d_lookup on parent=%p name=%.*s got %p\n",
- parent, dname.len, dname.name, dn);
+ doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n",
+ parent, dname.len, dname.name, dn);
if (!dn) {
dn = d_alloc(parent, &dname);
- dout("d_alloc %p '%.*s' = %p\n", parent,
- dname.len, dname.name, dn);
+ doutc(cl, "d_alloc %p '%.*s' = %p\n", parent,
+ dname.len, dname.name, dn);
if (!dn) {
- dout("d_alloc badness\n");
+ doutc(cl, "d_alloc badness\n");
err = -ENOMEM;
goto out;
}
@@ -1971,8 +2000,8 @@ retry_lookup:
(ceph_ino(d_inode(dn)) != tvino.ino ||
ceph_snap(d_inode(dn)) != tvino.snap)) {
struct ceph_dentry_info *di = ceph_dentry(dn);
- dout(" dn %p points to wrong inode %p\n",
- dn, d_inode(dn));
+ doutc(cl, " dn %p points to wrong inode %p\n",
+ dn, d_inode(dn));
spin_lock(&dn->d_lock);
if (di->offset > 0 &&
@@ -1994,7 +2023,7 @@ retry_lookup:
} else {
in = ceph_get_inode(parent->d_sb, tvino, NULL);
if (IS_ERR(in)) {
- dout("new_inode badness\n");
+ doutc(cl, "new_inode badness\n");
d_drop(dn);
dput(dn);
err = PTR_ERR(in);
@@ -2005,7 +2034,8 @@ retry_lookup:
ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
-1, &req->r_caps_reservation);
if (ret < 0) {
- pr_err("ceph_fill_inode badness on %p\n", in);
+ pr_err_client(cl, "badness on %p %llx.%llx\n", in,
+ ceph_vinop(in));
if (d_really_is_negative(dn)) {
if (in->i_state & I_NEW) {
ihold(in);
@@ -2022,8 +2052,8 @@ retry_lookup:
if (d_really_is_negative(dn)) {
if (ceph_security_xattr_deadlock(in)) {
- dout(" skip splicing dn %p to inode %p"
- " (security xattr deadlock)\n", dn, in);
+ doutc(cl, " skip splicing dn %p to inode %p"
+ " (security xattr deadlock)\n", dn, in);
iput(in);
skipped++;
goto next_item;
@@ -2055,17 +2085,18 @@ out:
req->r_readdir_cache_idx = cache_ctl.index;
}
ceph_readdir_cache_release(&cache_ctl);
- dout("readdir_prepopulate done\n");
+ doutc(cl, "done\n");
return err;
}
bool ceph_inode_set_size(struct inode *inode, loff_t size)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
bool ret;
spin_lock(&ci->i_ceph_lock);
- dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
+ doutc(cl, "set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
i_size_write(inode, size);
ceph_fscache_update(inode);
inode->i_blocks = calc_inode_blocks(size);
@@ -2079,22 +2110,25 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size)
void ceph_queue_inode_work(struct inode *inode, int work_bit)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
struct ceph_inode_info *ci = ceph_inode(inode);
set_bit(work_bit, &ci->i_work_mask);
ihold(inode);
if (queue_work(fsc->inode_wq, &ci->i_work)) {
- dout("queue_inode_work %p, mask=%lx\n", inode, ci->i_work_mask);
+ doutc(cl, "%p %llx.%llx mask=%lx\n", inode,
+ ceph_vinop(inode), ci->i_work_mask);
} else {
- dout("queue_inode_work %p already queued, mask=%lx\n",
- inode, ci->i_work_mask);
+ doutc(cl, "%p %llx.%llx already queued, mask=%lx\n",
+ inode, ceph_vinop(inode), ci->i_work_mask);
iput(inode);
}
}
static void ceph_do_invalidate_pages(struct inode *inode)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u32 orig_gen;
int check = 0;
@@ -2104,8 +2138,9 @@ static void ceph_do_invalidate_pages(struct inode *inode)
mutex_lock(&ci->i_truncate_mutex);
if (ceph_inode_is_shutdown(inode)) {
- pr_warn_ratelimited("%s: inode %llx.%llx is shut down\n",
- __func__, ceph_vinop(inode));
+ pr_warn_ratelimited_client(cl,
+ "%p %llx.%llx is shut down\n", inode,
+ ceph_vinop(inode));
mapping_set_error(inode->i_mapping, -EIO);
truncate_pagecache(inode, 0);
mutex_unlock(&ci->i_truncate_mutex);
@@ -2113,8 +2148,8 @@ static void ceph_do_invalidate_pages(struct inode *inode)
}
spin_lock(&ci->i_ceph_lock);
- dout("invalidate_pages %p gen %d revoking %d\n", inode,
- ci->i_rdcache_gen, ci->i_rdcache_revoking);
+ doutc(cl, "%p %llx.%llx gen %d revoking %d\n", inode,
+ ceph_vinop(inode), ci->i_rdcache_gen, ci->i_rdcache_revoking);
if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
check = 1;
@@ -2126,21 +2161,21 @@ static void ceph_do_invalidate_pages(struct inode *inode)
spin_unlock(&ci->i_ceph_lock);
if (invalidate_inode_pages2(inode->i_mapping) < 0) {
- pr_err("invalidate_inode_pages2 %llx.%llx failed\n",
- ceph_vinop(inode));
+ pr_err_client(cl, "invalidate_inode_pages2 %llx.%llx failed\n",
+ ceph_vinop(inode));
}
spin_lock(&ci->i_ceph_lock);
if (orig_gen == ci->i_rdcache_gen &&
orig_gen == ci->i_rdcache_revoking) {
- dout("invalidate_pages %p gen %d successful\n", inode,
- ci->i_rdcache_gen);
+ doutc(cl, "%p %llx.%llx gen %d successful\n", inode,
+ ceph_vinop(inode), ci->i_rdcache_gen);
ci->i_rdcache_revoking--;
check = 1;
} else {
- dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
- inode, orig_gen, ci->i_rdcache_gen,
- ci->i_rdcache_revoking);
+ doutc(cl, "%p %llx.%llx gen %d raced, now %d revoking %d\n",
+ inode, ceph_vinop(inode), orig_gen, ci->i_rdcache_gen,
+ ci->i_rdcache_revoking);
if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
check = 1;
}
@@ -2157,6 +2192,7 @@ out:
*/
void __ceph_do_pending_vmtruncate(struct inode *inode)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 to;
int wrbuffer_refs, finish = 0;
@@ -2165,7 +2201,8 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
retry:
spin_lock(&ci->i_ceph_lock);
if (ci->i_truncate_pending == 0) {
- dout("%s %p none pending\n", __func__, inode);
+ doutc(cl, "%p %llx.%llx none pending\n", inode,
+ ceph_vinop(inode));
spin_unlock(&ci->i_ceph_lock);
mutex_unlock(&ci->i_truncate_mutex);
return;
@@ -2177,7 +2214,8 @@ retry:
*/
if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
spin_unlock(&ci->i_ceph_lock);
- dout("%s %p flushing snaps first\n", __func__, inode);
+ doutc(cl, "%p %llx.%llx flushing snaps first\n", inode,
+ ceph_vinop(inode));
filemap_write_and_wait_range(&inode->i_data, 0,
inode->i_sb->s_maxbytes);
goto retry;
@@ -2188,8 +2226,8 @@ retry:
to = ci->i_truncate_pagecache_size;
wrbuffer_refs = ci->i_wrbuffer_ref;
- dout("%s %p (%d) to %lld\n", __func__, inode,
- ci->i_truncate_pending, to);
+ doutc(cl, "%p %llx.%llx (%d) to %lld\n", inode, ceph_vinop(inode),
+ ci->i_truncate_pending, to);
spin_unlock(&ci->i_ceph_lock);
ceph_fscache_resize(inode, to);
@@ -2217,9 +2255,10 @@ static void ceph_inode_work(struct work_struct *work)
struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
i_work);
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) {
- dout("writeback %p\n", inode);
+ doutc(cl, "writeback %p %llx.%llx\n", inode, ceph_vinop(inode));
filemap_fdatawrite(&inode->i_data);
}
if (test_and_clear_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask))
@@ -2291,6 +2330,7 @@ static int fill_fscrypt_truncate(struct inode *inode,
struct ceph_mds_request *req,
struct iattr *attr)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE;
loff_t pos, orig_pos = round_down(attr->ia_size,
@@ -2313,9 +2353,9 @@ static int fill_fscrypt_truncate(struct inode *inode,
issued = __ceph_caps_issued(ci, NULL);
- dout("%s size %lld -> %lld got cap refs on %s, issued %s\n", __func__,
- i_size, attr->ia_size, ceph_cap_string(got),
- ceph_cap_string(issued));
+ doutc(cl, "size %lld -> %lld got cap refs on %s, issued %s\n",
+ i_size, attr->ia_size, ceph_cap_string(got),
+ ceph_cap_string(issued));
/* Try to writeback the dirty pagecaches */
if (issued & (CEPH_CAP_FILE_BUFFER)) {
@@ -2370,8 +2410,7 @@ static int fill_fscrypt_truncate(struct inode *inode,
* If the Rados object doesn't exist, it will be set to 0.
*/
if (!objver) {
- dout("%s hit hole, ppos %lld < size %lld\n", __func__,
- pos, i_size);
+ doutc(cl, "hit hole, ppos %lld < size %lld\n", pos, i_size);
header.data_len = cpu_to_le32(8 + 8 + 4);
header.file_offset = 0;
@@ -2380,8 +2419,8 @@ static int fill_fscrypt_truncate(struct inode *inode,
header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE);
header.file_offset = cpu_to_le64(orig_pos);
- dout("%s encrypt block boff/bsize %d/%lu\n", __func__,
- boff, CEPH_FSCRYPT_BLOCK_SIZE);
+ doutc(cl, "encrypt block boff/bsize %d/%lu\n", boff,
+ CEPH_FSCRYPT_BLOCK_SIZE);
/* truncate and zero out the extra contents for the last block */
memset(iov.iov_base + boff, 0, PAGE_SIZE - boff);
@@ -2409,8 +2448,8 @@ static int fill_fscrypt_truncate(struct inode *inode,
}
req->r_pagelist = pagelist;
out:
- dout("%s %p size dropping cap refs on %s\n", __func__,
- inode, ceph_cap_string(got));
+ doutc(cl, "%p %llx.%llx size dropping cap refs on %s\n", inode,
+ ceph_vinop(inode), ceph_cap_string(got));
ceph_put_cap_refs(ci, got);
if (iov.iov_base)
kunmap_local(iov.iov_base);
@@ -2421,13 +2460,14 @@ out:
return ret;
}
-int __ceph_setattr(struct inode *inode, struct iattr *attr,
- struct ceph_iattr *cia)
+int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
+ struct iattr *attr, struct ceph_iattr *cia)
{
struct ceph_inode_info *ci = ceph_inode(inode);
unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req;
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_cap_flush *prealloc_cf;
loff_t isize = i_size_read(inode);
int issued;
@@ -2466,7 +2506,8 @@ retry:
}
}
- dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
+ doutc(cl, "%p %llx.%llx issued %s\n", inode, ceph_vinop(inode),
+ ceph_cap_string(issued));
#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
if (cia && cia->fscrypt_auth) {
u32 len = ceph_fscrypt_auth_len(cia->fscrypt_auth);
@@ -2477,8 +2518,8 @@ retry:
goto out;
}
- dout("setattr %llx:%llx fscrypt_auth len %u to %u)\n",
- ceph_vinop(inode), ci->fscrypt_auth_len, len);
+ doutc(cl, "%p %llx.%llx fscrypt_auth len %u to %u)\n", inode,
+ ceph_vinop(inode), ci->fscrypt_auth_len, len);
/* It should never be re-set once set */
WARN_ON_ONCE(ci->fscrypt_auth);
@@ -2506,38 +2547,44 @@ retry:
#endif /* CONFIG_FS_ENCRYPTION */
if (ia_valid & ATTR_UID) {
- dout("setattr %p uid %d -> %d\n", inode,
- from_kuid(&init_user_ns, inode->i_uid),
- from_kuid(&init_user_ns, attr->ia_uid));
+ kuid_t fsuid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid);
+
+ doutc(cl, "%p %llx.%llx uid %d -> %d\n", inode,
+ ceph_vinop(inode),
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kuid(&init_user_ns, attr->ia_uid));
if (issued & CEPH_CAP_AUTH_EXCL) {
- inode->i_uid = attr->ia_uid;
+ inode->i_uid = fsuid;
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
- !uid_eq(attr->ia_uid, inode->i_uid)) {
+ !uid_eq(fsuid, inode->i_uid)) {
req->r_args.setattr.uid = cpu_to_le32(
- from_kuid(&init_user_ns, attr->ia_uid));
+ from_kuid(&init_user_ns, fsuid));
mask |= CEPH_SETATTR_UID;
release |= CEPH_CAP_AUTH_SHARED;
}
}
if (ia_valid & ATTR_GID) {
- dout("setattr %p gid %d -> %d\n", inode,
- from_kgid(&init_user_ns, inode->i_gid),
- from_kgid(&init_user_ns, attr->ia_gid));
+ kgid_t fsgid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid);
+
+ doutc(cl, "%p %llx.%llx gid %d -> %d\n", inode,
+ ceph_vinop(inode),
+ from_kgid(&init_user_ns, inode->i_gid),
+ from_kgid(&init_user_ns, attr->ia_gid));
if (issued & CEPH_CAP_AUTH_EXCL) {
- inode->i_gid = attr->ia_gid;
+ inode->i_gid = fsgid;
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
- !gid_eq(attr->ia_gid, inode->i_gid)) {
+ !gid_eq(fsgid, inode->i_gid)) {
req->r_args.setattr.gid = cpu_to_le32(
- from_kgid(&init_user_ns, attr->ia_gid));
+ from_kgid(&init_user_ns, fsgid));
mask |= CEPH_SETATTR_GID;
release |= CEPH_CAP_AUTH_SHARED;
}
}
if (ia_valid & ATTR_MODE) {
- dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
- attr->ia_mode);
+ doutc(cl, "%p %llx.%llx mode 0%o -> 0%o\n", inode,
+ ceph_vinop(inode), inode->i_mode, attr->ia_mode);
if (issued & CEPH_CAP_AUTH_EXCL) {
inode->i_mode = attr->ia_mode;
dirtied |= CEPH_CAP_AUTH_EXCL;
@@ -2551,20 +2598,23 @@ retry:
}
if (ia_valid & ATTR_ATIME) {
- dout("setattr %p atime %lld.%ld -> %lld.%ld\n", inode,
- inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
- attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
+ struct timespec64 atime = inode_get_atime(inode);
+
+ doutc(cl, "%p %llx.%llx atime %lld.%09ld -> %lld.%09ld\n",
+ inode, ceph_vinop(inode),
+ atime.tv_sec, atime.tv_nsec,
+ attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
if (issued & CEPH_CAP_FILE_EXCL) {
ci->i_time_warp_seq++;
- inode->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(inode, attr->ia_atime);
dirtied |= CEPH_CAP_FILE_EXCL;
} else if ((issued & CEPH_CAP_FILE_WR) &&
- timespec64_compare(&inode->i_atime,
- &attr->ia_atime) < 0) {
- inode->i_atime = attr->ia_atime;
+ timespec64_compare(&atime,
+ &attr->ia_atime) < 0) {
+ inode_set_atime_to_ts(inode, attr->ia_atime);
dirtied |= CEPH_CAP_FILE_WR;
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
- !timespec64_equal(&inode->i_atime, &attr->ia_atime)) {
+ !timespec64_equal(&atime, &attr->ia_atime)) {
ceph_encode_timespec64(&req->r_args.setattr.atime,
&attr->ia_atime);
mask |= CEPH_SETATTR_ATIME;
@@ -2573,7 +2623,8 @@ retry:
}
}
if (ia_valid & ATTR_SIZE) {
- dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size);
+ doutc(cl, "%p %llx.%llx size %lld -> %lld\n", inode,
+ ceph_vinop(inode), isize, attr->ia_size);
/*
* Only when the new size is smaller and not aligned to
* CEPH_FSCRYPT_BLOCK_SIZE will the RMW be needed.
@@ -2624,20 +2675,22 @@ retry:
}
}
if (ia_valid & ATTR_MTIME) {
- dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode,
- inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
- attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
+ struct timespec64 mtime = inode_get_mtime(inode);
+
+ doutc(cl, "%p %llx.%llx mtime %lld.%09ld -> %lld.%09ld\n",
+ inode, ceph_vinop(inode),
+ mtime.tv_sec, mtime.tv_nsec,
+ attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
if (issued & CEPH_CAP_FILE_EXCL) {
ci->i_time_warp_seq++;
- inode->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
dirtied |= CEPH_CAP_FILE_EXCL;
} else if ((issued & CEPH_CAP_FILE_WR) &&
- timespec64_compare(&inode->i_mtime,
- &attr->ia_mtime) < 0) {
- inode->i_mtime = attr->ia_mtime;
+ timespec64_compare(&mtime, &attr->ia_mtime) < 0) {
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
dirtied |= CEPH_CAP_FILE_WR;
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
- !timespec64_equal(&inode->i_mtime, &attr->ia_mtime)) {
+ !timespec64_equal(&mtime, &attr->ia_mtime)) {
ceph_encode_timespec64(&req->r_args.setattr.mtime,
&attr->ia_mtime);
mask |= CEPH_SETATTR_MTIME;
@@ -2650,11 +2703,12 @@ retry:
if (ia_valid & ATTR_CTIME) {
bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
- dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode,
- inode_get_ctime(inode).tv_sec,
- inode_get_ctime(inode).tv_nsec,
- attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
- only ? "ctime only" : "ignored");
+ doutc(cl, "%p %llx.%llx ctime %lld.%09ld -> %lld.%09ld (%s)\n",
+ inode, ceph_vinop(inode),
+ inode_get_ctime_sec(inode),
+ inode_get_ctime_nsec(inode),
+ attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
+ only ? "ctime only" : "ignored");
if (only) {
/*
* if kernel wants to dirty ctime but nothing else,
@@ -2672,7 +2726,8 @@ retry:
}
}
if (ia_valid & ATTR_FILE)
- dout("setattr %p ATTR_FILE ... hrm!\n", inode);
+ doutc(cl, "%p %llx.%llx ATTR_FILE ... hrm!\n", inode,
+ ceph_vinop(inode));
if (dirtied) {
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
@@ -2713,16 +2768,17 @@ retry:
*/
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err == -EAGAIN && truncate_retry--) {
- dout("setattr %p result=%d (%s locally, %d remote), retry it!\n",
- inode, err, ceph_cap_string(dirtied), mask);
+ doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote), retry it!\n",
+ inode, ceph_vinop(inode), err,
+ ceph_cap_string(dirtied), mask);
ceph_mdsc_put_request(req);
ceph_free_cap_flush(prealloc_cf);
goto retry;
}
}
out:
- dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
- ceph_cap_string(dirtied), mask);
+ doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote)\n", inode,
+ ceph_vinop(inode), err, ceph_cap_string(dirtied), mask);
ceph_mdsc_put_request(req);
ceph_free_cap_flush(prealloc_cf);
@@ -2740,7 +2796,7 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
int err;
if (ceph_snap(inode) != CEPH_NOSNAP)
@@ -2753,7 +2809,7 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (err)
return err;
- err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
+ err = setattr_prepare(idmap, dentry, attr);
if (err != 0)
return err;
@@ -2765,10 +2821,10 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size))
return -EDQUOT;
- err = __ceph_setattr(inode, attr, NULL);
+ err = __ceph_setattr(idmap, inode, attr, NULL);
if (err >= 0 && (attr->ia_valid & ATTR_MODE))
- err = posix_acl_chmod(&nop_mnt_idmap, dentry, attr->ia_mode);
+ err = posix_acl_chmod(idmap, dentry, attr->ia_mode);
return err;
}
@@ -2810,19 +2866,21 @@ int ceph_try_to_choose_auth_mds(struct inode *inode, int mask)
int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
int mask, bool force)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int mode;
int err;
if (ceph_snap(inode) == CEPH_SNAPDIR) {
- dout("do_getattr inode %p SNAPDIR\n", inode);
+ doutc(cl, "inode %p %llx.%llx SNAPDIR\n", inode,
+ ceph_vinop(inode));
return 0;
}
- dout("do_getattr inode %p mask %s mode 0%o\n",
- inode, ceph_cap_string(mask), inode->i_mode);
+ doutc(cl, "inode %p %llx.%llx mask %s mode 0%o\n", inode,
+ ceph_vinop(inode), ceph_cap_string(mask), inode->i_mode);
if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1))
return 0;
@@ -2849,14 +2907,15 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
}
}
ceph_mdsc_put_request(req);
- dout("do_getattr result=%d\n", err);
+ doutc(cl, "result=%d\n", err);
return err;
}
int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int mode = USE_AUTH_MDS;
@@ -2886,7 +2945,7 @@ int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
xattr_value = req->r_reply_info.xattr_info.xattr_value;
xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
- dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
+ doutc(cl, "xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
err = (int)xattr_value_len;
if (size == 0)
@@ -2901,7 +2960,7 @@ int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
put:
ceph_mdsc_put_request(req);
out:
- dout("do_getvxattr result=%d\n", err);
+ doutc(cl, "result=%d\n", err);
return err;
}
@@ -2921,7 +2980,7 @@ int ceph_permission(struct mnt_idmap *idmap, struct inode *inode,
err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false);
if (!err)
- err = generic_permission(&nop_mnt_idmap, inode, mask);
+ err = generic_permission(idmap, inode, mask);
return err;
}
@@ -2978,7 +3037,7 @@ int ceph_getattr(struct mnt_idmap *idmap, const struct path *path,
return err;
}
- generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
stat->ino = ceph_present_inode(inode);
/*
@@ -3001,7 +3060,7 @@ int ceph_getattr(struct mnt_idmap *idmap, const struct path *path,
stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
if (S_ISDIR(inode->i_mode)) {
- if (ceph_test_mount_opt(ceph_sb_to_client(sb), RBYTES)) {
+ if (ceph_test_mount_opt(ceph_sb_to_fs_client(sb), RBYTES)) {
stat->size = ci->i_rbytes;
} else if (ceph_snap(inode) == CEPH_SNAPDIR) {
struct ceph_inode_info *pci;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 91a84917d203..e861de3c79b9 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -65,7 +65,7 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
struct ceph_inode_info *ci = ceph_inode(file_inode(file));
@@ -140,7 +140,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
int err;
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
/* copy and validate */
if (copy_from_user(&l, arg, sizeof(l)))
@@ -183,7 +183,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
- &ceph_sb_to_client(inode->i_sb)->client->osdc;
+ &ceph_sb_to_fs_client(inode->i_sb)->client->osdc;
struct ceph_object_locator oloc;
CEPH_DEFINE_OID_ONSTACK(oid);
u32 xlen;
@@ -244,7 +244,8 @@ static long ceph_ioctl_lazyio(struct file *file)
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
spin_lock(&ci->i_ceph_lock);
@@ -252,11 +253,13 @@ static long ceph_ioctl_lazyio(struct file *file)
ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
__ceph_touch_fmode(ci, mdsc, fi->fmode);
spin_unlock(&ci->i_ceph_lock);
- dout("ioctl_layzio: file %p marked lazy\n", file);
+ doutc(cl, "file %p %p %llx.%llx marked lazy\n", file, inode,
+ ceph_vinop(inode));
ceph_check_caps(ci, 0);
} else {
- dout("ioctl_layzio: file %p already lazy\n", file);
+ doutc(cl, "file %p %p %llx.%llx already lazy\n", file, inode,
+ ceph_vinop(inode));
}
return 0;
}
@@ -355,10 +358,12 @@ static const char *ceph_ioctl_cmd_name(const unsigned int cmd)
long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
+ struct inode *inode = file_inode(file);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
int ret;
- dout("ioctl file %p cmd %s arg %lu\n", file,
- ceph_ioctl_cmd_name(cmd), arg);
+ doutc(fsc->client, "file %p %p %llx.%llx cmd %s arg %lu\n", file,
+ inode, ceph_vinop(inode), ceph_ioctl_cmd_name(cmd), arg);
switch (cmd) {
case CEPH_IOC_GET_LAYOUT:
return ceph_ioctl_get_layout(file, (void __user *)arg);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index cb51c7e9c8e2..e07ad29ff8b9 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -77,6 +77,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
int cmd, u8 wait, struct file_lock *fl)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
int err;
u64 length = 0;
@@ -111,10 +112,10 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
owner = secure_addr(fl->fl_owner);
- dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
- "start: %llu, length: %llu, wait: %d, type: %d\n", (int)lock_type,
- (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
- wait, fl->fl_type);
+ doutc(cl, "rule: %d, op: %d, owner: %llx, pid: %llu, "
+ "start: %llu, length: %llu, wait: %d, type: %d\n",
+ (int)lock_type, (int)operation, owner, (u64)fl->fl_pid,
+ fl->fl_start, length, wait, fl->fl_type);
req->r_args.filelock_change.rule = lock_type;
req->r_args.filelock_change.type = cmd;
@@ -147,16 +148,17 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
}
ceph_mdsc_put_request(req);
- dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
- "length: %llu, wait: %d, type: %d, err code %d\n", (int)lock_type,
- (int)operation, (u64)fl->fl_pid, fl->fl_start,
- length, wait, fl->fl_type, err);
+ doutc(cl, "rule: %d, op: %d, pid: %llu, start: %llu, "
+ "length: %llu, wait: %d, type: %d, err code %d\n",
+ (int)lock_type, (int)operation, (u64)fl->fl_pid,
+ fl->fl_start, length, wait, fl->fl_type, err);
return err;
}
static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *intr_req;
struct inode *inode = req->r_inode;
int err, lock_type;
@@ -174,8 +176,7 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
if (!err)
return 0;
- dout("ceph_lock_wait_for_completion: request %llu was interrupted\n",
- req->r_tid);
+ doutc(cl, "request %llu was interrupted\n", req->r_tid);
mutex_lock(&mdsc->mutex);
if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
@@ -246,6 +247,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int err = 0;
u16 op = CEPH_MDS_OP_SETFILELOCK;
u8 wait = 0;
@@ -257,7 +259,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
if (ceph_inode_is_shutdown(inode))
return -ESTALE;
- dout("ceph_lock, fl_owner: %p\n", fl->fl_owner);
+ doutc(cl, "fl_owner: %p\n", fl->fl_owner);
/* set wait bit as appropriate, then make command as Ceph expects it */
if (IS_GETLK(cmd))
@@ -292,7 +294,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
if (!err) {
if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) {
- dout("mds locked, locking locally\n");
+ doutc(cl, "locking locally\n");
err = posix_lock_file(file, fl, NULL);
if (err) {
/* undo! This should only happen if
@@ -300,8 +302,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
* deadlock. */
ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
CEPH_LOCK_UNLOCK, 0, fl);
- dout("got %d on posix_lock_file, undid lock\n",
- err);
+ doutc(cl, "got %d on posix_lock_file, undid lock\n",
+ err);
}
}
}
@@ -312,6 +314,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int err = 0;
u8 wait = 0;
u8 lock_cmd;
@@ -322,7 +325,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
if (ceph_inode_is_shutdown(inode))
return -ESTALE;
- dout("ceph_flock, fl_file: %p\n", fl->fl_file);
+ doutc(cl, "fl_file: %p\n", fl->fl_file);
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
@@ -359,7 +362,8 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
ceph_lock_message(CEPH_LOCK_FLOCK,
CEPH_MDS_OP_SETFILELOCK,
inode, CEPH_LOCK_UNLOCK, 0, fl);
- dout("got %d on locks_lock_file_wait, undid lock\n", err);
+ doutc(cl, "got %d on locks_lock_file_wait, undid lock\n",
+ err);
}
}
return err;
@@ -371,6 +375,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
*/
void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct file_lock *lock;
struct file_lock_context *ctx;
@@ -386,17 +391,20 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
++(*flock_count);
spin_unlock(&ctx->flc_lock);
}
- dout("counted %d flock locks and %d fcntl locks\n",
- *flock_count, *fcntl_count);
+ doutc(cl, "counted %d flock locks and %d fcntl locks\n",
+ *flock_count, *fcntl_count);
}
/*
* Given a pointer to a lock, convert it to a ceph filelock
*/
-static int lock_to_ceph_filelock(struct file_lock *lock,
+static int lock_to_ceph_filelock(struct inode *inode,
+ struct file_lock *lock,
struct ceph_filelock *cephlock)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int err = 0;
+
cephlock->start = cpu_to_le64(lock->fl_start);
cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
cephlock->client = cpu_to_le64(0);
@@ -414,7 +422,7 @@ static int lock_to_ceph_filelock(struct file_lock *lock,
cephlock->type = CEPH_LOCK_UNLOCK;
break;
default:
- dout("Have unknown lock type %d\n", lock->fl_type);
+ doutc(cl, "Have unknown lock type %d\n", lock->fl_type);
err = -EINVAL;
}
@@ -432,13 +440,14 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
{
struct file_lock *lock;
struct file_lock_context *ctx = locks_inode_context(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
int err = 0;
int seen_fcntl = 0;
int seen_flock = 0;
int l = 0;
- dout("encoding %d flock and %d fcntl locks\n", num_flock_locks,
- num_fcntl_locks);
+ doutc(cl, "encoding %d flock and %d fcntl locks\n", num_flock_locks,
+ num_fcntl_locks);
if (!ctx)
return 0;
@@ -450,7 +459,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
err = -ENOSPC;
goto fail;
}
- err = lock_to_ceph_filelock(lock, &flocks[l]);
+ err = lock_to_ceph_filelock(inode, lock, &flocks[l]);
if (err)
goto fail;
++l;
@@ -461,7 +470,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
err = -ENOSPC;
goto fail;
}
- err = lock_to_ceph_filelock(lock, &flocks[l]);
+ err = lock_to_ceph_filelock(inode, lock, &flocks[l]);
if (err)
goto fail;
++l;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 293b93182955..d95eb525519a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -12,6 +12,7 @@
#include <linux/bits.h>
#include <linux/ktime.h>
#include <linux/bitmap.h>
+#include <linux/mnt_idmapping.h>
#include "super.h"
#include "mds_client.h"
@@ -411,6 +412,7 @@ static int parse_reply_info_readdir(void **p, void *end,
u64 features)
{
struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
+ struct ceph_client *cl = req->r_mdsc->fsc->client;
u32 num, i = 0;
int err;
@@ -433,7 +435,7 @@ static int parse_reply_info_readdir(void **p, void *end,
BUG_ON(!info->dir_entries);
if ((unsigned long)(info->dir_entries + num) >
(unsigned long)info->dir_entries + info->dir_buf_size) {
- pr_err("dir contents are larger than expected\n");
+ pr_err_client(cl, "dir contents are larger than expected\n");
WARN_ON(1);
goto bad;
}
@@ -454,7 +456,7 @@ static int parse_reply_info_readdir(void **p, void *end,
ceph_decode_need(p, end, _name_len, bad);
_name = *p;
*p += _name_len;
- dout("parsed dir dname '%.*s'\n", _name_len, _name);
+ doutc(cl, "parsed dir dname '%.*s'\n", _name_len, _name);
if (info->hash_order)
rde->raw_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
@@ -514,8 +516,8 @@ static int parse_reply_info_readdir(void **p, void *end,
rde->is_nokey = false;
err = ceph_fname_to_usr(&fname, &tname, &oname, &rde->is_nokey);
if (err) {
- pr_err("%s unable to decode %.*s, got %d\n", __func__,
- _name_len, _name, err);
+ pr_err_client(cl, "unable to decode %.*s, got %d\n",
+ _name_len, _name, err);
goto out_bad;
}
rde->name = oname.name;
@@ -539,7 +541,7 @@ done:
bad:
err = -EIO;
out_bad:
- pr_err("problem parsing dir contents %d\n", err);
+ pr_err_client(cl, "problem parsing dir contents %d\n", err);
return err;
}
@@ -570,10 +572,11 @@ bad:
static int ceph_parse_deleg_inos(void **p, void *end,
struct ceph_mds_session *s)
{
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
u32 sets;
ceph_decode_32_safe(p, end, sets, bad);
- dout("got %u sets of delegated inodes\n", sets);
+ doutc(cl, "got %u sets of delegated inodes\n", sets);
while (sets--) {
u64 start, len;
@@ -582,8 +585,9 @@ static int ceph_parse_deleg_inos(void **p, void *end,
/* Don't accept a delegation of system inodes */
if (start < CEPH_INO_SYSTEM_BASE) {
- pr_warn_ratelimited("ceph: ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n",
- start, len);
+ pr_warn_ratelimited_client(cl,
+ "ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n",
+ start, len);
continue;
}
while (len--) {
@@ -591,10 +595,10 @@ static int ceph_parse_deleg_inos(void **p, void *end,
DELEGATED_INO_AVAILABLE,
GFP_KERNEL);
if (!err) {
- dout("added delegated inode 0x%llx\n",
- start - 1);
+ doutc(cl, "added delegated inode 0x%llx\n", start - 1);
} else if (err == -EBUSY) {
- pr_warn("MDS delegated inode 0x%llx more than once.\n",
+ pr_warn_client(cl,
+ "MDS delegated inode 0x%llx more than once.\n",
start - 1);
} else {
return err;
@@ -744,6 +748,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
struct ceph_mds_request *req, u64 features)
{
struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
void *p, *end;
u32 len;
int err;
@@ -783,7 +788,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
bad:
err = -EIO;
out_bad:
- pr_err("mds parse_reply err %d\n", err);
+ pr_err_client(cl, "mds parse_reply err %d\n", err);
ceph_msg_dump(msg);
return err;
}
@@ -830,7 +835,8 @@ static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
*/
int ceph_wait_on_conflict_unlink(struct dentry *dentry)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb);
+ struct ceph_client *cl = fsc->client;
struct dentry *pdentry = dentry->d_parent;
struct dentry *udentry, *found = NULL;
struct ceph_dentry_info *di;
@@ -855,8 +861,8 @@ int ceph_wait_on_conflict_unlink(struct dentry *dentry)
goto next;
if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
- pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
- __func__, dentry, dentry);
+ pr_warn_client(cl, "dentry %p:%pd async unlink bit is not set\n",
+ dentry, dentry);
if (!d_same_name(udentry, pdentry, &dname))
goto next;
@@ -872,8 +878,8 @@ next:
if (likely(!found))
return 0;
- dout("%s dentry %p:%pd conflict with old %p:%pd\n", __func__,
- dentry, dentry, found, found);
+ doutc(cl, "dentry %p:%pd conflict with old %p:%pd\n", dentry, dentry,
+ found, found);
err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT,
TASK_KILLABLE);
@@ -957,6 +963,7 @@ static int __verify_registered_session(struct ceph_mds_client *mdsc,
static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
int mds)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_session *s;
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)
@@ -973,7 +980,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
int newmax = 1 << get_count_order(mds + 1);
struct ceph_mds_session **sa;
- dout("%s: realloc to %d\n", __func__, newmax);
+ doutc(cl, "realloc to %d\n", newmax);
sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
if (!sa)
goto fail_realloc;
@@ -986,7 +993,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
mdsc->max_sessions = newmax;
}
- dout("%s: mds%d\n", __func__, mds);
+ doutc(cl, "mds%d\n", mds);
s->s_mdsc = mdsc;
s->s_mds = mds;
s->s_state = CEPH_MDS_SESSION_NEW;
@@ -1029,7 +1036,7 @@ fail_realloc:
static void __unregister_session(struct ceph_mds_client *mdsc,
struct ceph_mds_session *s)
{
- dout("__unregister_session mds%d %p\n", s->s_mds, s);
+ doutc(mdsc->fsc->client, "mds%d %p\n", s->s_mds, s);
BUG_ON(mdsc->sessions[s->s_mds] != s);
mdsc->sessions[s->s_mds] = NULL;
ceph_con_close(&s->s_con);
@@ -1116,6 +1123,8 @@ void ceph_mdsc_release_request(struct kref *kref)
kfree(req->r_path1);
kfree(req->r_path2);
put_cred(req->r_cred);
+ if (req->r_mnt_idmap)
+ mnt_idmap_put(req->r_mnt_idmap);
if (req->r_pagelist)
ceph_pagelist_release(req->r_pagelist);
kfree(req->r_fscrypt_auth);
@@ -1155,6 +1164,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req,
struct inode *dir)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int ret = 0;
req->r_tid = ++mdsc->last_tid;
@@ -1162,18 +1172,20 @@ static void __register_request(struct ceph_mds_client *mdsc,
ret = ceph_reserve_caps(mdsc, &req->r_caps_reservation,
req->r_num_caps);
if (ret < 0) {
- pr_err("__register_request %p "
- "failed to reserve caps: %d\n", req, ret);
+ pr_err_client(cl, "%p failed to reserve caps: %d\n",
+ req, ret);
/* set req->r_err to fail early from __do_request */
req->r_err = ret;
return;
}
}
- dout("__register_request %p tid %lld\n", req, req->r_tid);
+ doutc(cl, "%p tid %lld\n", req, req->r_tid);
ceph_mdsc_get_request(req);
insert_request(&mdsc->request_tree, req);
req->r_cred = get_current_cred();
+ if (!req->r_mnt_idmap)
+ req->r_mnt_idmap = &nop_mnt_idmap;
if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)
mdsc->oldest_tid = req->r_tid;
@@ -1192,7 +1204,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
static void __unregister_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
- dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+ doutc(mdsc->fsc->client, "%p tid %lld\n", req, req->r_tid);
/* Never leave an unregistered request on an unsafe list! */
list_del_init(&req->r_unsafe_item);
@@ -1278,6 +1290,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
int mds = -1;
u32 hash = req->r_direct_hash;
bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
+ struct ceph_client *cl = mdsc->fsc->client;
if (random)
*random = false;
@@ -1289,8 +1302,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
if (req->r_resend_mds >= 0 &&
(__have_session(mdsc, req->r_resend_mds) ||
ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
- dout("%s using resend_mds mds%d\n", __func__,
- req->r_resend_mds);
+ doutc(cl, "using resend_mds mds%d\n", req->r_resend_mds);
return req->r_resend_mds;
}
@@ -1307,7 +1319,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
rcu_read_lock();
inode = get_nonsnap_parent(req->r_dentry);
rcu_read_unlock();
- dout("%s using snapdir's parent %p\n", __func__, inode);
+ doutc(cl, "using snapdir's parent %p %llx.%llx\n",
+ inode, ceph_vinop(inode));
}
} else if (req->r_dentry) {
/* ignore race with rename; old or new d_parent is okay */
@@ -1327,7 +1340,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
/* direct snapped/virtual snapdir requests
* based on parent dir inode */
inode = get_nonsnap_parent(parent);
- dout("%s using nonsnap parent %p\n", __func__, inode);
+ doutc(cl, "using nonsnap parent %p %llx.%llx\n",
+ inode, ceph_vinop(inode));
} else {
/* dentry target */
inode = d_inode(req->r_dentry);
@@ -1343,10 +1357,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
rcu_read_unlock();
}
- dout("%s %p is_hash=%d (0x%x) mode %d\n", __func__, inode, (int)is_hash,
- hash, mode);
if (!inode)
goto random;
+
+ doutc(cl, "%p %llx.%llx is_hash=%d (0x%x) mode %d\n", inode,
+ ceph_vinop(inode), (int)is_hash, hash, mode);
ci = ceph_inode(inode);
if (is_hash && S_ISDIR(inode->i_mode)) {
@@ -1362,9 +1377,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
get_random_bytes(&r, 1);
r %= frag.ndist;
mds = frag.dist[r];
- dout("%s %p %llx.%llx frag %u mds%d (%d/%d)\n",
- __func__, inode, ceph_vinop(inode),
- frag.frag, mds, (int)r, frag.ndist);
+ doutc(cl, "%p %llx.%llx frag %u mds%d (%d/%d)\n",
+ inode, ceph_vinop(inode), frag.frag,
+ mds, (int)r, frag.ndist);
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
CEPH_MDS_STATE_ACTIVE &&
!ceph_mdsmap_is_laggy(mdsc->mdsmap, mds))
@@ -1377,9 +1392,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
if (frag.mds >= 0) {
/* choose auth mds */
mds = frag.mds;
- dout("%s %p %llx.%llx frag %u mds%d (auth)\n",
- __func__, inode, ceph_vinop(inode),
- frag.frag, mds);
+ doutc(cl, "%p %llx.%llx frag %u mds%d (auth)\n",
+ inode, ceph_vinop(inode), frag.frag, mds);
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
CEPH_MDS_STATE_ACTIVE) {
if (!ceph_mdsmap_is_laggy(mdsc->mdsmap,
@@ -1403,9 +1417,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
goto random;
}
mds = cap->session->s_mds;
- dout("%s %p %llx.%llx mds%d (%scap %p)\n", __func__,
- inode, ceph_vinop(inode), mds,
- cap == ci->i_auth_cap ? "auth " : "", cap);
+ doutc(cl, "%p %llx.%llx mds%d (%scap %p)\n", inode,
+ ceph_vinop(inode), mds,
+ cap == ci->i_auth_cap ? "auth " : "", cap);
spin_unlock(&ci->i_ceph_lock);
out:
iput(inode);
@@ -1416,7 +1430,7 @@ random:
*random = true;
mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
- dout("%s chose random mds%d\n", __func__, mds);
+ doutc(cl, "chose random mds%d\n", mds);
return mds;
}
@@ -1529,6 +1543,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
int metadata_key_count = 0;
struct ceph_options *opt = mdsc->fsc->client->options;
struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
+ struct ceph_client *cl = mdsc->fsc->client;
size_t size, count;
void *p, *end;
int ret;
@@ -1567,7 +1582,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
GFP_NOFS, false);
if (!msg) {
- pr_err("ENOMEM creating session open msg\n");
+ pr_err_client(cl, "ENOMEM creating session open msg\n");
return ERR_PTR(-ENOMEM);
}
p = msg->front.iov_base;
@@ -1607,14 +1622,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
ret = encode_supported_features(&p, end);
if (ret) {
- pr_err("encode_supported_features failed!\n");
+ pr_err_client(cl, "encode_supported_features failed!\n");
ceph_msg_put(msg);
return ERR_PTR(ret);
}
ret = encode_metric_spec(&p, end);
if (ret) {
- pr_err("encode_metric_spec failed!\n");
+ pr_err_client(cl, "encode_metric_spec failed!\n");
ceph_msg_put(msg);
return ERR_PTR(ret);
}
@@ -1642,8 +1657,8 @@ static int __open_session(struct ceph_mds_client *mdsc,
/* wait for mds to go active? */
mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
- dout("open_session to mds%d (%s)\n", mds,
- ceph_mds_state_name(mstate));
+ doutc(mdsc->fsc->client, "open_session to mds%d (%s)\n", mds,
+ ceph_mds_state_name(mstate));
session->s_state = CEPH_MDS_SESSION_OPENING;
session->s_renew_requested = jiffies;
@@ -1686,8 +1701,9 @@ struct ceph_mds_session *
ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
{
struct ceph_mds_session *session;
+ struct ceph_client *cl = mdsc->fsc->client;
- dout("open_export_target_session to mds%d\n", target);
+ doutc(cl, "to mds%d\n", target);
mutex_lock(&mdsc->mutex);
session = __open_export_target_session(mdsc, target);
@@ -1702,13 +1718,14 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
struct ceph_mds_info *mi;
struct ceph_mds_session *ts;
int i, mds = session->s_mds;
+ struct ceph_client *cl = mdsc->fsc->client;
if (mds >= mdsc->mdsmap->possible_max_rank)
return;
mi = &mdsc->mdsmap->m_info[mds];
- dout("open_export_target_sessions for mds%d (%d targets)\n",
- session->s_mds, mi->num_export_targets);
+ doutc(cl, "for mds%d (%d targets)\n", session->s_mds,
+ mi->num_export_targets);
for (i = 0; i < mi->num_export_targets; i++) {
ts = __open_export_target_session(mdsc, mi->export_targets[i]);
@@ -1731,11 +1748,13 @@ void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
static void detach_cap_releases(struct ceph_mds_session *session,
struct list_head *target)
{
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
+
lockdep_assert_held(&session->s_cap_lock);
list_splice_init(&session->s_cap_releases, target);
session->s_num_cap_releases = 0;
- dout("dispose_cap_releases mds%d\n", session->s_mds);
+ doutc(cl, "mds%d\n", session->s_mds);
}
static void dispose_cap_releases(struct ceph_mds_client *mdsc,
@@ -1753,16 +1772,17 @@ static void dispose_cap_releases(struct ceph_mds_client *mdsc,
static void cleanup_session_requests(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct rb_node *p;
- dout("cleanup_session_requests mds%d\n", session->s_mds);
+ doutc(cl, "mds%d\n", session->s_mds);
mutex_lock(&mdsc->mutex);
while (!list_empty(&session->s_unsafe)) {
req = list_first_entry(&session->s_unsafe,
struct ceph_mds_request, r_unsafe_item);
- pr_warn_ratelimited(" dropping unsafe request %llu\n",
- req->r_tid);
+ pr_warn_ratelimited_client(cl, " dropping unsafe request %llu\n",
+ req->r_tid);
if (req->r_target_inode)
mapping_set_error(req->r_target_inode->i_mapping, -EIO);
if (req->r_unsafe_dir)
@@ -1791,13 +1811,14 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
int (*cb)(struct inode *, int mds, void *),
void *arg)
{
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
struct list_head *p;
struct ceph_cap *cap;
struct inode *inode, *last_inode = NULL;
struct ceph_cap *old_cap = NULL;
int ret;
- dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
+ doutc(cl, "%p mds%d\n", session, session->s_mds);
spin_lock(&session->s_cap_lock);
p = session->s_caps.next;
while (p != &session->s_caps) {
@@ -1828,8 +1849,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
spin_lock(&session->s_cap_lock);
p = p->next;
if (!cap->ci) {
- dout("iterate_session_caps finishing cap %p removal\n",
- cap);
+ doutc(cl, "finishing cap %p removal\n", cap);
BUG_ON(cap->session != session);
cap->session = NULL;
list_del_init(&cap->session_caps);
@@ -1858,6 +1878,7 @@ out:
static int remove_session_caps_cb(struct inode *inode, int mds, void *arg)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
bool invalidate = false;
struct ceph_cap *cap;
int iputs = 0;
@@ -1865,8 +1886,8 @@ static int remove_session_caps_cb(struct inode *inode, int mds, void *arg)
spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ci, mds);
if (cap) {
- dout(" removing cap %p, ci is %p, inode is %p\n",
- cap, ci, &ci->netfs.inode);
+ doutc(cl, " removing cap %p, ci is %p, inode is %p\n",
+ cap, ci, &ci->netfs.inode);
iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
}
@@ -1890,7 +1911,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
struct super_block *sb = fsc->sb;
LIST_HEAD(dispose);
- dout("remove_session_caps on %p\n", session);
+ doutc(fsc->client, "on %p\n", session);
ceph_iterate_session_caps(session, remove_session_caps_cb, fsc);
wake_up_all(&fsc->mdsc->cap_flushing_wq);
@@ -1971,7 +1992,9 @@ static int wake_up_session_cb(struct inode *inode, int mds, void *arg)
static void wake_up_session_caps(struct ceph_mds_session *session, int ev)
{
- dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
+
+ doutc(cl, "session %p mds%d\n", session, session->s_mds);
ceph_iterate_session_caps(session, wake_up_session_cb,
(void *)(unsigned long)ev);
}
@@ -1985,25 +2008,26 @@ static void wake_up_session_caps(struct ceph_mds_session *session, int ev)
static int send_renew_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
int state;
if (time_after_eq(jiffies, session->s_cap_ttl) &&
time_after_eq(session->s_cap_ttl, session->s_renew_requested))
- pr_info("mds%d caps stale\n", session->s_mds);
+ pr_info_client(cl, "mds%d caps stale\n", session->s_mds);
session->s_renew_requested = jiffies;
/* do not try to renew caps until a recovering mds has reconnected
* with its clients. */
state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
if (state < CEPH_MDS_STATE_RECONNECT) {
- dout("send_renew_caps ignoring mds%d (%s)\n",
- session->s_mds, ceph_mds_state_name(state));
+ doutc(cl, "ignoring mds%d (%s)\n", session->s_mds,
+ ceph_mds_state_name(state));
return 0;
}
- dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
- ceph_mds_state_name(state));
+ doutc(cl, "to mds%d (%s)\n", session->s_mds,
+ ceph_mds_state_name(state));
msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
++session->s_renew_seq);
if (!msg)
@@ -2015,10 +2039,11 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session, u64 seq)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
- dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
- session->s_mds, ceph_session_state_name(session->s_state), seq);
+ doutc(cl, "to mds%d (%s)s seq %lld\n", session->s_mds,
+ ceph_session_state_name(session->s_state), seq);
msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
if (!msg)
return -ENOMEM;
@@ -2035,6 +2060,7 @@ static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
static void renewed_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session, int is_renew)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int was_stale;
int wake = 0;
@@ -2046,15 +2072,17 @@ static void renewed_caps(struct ceph_mds_client *mdsc,
if (was_stale) {
if (time_before(jiffies, session->s_cap_ttl)) {
- pr_info("mds%d caps renewed\n", session->s_mds);
+ pr_info_client(cl, "mds%d caps renewed\n",
+ session->s_mds);
wake = 1;
} else {
- pr_info("mds%d caps still stale\n", session->s_mds);
+ pr_info_client(cl, "mds%d caps still stale\n",
+ session->s_mds);
}
}
- dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
- session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
- time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
+ doutc(cl, "mds%d ttl now %lu, was %s, now %s\n", session->s_mds,
+ session->s_cap_ttl, was_stale ? "stale" : "fresh",
+ time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
spin_unlock(&session->s_cap_lock);
if (wake)
@@ -2066,11 +2094,11 @@ static void renewed_caps(struct ceph_mds_client *mdsc,
*/
static int request_close_session(struct ceph_mds_session *session)
{
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
struct ceph_msg *msg;
- dout("request_close_session mds%d state %s seq %lld\n",
- session->s_mds, ceph_session_state_name(session->s_state),
- session->s_seq);
+ doutc(cl, "mds%d state %s seq %lld\n", session->s_mds,
+ ceph_session_state_name(session->s_state), session->s_seq);
msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE,
session->s_seq);
if (!msg)
@@ -2126,6 +2154,8 @@ out:
*/
static int trim_caps_cb(struct inode *inode, int mds, void *arg)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
int *remaining = arg;
struct ceph_inode_info *ci = ceph_inode(inode);
int used, wanted, oissued, mine;
@@ -2145,9 +2175,10 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg)
wanted = __ceph_caps_file_wanted(ci);
oissued = __ceph_caps_issued_other(ci, cap);
- dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n",
- inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
- ceph_cap_string(used), ceph_cap_string(wanted));
+ doutc(cl, "%p %llx.%llx cap %p mine %s oissued %s used %s wanted %s\n",
+ inode, ceph_vinop(inode), cap, ceph_cap_string(mine),
+ ceph_cap_string(oissued), ceph_cap_string(used),
+ ceph_cap_string(wanted));
if (cap == ci->i_auth_cap) {
if (ci->i_dirty_caps || ci->i_flushing_caps ||
!list_empty(&ci->i_cap_snaps))
@@ -2173,7 +2204,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg)
if (oissued) {
/* we aren't the only cap.. just remove us */
- ceph_remove_cap(cap, true);
+ ceph_remove_cap(mdsc, cap, true);
(*remaining)--;
} else {
struct dentry *dentry;
@@ -2187,8 +2218,8 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg)
count = atomic_read(&inode->i_count);
if (count == 1)
(*remaining)--;
- dout("trim_caps_cb %p cap %p pruned, count now %d\n",
- inode, cap, count);
+ doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n",
+ inode, ceph_vinop(inode), cap, count);
} else {
dput(dentry);
}
@@ -2207,17 +2238,18 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
int max_caps)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int trim_caps = session->s_nr_caps - max_caps;
- dout("trim_caps mds%d start: %d / %d, trim %d\n",
- session->s_mds, session->s_nr_caps, max_caps, trim_caps);
+ doutc(cl, "mds%d start: %d / %d, trim %d\n", session->s_mds,
+ session->s_nr_caps, max_caps, trim_caps);
if (trim_caps > 0) {
int remaining = trim_caps;
ceph_iterate_session_caps(session, trim_caps_cb, &remaining);
- dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
- session->s_mds, session->s_nr_caps, max_caps,
- trim_caps - remaining);
+ doutc(cl, "mds%d done: %d / %d, trimmed %d\n",
+ session->s_mds, session->s_nr_caps, max_caps,
+ trim_caps - remaining);
}
ceph_flush_cap_releases(mdsc, session);
@@ -2227,6 +2259,7 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc,
static int check_caps_flush(struct ceph_mds_client *mdsc,
u64 want_flush_tid)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int ret = 1;
spin_lock(&mdsc->cap_dirty_lock);
@@ -2235,8 +2268,8 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
list_first_entry(&mdsc->cap_flush_list,
struct ceph_cap_flush, g_list);
if (cf->tid <= want_flush_tid) {
- dout("check_caps_flush still flushing tid "
- "%llu <= %llu\n", cf->tid, want_flush_tid);
+ doutc(cl, "still flushing tid %llu <= %llu\n",
+ cf->tid, want_flush_tid);
ret = 0;
}
}
@@ -2252,12 +2285,14 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
static void wait_caps_flush(struct ceph_mds_client *mdsc,
u64 want_flush_tid)
{
- dout("check_caps_flush want %llu\n", want_flush_tid);
+ struct ceph_client *cl = mdsc->fsc->client;
+
+ doutc(cl, "want %llu\n", want_flush_tid);
wait_event(mdsc->cap_flushing_wq,
check_caps_flush(mdsc, want_flush_tid));
- dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
+ doutc(cl, "ok, flushed thru %llu\n", want_flush_tid);
}
/*
@@ -2266,6 +2301,7 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc,
static void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg = NULL;
struct ceph_mds_cap_release *head;
struct ceph_mds_cap_item *item;
@@ -2324,7 +2360,7 @@ again:
msg->front.iov_len += sizeof(*cap_barrier);
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
- dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
+ doutc(cl, "mds%d %p\n", session->s_mds, msg);
ceph_con_send(&session->s_con, msg);
msg = NULL;
}
@@ -2344,13 +2380,13 @@ again:
msg->front.iov_len += sizeof(*cap_barrier);
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
- dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
+ doutc(cl, "mds%d %p\n", session->s_mds, msg);
ceph_con_send(&session->s_con, msg);
}
return;
out_err:
- pr_err("send_cap_releases mds%d, failed to allocate message\n",
- session->s_mds);
+ pr_err_client(cl, "mds%d, failed to allocate message\n",
+ session->s_mds);
spin_lock(&session->s_cap_lock);
list_splice(&tmp_list, &session->s_cap_releases);
session->s_num_cap_releases += num_cap_releases;
@@ -2373,16 +2409,17 @@ static void ceph_cap_release_work(struct work_struct *work)
void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
if (mdsc->stopping)
return;
ceph_get_mds_session(session);
if (queue_work(mdsc->fsc->cap_wq,
&session->s_cap_release_work)) {
- dout("cap release work queued\n");
+ doutc(cl, "cap release work queued\n");
} else {
ceph_put_mds_session(session);
- dout("failed to queue cap release work\n");
+ doutc(cl, "failed to queue cap release work\n");
}
}
@@ -2410,13 +2447,14 @@ static void ceph_cap_reclaim_work(struct work_struct *work)
void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
if (mdsc->stopping)
return;
if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) {
- dout("caps reclaim work queued\n");
+ doutc(cl, "caps reclaim work queued\n");
} else {
- dout("failed to queue caps release work\n");
+ doutc(cl, "failed to queue caps release work\n");
}
}
@@ -2588,6 +2626,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
/**
* ceph_mdsc_build_path - build a path string to a given dentry
+ * @mdsc: mds client
* @dentry: dentry to which path should be built
* @plen: returned length of string
* @pbase: returned base inode number
@@ -2607,9 +2646,10 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
* Encode hidden .snap dirs as a double /, i.e.
* foo/.snap/bar -> foo//bar
*/
-char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase,
- int for_wire)
+char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
+ int *plen, u64 *pbase, int for_wire)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct dentry *cur;
struct inode *inode;
char *path;
@@ -2635,8 +2675,7 @@ retry:
spin_lock(&cur->d_lock);
inode = d_inode(cur);
if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
- dout("build_path path+%d: %p SNAPDIR\n",
- pos, cur);
+ doutc(cl, "path+%d: %p SNAPDIR\n", pos, cur);
spin_unlock(&cur->d_lock);
parent = dget_parent(cur);
} else if (for_wire && inode && dentry != cur &&
@@ -2714,21 +2753,21 @@ retry:
* A rename didn't occur, but somehow we didn't end up where
* we thought we would. Throw a warning and try again.
*/
- pr_warn("build_path did not end path lookup where expected (pos = %d)\n",
- pos);
+ pr_warn_client(cl, "did not end path lookup where expected (pos = %d)\n",
+ pos);
goto retry;
}
*pbase = base;
*plen = PATH_MAX - 1 - pos;
- dout("build_path on %p %d built %llx '%.*s'\n",
- dentry, d_count(dentry), base, *plen, path + pos);
+ doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry),
+ base, *plen, path + pos);
return path + pos;
}
-static int build_dentry_path(struct dentry *dentry, struct inode *dir,
- const char **ppath, int *ppathlen, u64 *pino,
- bool *pfreepath, bool parent_locked)
+static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
+ struct inode *dir, const char **ppath, int *ppathlen,
+ u64 *pino, bool *pfreepath, bool parent_locked)
{
char *path;
@@ -2744,7 +2783,7 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir,
return 0;
}
rcu_read_unlock();
- path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
if (IS_ERR(path))
return PTR_ERR(path);
*ppath = path;
@@ -2756,6 +2795,7 @@ static int build_inode_path(struct inode *inode,
const char **ppath, int *ppathlen, u64 *pino,
bool *pfreepath)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct dentry *dentry;
char *path;
@@ -2765,7 +2805,7 @@ static int build_inode_path(struct inode *inode,
return 0;
}
dentry = d_find_alias(inode);
- path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
dput(dentry);
if (IS_ERR(path))
return PTR_ERR(path);
@@ -2778,27 +2818,28 @@ static int build_inode_path(struct inode *inode,
* request arguments may be specified via an inode *, a dentry *, or
* an explicit ino+path.
*/
-static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
- struct inode *rdiri, const char *rpath,
- u64 rino, const char **ppath, int *pathlen,
- u64 *ino, bool *freepath, bool parent_locked)
+static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode,
+ struct dentry *rdentry, struct inode *rdiri,
+ const char *rpath, u64 rino, const char **ppath,
+ int *pathlen, u64 *ino, bool *freepath,
+ bool parent_locked)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int r = 0;
if (rinode) {
r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
- dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
- ceph_snap(rinode));
+ doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
+ ceph_snap(rinode));
} else if (rdentry) {
- r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
+ r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino,
freepath, parent_locked);
- dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
- *ppath);
+ doutc(cl, " dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath);
} else if (rpath || rino) {
*ino = rino;
*ppath = rpath;
*pathlen = rpath ? strlen(rpath) : 0;
- dout(" path %.*s\n", *pathlen, rpath);
+ doutc(cl, " path %.*s\n", *pathlen, rpath);
}
return r;
@@ -2840,6 +2881,17 @@ static void encode_mclientrequest_tail(void **p,
}
}
+static inline u16 mds_supported_head_version(struct ceph_mds_session *session)
+{
+ if (!test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features))
+ return 1;
+
+ if (!test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features))
+ return 2;
+
+ return CEPH_MDS_REQUEST_HEAD_VERSION;
+}
+
static struct ceph_mds_request_head_legacy *
find_legacy_request_head(void *p, u64 features)
{
@@ -2861,6 +2913,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
{
int mds = session->s_mds;
struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
struct ceph_mds_request_head_legacy *lhead;
const char *path1 = NULL;
@@ -2874,10 +2927,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
void *p, *end;
int ret;
bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
- bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,
- &session->s_features);
+ u16 request_head_version = mds_supported_head_version(session);
+ kuid_t caller_fsuid = req->r_cred->fsuid;
+ kgid_t caller_fsgid = req->r_cred->fsgid;
- ret = set_request_path_attr(req->r_inode, req->r_dentry,
+ ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
req->r_parent, req->r_path1, req->r_ino1.ino,
&path1, &pathlen1, &ino1, &freepath1,
test_bit(CEPH_MDS_R_PARENT_LOCKED,
@@ -2891,7 +2945,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
if (req->r_old_dentry &&
!(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
old_dentry = req->r_old_dentry;
- ret = set_request_path_attr(NULL, old_dentry,
+ ret = set_request_path_attr(mdsc, NULL, old_dentry,
req->r_old_dentry_dir,
req->r_path2, req->r_ino2.ino,
&path2, &pathlen2, &ino2, &freepath2, true);
@@ -2916,8 +2970,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
*/
if (legacy)
len = sizeof(struct ceph_mds_request_head_legacy);
- else if (old_version)
+ else if (request_head_version == 1)
len = sizeof(struct ceph_mds_request_head_old);
+ else if (request_head_version == 2)
+ len = offsetofend(struct ceph_mds_request_head, ext_num_fwd);
else
len = sizeof(struct ceph_mds_request_head);
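Taken together, the preceding hunks make the encoded request-head layout a function of the peer's capabilities: the legacy head when the connection lacks CEPH_FEATURE_FS_BTIME, the old fixed-size head for version 1, a head truncated after ext_num_fwd for version 2, and the full structure otherwise. The sketch below simply restates that negotiation in one place; it is illustrative only and not part of the patch (the names are the ones already used in the hunks above, the wrapper function itself is hypothetical):

/*
 * Illustrative only: consolidated view of the head-size selection
 * that create_request_message() performs after this change.
 */
static size_t mds_request_head_len(struct ceph_mds_session *session, bool legacy)
{
	u16 v = mds_supported_head_version(session);

	if (legacy)	/* peer lacks CEPH_FEATURE_FS_BTIME */
		return sizeof(struct ceph_mds_request_head_legacy);
	if (v == 1)	/* peer lacks CEPHFS_FEATURE_32BITS_RETRY_FWD */
		return sizeof(struct ceph_mds_request_head_old);
	if (v == 2)	/* peer lacks CEPHFS_FEATURE_HAS_OWNER_UIDGID */
		return offsetofend(struct ceph_mds_request_head, ext_num_fwd);
	return sizeof(struct ceph_mds_request_head);
}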
@@ -2967,6 +3023,30 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
lhead = find_legacy_request_head(msg->front.iov_base,
session->s_con.peer_features);
+ if ((req->r_mnt_idmap != &nop_mnt_idmap) &&
+ !test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) {
+ WARN_ON_ONCE(!IS_CEPH_MDS_OP_NEWINODE(req->r_op));
+
+ if (enable_unsafe_idmap) {
+ pr_warn_once_client(cl,
+ "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"
+ " is not supported by MDS. UID/GID-based restrictions may"
+ " not work properly.\n");
+
+ caller_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,
+ VFSUIDT_INIT(req->r_cred->fsuid));
+ caller_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,
+ VFSGIDT_INIT(req->r_cred->fsgid));
+ } else {
+ pr_err_ratelimited_client(cl,
+ "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID"
+ " is not supported by MDS. Fail request with -EIO.\n");
+
+ ret = -EIO;
+ goto out_err;
+ }
+ }
+
/*
* The ceph_mds_request_head_legacy didn't contain a version field, and
* one was added when we moved the message version from 3->4.
@@ -2974,17 +3054,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
if (legacy) {
msg->hdr.version = cpu_to_le16(3);
p = msg->front.iov_base + sizeof(*lhead);
- } else if (old_version) {
+ } else if (request_head_version == 1) {
struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
msg->hdr.version = cpu_to_le16(4);
ohead->version = cpu_to_le16(1);
p = msg->front.iov_base + sizeof(*ohead);
+ } else if (request_head_version == 2) {
+ struct ceph_mds_request_head *nhead = msg->front.iov_base;
+
+ msg->hdr.version = cpu_to_le16(6);
+ nhead->version = cpu_to_le16(2);
+
+ p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, ext_num_fwd);
} else {
struct ceph_mds_request_head *nhead = msg->front.iov_base;
+ kuid_t owner_fsuid;
+ kgid_t owner_fsgid;
msg->hdr.version = cpu_to_le16(6);
nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
+ nhead->struct_len = cpu_to_le32(sizeof(struct ceph_mds_request_head));
+
+ if (IS_CEPH_MDS_OP_NEWINODE(req->r_op)) {
+ owner_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns,
+ VFSUIDT_INIT(req->r_cred->fsuid));
+ owner_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns,
+ VFSGIDT_INIT(req->r_cred->fsgid));
+ nhead->owner_uid = cpu_to_le32(from_kuid(&init_user_ns, owner_fsuid));
+ nhead->owner_gid = cpu_to_le32(from_kgid(&init_user_ns, owner_fsgid));
+ } else {
+ nhead->owner_uid = cpu_to_le32(-1);
+ nhead->owner_gid = cpu_to_le32(-1);
+ }
+
p = msg->front.iov_base + sizeof(*nhead);
}
@@ -2993,9 +3096,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
lhead->op = cpu_to_le32(req->r_op);
lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
- req->r_cred->fsuid));
+ caller_fsuid));
lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
- req->r_cred->fsgid));
+ caller_fsgid));
lhead->ino = cpu_to_le64(req->r_deleg_ino);
lhead->args = req->r_args;
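The idmapped-mount handling introduced in the preceding hunks follows a single pattern: a kernel-internal kuid_t/kgid_t is first translated through the mount's idmapping and only then flattened to the numeric id that is encoded for the wire in init_user_ns. A minimal sketch of that translation, using only the helpers the patch itself calls (from_vfsuid(), VFSUIDT_INIT(), from_kuid()); the wrapper function is hypothetical and illustrative, not part of the patch:

/* Hypothetical helper: map a caller uid through an idmapped mount for wire encoding. */
static u32 wire_uid(struct mnt_idmap *idmap, kuid_t fsuid)
{
	kuid_t mapped = from_vfsuid(idmap, &init_user_ns, VFSUIDT_INIT(fsuid));

	return from_kuid(&init_user_ns, mapped);	/* value stored in the request head */
}

With req->r_mnt_idmap == &nop_mnt_idmap this is an identity mapping; otherwise the mapped value ends up either in the legacy caller_uid/caller_gid fields (only when the MDS lacks CEPHFS_FEATURE_HAS_OWNER_UIDGID and enable_unsafe_idmap allows it) or in the new owner_uid/owner_gid fields of the extended request head for inode-creating operations.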
@@ -3099,6 +3202,7 @@ static int __prepare_send_request(struct ceph_mds_session *session,
{
int mds = session->s_mds;
struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request_head_legacy *lhead;
struct ceph_mds_request_head *nhead;
struct ceph_msg *msg;
@@ -3117,8 +3221,8 @@ static int __prepare_send_request(struct ceph_mds_session *session,
old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);
if ((old_version && req->r_attempts >= old_max_retry) ||
((uint32_t)req->r_attempts >= U32_MAX)) {
- pr_warn_ratelimited("%s request tid %llu seq overflow\n",
- __func__, req->r_tid);
+ pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n",
+ req->r_tid);
return -EMULTIHOP;
}
}
@@ -3133,8 +3237,8 @@ static int __prepare_send_request(struct ceph_mds_session *session,
else
req->r_sent_on_mseq = -1;
}
- dout("%s %p tid %lld %s (attempt %d)\n", __func__, req,
- req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
+ doutc(cl, "%p tid %lld %s (attempt %d)\n", req, req->r_tid,
+ ceph_mds_op_name(req->r_op), req->r_attempts);
if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
void *p;
@@ -3202,7 +3306,7 @@ static int __prepare_send_request(struct ceph_mds_session *session,
nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
}
- dout(" r_parent = %p\n", req->r_parent);
+ doutc(cl, " r_parent = %p\n", req->r_parent);
return 0;
}
@@ -3230,6 +3334,7 @@ static int __send_request(struct ceph_mds_session *session,
static void __do_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_session *session = NULL;
int mds = -1;
int err = 0;
@@ -3242,29 +3347,29 @@ static void __do_request(struct ceph_mds_client *mdsc,
}
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) {
- dout("do_request metadata corrupted\n");
+ doutc(cl, "metadata corrupted\n");
err = -EIO;
goto finish;
}
if (req->r_timeout &&
time_after_eq(jiffies, req->r_started + req->r_timeout)) {
- dout("do_request timed out\n");
+ doutc(cl, "timed out\n");
err = -ETIMEDOUT;
goto finish;
}
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
- dout("do_request forced umount\n");
+ doutc(cl, "forced umount\n");
err = -EIO;
goto finish;
}
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
if (mdsc->mdsmap_err) {
err = mdsc->mdsmap_err;
- dout("do_request mdsmap err %d\n", err);
+ doutc(cl, "mdsmap err %d\n", err);
goto finish;
}
if (mdsc->mdsmap->m_epoch == 0) {
- dout("do_request no mdsmap, waiting for map\n");
+ doutc(cl, "no mdsmap, waiting for map\n");
list_add(&req->r_wait, &mdsc->waiting_for_map);
return;
}
@@ -3285,7 +3390,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
err = -EJUKEBOX;
goto finish;
}
- dout("do_request no mds or not active, waiting for map\n");
+ doutc(cl, "no mds or not active, waiting for map\n");
list_add(&req->r_wait, &mdsc->waiting_for_map);
return;
}
@@ -3301,8 +3406,8 @@ static void __do_request(struct ceph_mds_client *mdsc,
}
req->r_session = ceph_get_mds_session(session);
- dout("do_request mds%d session %p state %s\n", mds, session,
- ceph_session_state_name(session->s_state));
+ doutc(cl, "mds%d session %p state %s\n", mds, session,
+ ceph_session_state_name(session->s_state));
/*
* The old ceph will crash the MDSs when see unknown OPs
@@ -3393,8 +3498,8 @@ static void __do_request(struct ceph_mds_client *mdsc,
spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) {
- dout("do_request session changed for auth cap %d -> %d\n",
- cap->session->s_mds, session->s_mds);
+ doutc(cl, "session changed for auth cap %d -> %d\n",
+ cap->session->s_mds, session->s_mds);
/* Remove the auth cap from old session */
spin_lock(&cap->session->s_cap_lock);
@@ -3421,7 +3526,7 @@ out_session:
ceph_put_mds_session(session);
finish:
if (err) {
- dout("__do_request early error %d\n", err);
+ doutc(cl, "early error %d\n", err);
req->r_err = err;
complete_request(mdsc, req);
__unregister_request(mdsc, req);
@@ -3435,6 +3540,7 @@ finish:
static void __wake_requests(struct ceph_mds_client *mdsc,
struct list_head *head)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
LIST_HEAD(tmp_list);
@@ -3444,7 +3550,8 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
req = list_entry(tmp_list.next,
struct ceph_mds_request, r_wait);
list_del_init(&req->r_wait);
- dout(" wake request %p tid %llu\n", req, req->r_tid);
+ doutc(cl, " wake request %p tid %llu\n", req,
+ req->r_tid);
__do_request(mdsc, req);
}
}
@@ -3455,10 +3562,11 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
*/
static void kick_requests(struct ceph_mds_client *mdsc, int mds)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct rb_node *p = rb_first(&mdsc->request_tree);
- dout("kick_requests mds%d\n", mds);
+ doutc(cl, "kick_requests mds%d\n", mds);
while (p) {
req = rb_entry(p, struct ceph_mds_request, r_node);
p = rb_next(p);
@@ -3468,7 +3576,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
continue; /* only new requests */
if (req->r_session &&
req->r_session->s_mds == mds) {
- dout(" kicking tid %llu\n", req->r_tid);
+ doutc(cl, " kicking tid %llu\n", req->r_tid);
list_del_init(&req->r_wait);
__do_request(mdsc, req);
}
@@ -3478,6 +3586,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
struct ceph_mds_request *req)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int err = 0;
/* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
@@ -3499,8 +3608,7 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
if (req->r_inode) {
err = ceph_wait_on_async_create(req->r_inode);
if (err) {
- dout("%s: wait for async create returned: %d\n",
- __func__, err);
+ doutc(cl, "wait for async create returned: %d\n", err);
return err;
}
}
@@ -3508,13 +3616,12 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
if (!err && req->r_old_inode) {
err = ceph_wait_on_async_create(req->r_old_inode);
if (err) {
- dout("%s: wait for async create returned: %d\n",
- __func__, err);
+ doutc(cl, "wait for async create returned: %d\n", err);
return err;
}
}
- dout("submit_request on %p for inode %p\n", req, dir);
+ doutc(cl, "submit_request on %p for inode %p\n", req, dir);
mutex_lock(&mdsc->mutex);
__register_request(mdsc, req, dir);
__do_request(mdsc, req);
@@ -3527,10 +3634,11 @@ int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req,
ceph_mds_request_wait_callback_t wait_func)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int err;
/* wait */
- dout("do_request waiting\n");
+ doutc(cl, "do_request waiting\n");
if (wait_func) {
err = wait_func(mdsc, req);
} else {
@@ -3544,14 +3652,14 @@ int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
else
err = timeleft; /* killed */
}
- dout("do_request waited, got %d\n", err);
+ doutc(cl, "do_request waited, got %d\n", err);
mutex_lock(&mdsc->mutex);
/* only abort if we didn't race with a real reply */
if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
err = le32_to_cpu(req->r_reply_info.head->result);
} else if (err < 0) {
- dout("aborted request %lld with %d\n", req->r_tid, err);
+ doutc(cl, "aborted request %lld with %d\n", req->r_tid, err);
/*
* ensure we aren't running concurrently with
@@ -3582,15 +3690,16 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req)
{
+ struct ceph_client *cl = mdsc->fsc->client;
int err;
- dout("do_request on %p\n", req);
+ doutc(cl, "do_request on %p\n", req);
/* issue */
err = ceph_mdsc_submit_request(mdsc, dir, req);
if (!err)
err = ceph_mdsc_wait_request(mdsc, req, NULL);
- dout("do_request %p done, result %d\n", req, err);
+ doutc(cl, "do_request %p done, result %d\n", req, err);
return err;
}
@@ -3602,8 +3711,10 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req)
{
struct inode *dir = req->r_parent;
struct inode *old_dir = req->r_old_dentry_dir;
+ struct ceph_client *cl = req->r_mdsc->fsc->client;
- dout("invalidate_dir_request %p %p (complete, lease(s))\n", dir, old_dir);
+ doutc(cl, "invalidate_dir_request %p %p (complete, lease(s))\n",
+ dir, old_dir);
ceph_dir_clear_complete(dir);
if (old_dir)
@@ -3624,6 +3735,7 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req)
static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
{
struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct ceph_mds_reply_head *head = msg->front.iov_base;
struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
@@ -3634,7 +3746,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
bool close_sessions = false;
if (msg->front.iov_len < sizeof(*head)) {
- pr_err("mdsc_handle_reply got corrupt (short) reply\n");
+ pr_err_client(cl, "got corrupt (short) reply\n");
ceph_msg_dump(msg);
return;
}
@@ -3644,17 +3756,17 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_lock(&mdsc->mutex);
req = lookup_get_request(mdsc, tid);
if (!req) {
- dout("handle_reply on unknown tid %llu\n", tid);
+ doutc(cl, "on unknown tid %llu\n", tid);
mutex_unlock(&mdsc->mutex);
return;
}
- dout("handle_reply %p\n", req);
+ doutc(cl, "handle_reply %p\n", req);
/* correct session? */
if (req->r_session != session) {
- pr_err("mdsc_handle_reply got %llu on session mds%d"
- " not mds%d\n", tid, session->s_mds,
- req->r_session ? req->r_session->s_mds : -1);
+ pr_err_client(cl, "got %llu on session mds%d not mds%d\n",
+ tid, session->s_mds,
+ req->r_session ? req->r_session->s_mds : -1);
mutex_unlock(&mdsc->mutex);
goto out;
}
@@ -3662,14 +3774,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
/* dup? */
if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) ||
(test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) {
- pr_warn("got a dup %s reply on %llu from mds%d\n",
- head->safe ? "safe" : "unsafe", tid, mds);
+ pr_warn_client(cl, "got a dup %s reply on %llu from mds%d\n",
+ head->safe ? "safe" : "unsafe", tid, mds);
mutex_unlock(&mdsc->mutex);
goto out;
}
if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) {
- pr_warn("got unsafe after safe on %llu from mds%d\n",
- tid, mds);
+ pr_warn_client(cl, "got unsafe after safe on %llu from mds%d\n",
+ tid, mds);
mutex_unlock(&mdsc->mutex);
goto out;
}
@@ -3692,7 +3804,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
* response. And even if it did, there is nothing
* useful we could do with a revised return value.
*/
- dout("got safe reply %llu, mds%d\n", tid, mds);
+ doutc(cl, "got safe reply %llu, mds%d\n", tid, mds);
mutex_unlock(&mdsc->mutex);
goto out;
@@ -3702,7 +3814,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
}
- dout("handle_reply tid %lld result %d\n", tid, result);
+ doutc(cl, "tid %lld result %d\n", tid, result);
if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
err = parse_reply_info(session, msg, req, (u64)-1);
else
@@ -3742,7 +3854,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_lock(&session->s_mutex);
if (err < 0) {
- pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
+ pr_err_client(cl, "got corrupt reply mds%d(tid:%lld)\n",
+ mds, tid);
ceph_msg_dump(msg);
goto out_err;
}
@@ -3806,7 +3919,7 @@ out_err:
set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags);
}
} else {
- dout("reply arrived after request %lld was aborted\n", tid);
+ doutc(cl, "reply arrived after request %lld was aborted\n", tid);
}
mutex_unlock(&mdsc->mutex);
@@ -3835,6 +3948,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
u64 tid = le64_to_cpu(msg->hdr.tid);
u32 next_mds;
@@ -3852,12 +3966,12 @@ static void handle_forward(struct ceph_mds_client *mdsc,
req = lookup_get_request(mdsc, tid);
if (!req) {
mutex_unlock(&mdsc->mutex);
- dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
+ doutc(cl, "forward tid %llu to mds%d - req dne\n", tid, next_mds);
return; /* dup reply? */
}
if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
- dout("forward tid %llu aborted, unregistering\n", tid);
+ doutc(cl, "forward tid %llu aborted, unregistering\n", tid);
__unregister_request(mdsc, req);
} else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) {
/*
@@ -3873,10 +3987,11 @@ static void handle_forward(struct ceph_mds_client *mdsc,
set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
mutex_unlock(&req->r_fill_mutex);
aborted = true;
- pr_warn_ratelimited("forward tid %llu seq overflow\n", tid);
+ pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n",
+ tid);
} else {
/* resend. forward race not possible; mds would drop */
- dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
+ doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds);
BUG_ON(req->r_err);
BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags));
req->r_attempts = 0;
@@ -3894,7 +4009,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
return;
bad:
- pr_err("mdsc_handle_forward decode error err=%d\n", err);
+ pr_err_client(cl, "decode error err=%d\n", err);
ceph_msg_dump(msg);
}
@@ -3933,6 +4048,7 @@ static void handle_session(struct ceph_mds_session *session,
struct ceph_msg *msg)
{
struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
int mds = session->s_mds;
int msg_version = le16_to_cpu(msg->hdr.version);
void *p = msg->front.iov_base;
@@ -3980,7 +4096,8 @@ static void handle_session(struct ceph_mds_session *session,
/* version >= 5, flags */
ceph_decode_32_safe(&p, end, flags, bad);
if (flags & CEPH_SESSION_BLOCKLISTED) {
- pr_warn("mds%d session blocklisted\n", session->s_mds);
+ pr_warn_client(cl, "mds%d session blocklisted\n",
+ session->s_mds);
blocklisted = true;
}
}
@@ -3996,22 +4113,24 @@ static void handle_session(struct ceph_mds_session *session,
mutex_lock(&session->s_mutex);
- dout("handle_session mds%d %s %p state %s seq %llu\n",
- mds, ceph_session_op_name(op), session,
- ceph_session_state_name(session->s_state), seq);
+ doutc(cl, "mds%d %s %p state %s seq %llu\n", mds,
+ ceph_session_op_name(op), session,
+ ceph_session_state_name(session->s_state), seq);
if (session->s_state == CEPH_MDS_SESSION_HUNG) {
session->s_state = CEPH_MDS_SESSION_OPEN;
- pr_info("mds%d came back\n", session->s_mds);
+ pr_info_client(cl, "mds%d came back\n", session->s_mds);
}
switch (op) {
case CEPH_SESSION_OPEN:
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
- pr_info("mds%d reconnect success\n", session->s_mds);
+ pr_info_client(cl, "mds%d reconnect success\n",
+ session->s_mds);
if (session->s_state == CEPH_MDS_SESSION_OPEN) {
- pr_notice("mds%d is already opened\n", session->s_mds);
+ pr_notice_client(cl, "mds%d is already opened\n",
+ session->s_mds);
} else {
session->s_state = CEPH_MDS_SESSION_OPEN;
session->s_features = features;
@@ -4041,7 +4160,8 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_CLOSE:
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
- pr_info("mds%d reconnect denied\n", session->s_mds);
+ pr_info_client(cl, "mds%d reconnect denied\n",
+ session->s_mds);
session->s_state = CEPH_MDS_SESSION_CLOSED;
cleanup_session_requests(mdsc, session);
remove_session_caps(session);
@@ -4050,8 +4170,8 @@ static void handle_session(struct ceph_mds_session *session,
break;
case CEPH_SESSION_STALE:
- pr_info("mds%d caps went stale, renewing\n",
- session->s_mds);
+ pr_info_client(cl, "mds%d caps went stale, renewing\n",
+ session->s_mds);
atomic_inc(&session->s_cap_gen);
session->s_cap_ttl = jiffies - 1;
send_renew_caps(mdsc, session);
@@ -4072,7 +4192,7 @@ static void handle_session(struct ceph_mds_session *session,
break;
case CEPH_SESSION_FORCE_RO:
- dout("force_session_readonly %p\n", session);
+ doutc(cl, "force_session_readonly %p\n", session);
spin_lock(&session->s_cap_lock);
session->s_readonly = true;
spin_unlock(&session->s_cap_lock);
@@ -4081,7 +4201,8 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_REJECT:
WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING);
- pr_info("mds%d rejected session\n", session->s_mds);
+ pr_info_client(cl, "mds%d rejected session\n",
+ session->s_mds);
session->s_state = CEPH_MDS_SESSION_REJECTED;
cleanup_session_requests(mdsc, session);
remove_session_caps(session);
@@ -4091,7 +4212,7 @@ static void handle_session(struct ceph_mds_session *session,
break;
default:
- pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
+ pr_err_client(cl, "bad op %d mds%d\n", op, mds);
WARN_ON(1);
}
@@ -4108,30 +4229,32 @@ static void handle_session(struct ceph_mds_session *session,
return;
bad:
- pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
- (int)msg->front.iov_len);
+ pr_err_client(cl, "corrupt message mds%d len %d\n", mds,
+ (int)msg->front.iov_len);
ceph_msg_dump(msg);
return;
}
void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)
{
+ struct ceph_client *cl = req->r_mdsc->fsc->client;
int dcaps;
dcaps = xchg(&req->r_dir_caps, 0);
if (dcaps) {
- dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
+ doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);
}
}
void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req)
{
+ struct ceph_client *cl = req->r_mdsc->fsc->client;
int dcaps;
dcaps = xchg(&req->r_dir_caps, 0);
if (dcaps) {
- dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
+ doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
ceph_put_cap_refs_no_check_caps(ceph_inode(req->r_parent),
dcaps);
}
@@ -4146,7 +4269,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req, *nreq;
struct rb_node *p;
- dout("replay_unsafe_requests mds%d\n", session->s_mds);
+ doutc(mdsc->fsc->client, "mds%d\n", session->s_mds);
mutex_lock(&mdsc->mutex);
list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item)
@@ -4290,6 +4413,8 @@ out_unlock:
*/
static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
union {
struct ceph_mds_cap_reconnect v2;
struct ceph_mds_cap_reconnect_v1 v1;
@@ -4307,7 +4432,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
dentry = d_find_primary(inode);
if (dentry) {
/* set pathbase to parent dir when msg_version >= 2 */
- path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase,
+ path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase,
recon_state->msg_version >= 2);
dput(dentry);
if (IS_ERR(path)) {
@@ -4326,9 +4451,9 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
err = 0;
goto out_err;
}
- dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
- inode, ceph_vinop(inode), cap, cap->cap_id,
- ceph_cap_string(cap->issued));
+ doutc(cl, " adding %p ino %llx.%llx cap %p %lld %s\n", inode,
+ ceph_vinop(inode), cap, cap->cap_id,
+ ceph_cap_string(cap->issued));
cap->seq = 0; /* reset cap seq */
cap->issue_seq = 0; /* and issue_seq */
@@ -4353,12 +4478,16 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
rec.v2.flock_len = (__force __le32)
((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
} else {
+ struct timespec64 ts;
+
rec.v1.cap_id = cpu_to_le64(cap->cap_id);
rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v1.issued = cpu_to_le32(cap->issued);
rec.v1.size = cpu_to_le64(i_size_read(inode));
- ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
- ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
+ ts = inode_get_mtime(inode);
+ ceph_encode_timespec64(&rec.v1.mtime, &ts);
+ ts = inode_get_atime(inode);
+ ceph_encode_timespec64(&rec.v1.atime, &ts);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
rec.v1.pathbase = cpu_to_le64(pathbase);
}
@@ -4478,6 +4607,7 @@ static int encode_snap_realms(struct ceph_mds_client *mdsc,
{
struct rb_node *p;
struct ceph_pagelist *pagelist = recon_state->pagelist;
+ struct ceph_client *cl = mdsc->fsc->client;
int err = 0;
if (recon_state->msg_version >= 4) {
@@ -4516,8 +4646,8 @@ static int encode_snap_realms(struct ceph_mds_client *mdsc,
ceph_pagelist_encode_32(pagelist, sizeof(sr_rec));
}
- dout(" adding snap realm %llx seq %lld parent %llx\n",
- realm->ino, realm->seq, realm->parent_ino);
+ doutc(cl, " adding snap realm %llx seq %lld parent %llx\n",
+ realm->ino, realm->seq, realm->parent_ino);
sr_rec.ino = cpu_to_le64(realm->ino);
sr_rec.seq = cpu_to_le64(realm->seq);
sr_rec.parent = cpu_to_le64(realm->parent_ino);
@@ -4546,6 +4676,7 @@ fail:
static void send_mds_reconnect(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *reply;
int mds = session->s_mds;
int err = -ENOMEM;
@@ -4554,7 +4685,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
};
LIST_HEAD(dispose);
- pr_info("mds%d reconnect start\n", mds);
+ pr_info_client(cl, "mds%d reconnect start\n", mds);
recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS);
if (!recon_state.pagelist)
@@ -4570,8 +4701,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
session->s_state = CEPH_MDS_SESSION_RECONNECTING;
session->s_seq = 0;
- dout("session %p state %s\n", session,
- ceph_session_state_name(session->s_state));
+ doutc(cl, "session %p state %s\n", session,
+ ceph_session_state_name(session->s_state));
atomic_inc(&session->s_cap_gen);
@@ -4705,7 +4836,8 @@ fail:
fail_nomsg:
ceph_pagelist_release(recon_state.pagelist);
fail_nopagelist:
- pr_err("error %d preparing reconnect for mds%d\n", err, mds);
+ pr_err_client(cl, "error %d preparing reconnect for mds%d\n",
+ err, mds);
return;
}
@@ -4724,9 +4856,9 @@ static void check_new_map(struct ceph_mds_client *mdsc,
int oldstate, newstate;
struct ceph_mds_session *s;
unsigned long targets[DIV_ROUND_UP(CEPH_MAX_MDS, sizeof(unsigned long))] = {0};
+ struct ceph_client *cl = mdsc->fsc->client;
- dout("check_new_map new %u old %u\n",
- newmap->m_epoch, oldmap->m_epoch);
+ doutc(cl, "new %u old %u\n", newmap->m_epoch, oldmap->m_epoch);
if (newmap->m_info) {
for (i = 0; i < newmap->possible_max_rank; i++) {
@@ -4742,12 +4874,12 @@ static void check_new_map(struct ceph_mds_client *mdsc,
oldstate = ceph_mdsmap_get_state(oldmap, i);
newstate = ceph_mdsmap_get_state(newmap, i);
- dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
- i, ceph_mds_state_name(oldstate),
- ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
- ceph_mds_state_name(newstate),
- ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
- ceph_session_state_name(s->s_state));
+ doutc(cl, "mds%d state %s%s -> %s%s (session %s)\n",
+ i, ceph_mds_state_name(oldstate),
+ ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
+ ceph_mds_state_name(newstate),
+ ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
+ ceph_session_state_name(s->s_state));
if (i >= newmap->possible_max_rank) {
/* force close session for stopped mds */
@@ -4800,7 +4932,8 @@ static void check_new_map(struct ceph_mds_client *mdsc,
newstate >= CEPH_MDS_STATE_ACTIVE) {
if (oldstate != CEPH_MDS_STATE_CREATING &&
oldstate != CEPH_MDS_STATE_STARTING)
- pr_info("mds%d recovery completed\n", s->s_mds);
+ pr_info_client(cl, "mds%d recovery completed\n",
+ s->s_mds);
kick_requests(mdsc, i);
mutex_unlock(&mdsc->mutex);
mutex_lock(&s->s_mutex);
@@ -4844,12 +4977,13 @@ static void check_new_map(struct ceph_mds_client *mdsc,
s = __open_export_target_session(mdsc, i);
if (IS_ERR(s)) {
err = PTR_ERR(s);
- pr_err("failed to open export target session, err %d\n",
- err);
+ pr_err_client(cl,
+ "failed to open export target session, err %d\n",
+ err);
continue;
}
}
- dout("send reconnect to export target mds.%d\n", i);
+ doutc(cl, "send reconnect to export target mds.%d\n", i);
mutex_unlock(&mdsc->mutex);
send_mds_reconnect(mdsc, s);
ceph_put_mds_session(s);
@@ -4865,8 +4999,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
if (s->s_state == CEPH_MDS_SESSION_OPEN ||
s->s_state == CEPH_MDS_SESSION_HUNG ||
s->s_state == CEPH_MDS_SESSION_CLOSING) {
- dout(" connecting to export targets of laggy mds%d\n",
- i);
+ doutc(cl, " connecting to export targets of laggy mds%d\n", i);
__open_export_target_sessions(mdsc, s);
}
}
@@ -4893,6 +5026,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct super_block *sb = mdsc->fsc->sb;
struct inode *inode;
struct dentry *parent, *dentry;
@@ -4904,7 +5038,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
struct qstr dname;
int release = 0;
- dout("handle_lease from mds%d\n", mds);
+ doutc(cl, "from mds%d\n", mds);
if (!ceph_inc_mds_stopping_blocker(mdsc, session))
return;
@@ -4922,20 +5056,19 @@ static void handle_lease(struct ceph_mds_client *mdsc,
/* lookup inode */
inode = ceph_find_inode(sb, vino);
- dout("handle_lease %s, ino %llx %p %.*s\n",
- ceph_lease_op_name(h->action), vino.ino, inode,
- dname.len, dname.name);
+ doutc(cl, "%s, ino %llx %p %.*s\n", ceph_lease_op_name(h->action),
+ vino.ino, inode, dname.len, dname.name);
mutex_lock(&session->s_mutex);
if (!inode) {
- dout("handle_lease no inode %llx\n", vino.ino);
+ doutc(cl, "no inode %llx\n", vino.ino);
goto release;
}
/* dentry */
parent = d_find_alias(inode);
if (!parent) {
- dout("no parent dentry on inode %p\n", inode);
+ doutc(cl, "no parent dentry on inode %p\n", inode);
WARN_ON(1);
goto release; /* hrm... */
}
@@ -4995,7 +5128,7 @@ out:
bad:
ceph_dec_mds_stopping_blocker(mdsc);
- pr_err("corrupt lease message\n");
+ pr_err_client(cl, "corrupt lease message\n");
ceph_msg_dump(msg);
}
@@ -5003,13 +5136,14 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
struct dentry *dentry, char action,
u32 seq)
{
+ struct ceph_client *cl = session->s_mdsc->fsc->client;
struct ceph_msg *msg;
struct ceph_mds_lease *lease;
struct inode *dir;
int len = sizeof(*lease) + sizeof(u32) + NAME_MAX;
- dout("lease_send_msg identry %p %s to mds%d\n",
- dentry, ceph_lease_op_name(action), session->s_mds);
+ doutc(cl, "identry %p %s to mds%d\n", dentry, ceph_lease_op_name(action),
+ session->s_mds);
msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
if (!msg)
@@ -5042,6 +5176,7 @@ static void lock_unlock_session(struct ceph_mds_session *s)
static void maybe_recover_session(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_fs_client *fsc = mdsc->fsc;
if (!ceph_test_mount_opt(fsc, CLEANRECOVER))
@@ -5053,17 +5188,19 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
if (!READ_ONCE(fsc->blocklisted))
return;
- pr_info("auto reconnect after blocklisted\n");
+ pr_info_client(cl, "auto reconnect after blocklisted\n");
ceph_force_reconnect(fsc->sb);
}
bool check_session_state(struct ceph_mds_session *s)
{
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
+
switch (s->s_state) {
case CEPH_MDS_SESSION_OPEN:
if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
s->s_state = CEPH_MDS_SESSION_HUNG;
- pr_info("mds%d hung\n", s->s_mds);
+ pr_info_client(cl, "mds%d hung\n", s->s_mds);
}
break;
case CEPH_MDS_SESSION_CLOSING:
@@ -5083,6 +5220,8 @@ bool check_session_state(struct ceph_mds_session *s)
*/
void inc_session_sequence(struct ceph_mds_session *s)
{
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
+
lockdep_assert_held(&s->s_mutex);
s->s_seq++;
@@ -5090,11 +5229,11 @@ void inc_session_sequence(struct ceph_mds_session *s)
if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
int ret;
- dout("resending session close request for mds%d\n", s->s_mds);
+ doutc(cl, "resending session close request for mds%d\n", s->s_mds);
ret = request_close_session(s);
if (ret < 0)
- pr_err("unable to close session to mds%d: %d\n",
- s->s_mds, ret);
+ pr_err_client(cl, "unable to close session to mds%d: %d\n",
+ s->s_mds, ret);
}
}
@@ -5123,7 +5262,7 @@ static void delayed_work(struct work_struct *work)
int renew_caps;
int i;
- dout("mdsc delayed_work\n");
+ doutc(mdsc->fsc->client, "mdsc delayed_work\n");
if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED)
return;
@@ -5252,6 +5391,7 @@ err_mdsc:
*/
static void wait_requests(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_options *opts = mdsc->fsc->client->options;
struct ceph_mds_request *req;
@@ -5259,25 +5399,25 @@ static void wait_requests(struct ceph_mds_client *mdsc)
if (__get_oldest_req(mdsc)) {
mutex_unlock(&mdsc->mutex);
- dout("wait_requests waiting for requests\n");
+ doutc(cl, "waiting for requests\n");
wait_for_completion_timeout(&mdsc->safe_umount_waiters,
ceph_timeout_jiffies(opts->mount_timeout));
/* tear down remaining requests */
mutex_lock(&mdsc->mutex);
while ((req = __get_oldest_req(mdsc))) {
- dout("wait_requests timed out on tid %llu\n",
- req->r_tid);
+ doutc(cl, "timed out on tid %llu\n", req->r_tid);
list_del_init(&req->r_wait);
__unregister_request(mdsc, req);
}
}
mutex_unlock(&mdsc->mutex);
- dout("wait_requests done\n");
+ doutc(cl, "done\n");
}
void send_flush_mdlog(struct ceph_mds_session *s)
{
+ struct ceph_client *cl = s->s_mdsc->fsc->client;
struct ceph_msg *msg;
/*
@@ -5287,13 +5427,13 @@ void send_flush_mdlog(struct ceph_mds_session *s)
return;
mutex_lock(&s->s_mutex);
- dout("request mdlog flush to mds%d (%s)s seq %lld\n", s->s_mds,
- ceph_session_state_name(s->s_state), s->s_seq);
+ doutc(cl, "request mdlog flush to mds%d (%s)s seq %lld\n",
+ s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);
msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG,
s->s_seq);
if (!msg) {
- pr_err("failed to request mdlog flush to mds%d (%s) seq %lld\n",
- s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);
+ pr_err_client(cl, "failed to request mdlog flush to mds%d (%s) seq %lld\n",
+ s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);
} else {
ceph_con_send(&s->s_con, msg);
}
@@ -5306,7 +5446,7 @@ void send_flush_mdlog(struct ceph_mds_session *s)
*/
void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
{
- dout("pre_umount\n");
+ doutc(mdsc->fsc->client, "begin\n");
mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN;
ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
@@ -5321,6 +5461,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
ceph_msgr_flush();
ceph_cleanup_quotarealms_inodes(mdsc);
+ doutc(mdsc->fsc->client, "done\n");
}
/*
@@ -5329,12 +5470,13 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
u64 want_tid)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req = NULL, *nextreq;
struct ceph_mds_session *last_session = NULL;
struct rb_node *n;
mutex_lock(&mdsc->mutex);
- dout("%s want %lld\n", __func__, want_tid);
+ doutc(cl, "want %lld\n", want_tid);
restart:
req = __get_oldest_req(mdsc);
while (req && req->r_tid <= want_tid) {
@@ -5368,8 +5510,8 @@ restart:
} else {
ceph_put_mds_session(s);
}
- dout("%s wait on %llu (want %llu)\n", __func__,
- req->r_tid, want_tid);
+ doutc(cl, "wait on %llu (want %llu)\n",
+ req->r_tid, want_tid);
wait_for_completion(&req->r_safe_completion);
mutex_lock(&mdsc->mutex);
@@ -5387,17 +5529,18 @@ restart:
}
mutex_unlock(&mdsc->mutex);
ceph_put_mds_session(last_session);
- dout("%s done\n", __func__);
+ doutc(cl, "done\n");
}
void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
u64 want_tid, want_flush;
if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN)
return;
- dout("sync\n");
+ doutc(cl, "sync\n");
mutex_lock(&mdsc->mutex);
want_tid = mdsc->last_tid;
mutex_unlock(&mdsc->mutex);
@@ -5413,8 +5556,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
}
spin_unlock(&mdsc->cap_dirty_lock);
- dout("sync want tid %lld flush_seq %lld\n",
- want_tid, want_flush);
+ doutc(cl, "sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
wait_caps_flush(mdsc, want_flush);
@@ -5436,11 +5578,12 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
{
struct ceph_options *opts = mdsc->fsc->client->options;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_session *session;
int i;
int skipped = 0;
- dout("close_sessions\n");
+ doutc(cl, "begin\n");
/* close sessions */
mutex_lock(&mdsc->mutex);
@@ -5458,7 +5601,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
}
mutex_unlock(&mdsc->mutex);
- dout("waiting for sessions to close\n");
+ doutc(cl, "waiting for sessions to close\n");
wait_event_timeout(mdsc->session_close_wq,
done_closing_sessions(mdsc, skipped),
ceph_timeout_jiffies(opts->mount_timeout));
@@ -5486,7 +5629,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
cancel_work_sync(&mdsc->cap_reclaim_work);
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
- dout("stopped\n");
+ doutc(cl, "done\n");
}
void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
@@ -5494,7 +5637,7 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
struct ceph_mds_session *session;
int mds;
- dout("force umount\n");
+ doutc(mdsc->fsc->client, "force umount\n");
mutex_lock(&mdsc->mutex);
for (mds = 0; mds < mdsc->max_sessions; mds++) {
@@ -5525,7 +5668,7 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
{
- dout("stop\n");
+ doutc(mdsc->fsc->client, "stop\n");
/*
* Make sure the delayed work stopped before releasing
* the resources.
@@ -5546,7 +5689,7 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
{
struct ceph_mds_client *mdsc = fsc->mdsc;
- dout("mdsc_destroy %p\n", mdsc);
+ doutc(fsc->client, "%p\n", mdsc);
if (!mdsc)
return;
@@ -5560,12 +5703,13 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
fsc->mdsc = NULL;
kfree(mdsc);
- dout("mdsc_destroy %p done\n", mdsc);
+ doutc(fsc->client, "%p done\n", mdsc);
}
void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
{
struct ceph_fs_client *fsc = mdsc->fsc;
+ struct ceph_client *cl = fsc->client;
const char *mds_namespace = fsc->mount_options->mds_namespace;
void *p = msg->front.iov_base;
void *end = p + msg->front.iov_len;
@@ -5577,7 +5721,7 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
ceph_decode_need(&p, end, sizeof(u32), bad);
epoch = ceph_decode_32(&p);
- dout("handle_fsmap epoch %u\n", epoch);
+ doutc(cl, "epoch %u\n", epoch);
/* struct_v, struct_cv, map_len, epoch, legacy_client_fscid */
ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 3, bad);
@@ -5622,7 +5766,8 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
return;
bad:
- pr_err("error decoding fsmap %d. Shutting down mount.\n", err);
+ pr_err_client(cl, "error decoding fsmap %d. Shutting down mount.\n",
+ err);
ceph_umount_begin(mdsc->fsc->sb);
ceph_msg_dump(msg);
err_out:
@@ -5637,6 +5782,7 @@ err_out:
*/
void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
{
+ struct ceph_client *cl = mdsc->fsc->client;
u32 epoch;
u32 maplen;
void *p = msg->front.iov_base;
@@ -5651,18 +5797,17 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
return;
epoch = ceph_decode_32(&p);
maplen = ceph_decode_32(&p);
- dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
+ doutc(cl, "epoch %u len %d\n", epoch, (int)maplen);
/* do we need it? */
mutex_lock(&mdsc->mutex);
if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
- dout("handle_map epoch %u <= our %u\n",
- epoch, mdsc->mdsmap->m_epoch);
+ doutc(cl, "epoch %u <= our %u\n", epoch, mdsc->mdsmap->m_epoch);
mutex_unlock(&mdsc->mutex);
return;
}
- newmap = ceph_mdsmap_decode(&p, end, ceph_msgr2(mdsc->fsc->client));
+ newmap = ceph_mdsmap_decode(mdsc, &p, end, ceph_msgr2(mdsc->fsc->client));
if (IS_ERR(newmap)) {
err = PTR_ERR(newmap);
goto bad_unlock;
@@ -5691,7 +5836,8 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
bad_unlock:
mutex_unlock(&mdsc->mutex);
bad:
- pr_err("error decoding mdsmap %d. Shutting down mount.\n", err);
+ pr_err_client(cl, "error decoding mdsmap %d. Shutting down mount.\n",
+ err);
ceph_umount_begin(mdsc->fsc->sb);
ceph_msg_dump(msg);
return;
@@ -5722,7 +5868,8 @@ static void mds_peer_reset(struct ceph_connection *con)
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
- pr_warn("mds%d closed our session\n", s->s_mds);
+ pr_warn_client(mdsc->fsc->client, "mds%d closed our session\n",
+ s->s_mds);
if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO)
send_mds_reconnect(mdsc, s);
}
@@ -5731,6 +5878,7 @@ static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
int type = le16_to_cpu(msg->hdr.type);
mutex_lock(&mdsc->mutex);
@@ -5770,8 +5918,8 @@ static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
break;
default:
- pr_err("received unknown message type %d %s\n", type,
- ceph_msg_type_name(type));
+ pr_err_client(cl, "received unknown message type %d %s\n",
+ type, ceph_msg_type_name(type));
}
out:
ceph_msg_put(msg);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 5a3714bdd64a..2e6ddaa13d72 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -14,9 +14,9 @@
#include <linux/ceph/types.h>
#include <linux/ceph/messenger.h>
-#include <linux/ceph/mdsmap.h>
#include <linux/ceph/auth.h>
+#include "mdsmap.h"
#include "metric.h"
#include "super.h"
@@ -33,8 +33,10 @@ enum ceph_feature_type {
CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
CEPHFS_FEATURE_OP_GETVXATTR,
CEPHFS_FEATURE_32BITS_RETRY_FWD,
+ CEPHFS_FEATURE_NEW_SNAPREALM_INFO,
+ CEPHFS_FEATURE_HAS_OWNER_UIDGID,
- CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_HAS_OWNER_UIDGID,
};
#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
@@ -49,6 +51,7 @@ enum ceph_feature_type {
CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
CEPHFS_FEATURE_OP_GETVXATTR, \
CEPHFS_FEATURE_32BITS_RETRY_FWD, \
+ CEPHFS_FEATURE_HAS_OWNER_UIDGID, \
}
/*
@@ -300,6 +303,7 @@ struct ceph_mds_request {
int r_fmode; /* file mode, if expecting cap */
int r_request_release_offset;
const struct cred *r_cred;
+ struct mnt_idmap *r_mnt_idmap;
struct timespec64 r_stamp;
/* for choosing which mds to send this request to */
@@ -581,7 +585,8 @@ static inline void ceph_mdsc_free_path(char *path, int len)
__putname(path - (PATH_MAX - 1 - len));
}
-extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
+extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc,
+ struct dentry *dentry, int *plen, u64 *base,
int for_wire);
extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
@@ -614,4 +619,6 @@ static inline int ceph_wait_on_async_create(struct inode *inode)
extern int ceph_wait_on_conflict_unlink(struct dentry *dentry);
extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
+
+extern bool enable_unsafe_idmap;
#endif
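
/*
 * Illustration only (not part of the patch): the hunks above add
 * CEPHFS_FEATURE_HAS_OWNER_UIDGID to the feature enum and to the
 * CEPHFS_FEATURES_CLIENT_SUPPORTED list.  Below is a minimal user-space
 * sketch of how such a feature-id list can be packed into a bitmap and
 * queried; the names (feat_set_bit, feat_test_bit, FEAT_*) and the
 * ordinals are hypothetical stand-ins, not the kernel's real encoding.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum {
	FEAT_32BITS_RETRY_FWD = 10,	/* example ordinals, for illustration */
	FEAT_HAS_OWNER_UIDGID = 12,
	FEAT_MAX              = FEAT_HAS_OWNER_UIDGID,
};

static void feat_set_bit(uint8_t *map, unsigned int bit)
{
	map[bit / 8] |= 1u << (bit % 8);
}

static int feat_test_bit(const uint8_t *map, unsigned int bit)
{
	return (map[bit / 8] >> (bit % 8)) & 1u;
}

int main(void)
{
	static const unsigned int supported[] = {
		FEAT_32BITS_RETRY_FWD,
		FEAT_HAS_OWNER_UIDGID,
	};
	uint8_t map[(FEAT_MAX / 8) + 1];
	unsigned int i;

	memset(map, 0, sizeof(map));
	for (i = 0; i < sizeof(supported) / sizeof(supported[0]); i++)
		feat_set_bit(map, supported[i]);

	/* A peer lacking the owner-uid/gid feature would fail this test. */
	printf("HAS_OWNER_UIDGID supported: %d\n",
	       feat_test_bit(map, FEAT_HAS_OWNER_UIDGID));
	return 0;
}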
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 7dac21ee6ce7..fae97c25ce58 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -7,10 +7,11 @@
#include <linux/slab.h>
#include <linux/types.h>
-#include <linux/ceph/mdsmap.h>
#include <linux/ceph/messenger.h>
#include <linux/ceph/decode.h>
+#include "mdsmap.h"
+#include "mds_client.h"
#include "super.h"
#define CEPH_MDS_IS_READY(i, ignore_laggy) \
@@ -114,8 +115,10 @@ bad:
* Ignore any fields we don't care about (there are quite a few of
* them).
*/
-struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
+struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p,
+ void *end, bool msgr2)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mdsmap *m;
const void *start = *p;
int i, j, n;
@@ -233,20 +236,18 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
*p = info_end;
}
- dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n",
- i+1, n, global_id, mds, inc,
- ceph_pr_addr(&addr),
- ceph_mds_state_name(state),
- laggy ? "(laggy)" : "");
+ doutc(cl, "%d/%d %lld mds%d.%d %s %s%s\n", i+1, n, global_id,
+ mds, inc, ceph_pr_addr(&addr),
+ ceph_mds_state_name(state), laggy ? "(laggy)" : "");
if (mds < 0 || mds >= m->possible_max_rank) {
- pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds);
+ pr_warn_client(cl, "got incorrect mds(%d)\n", mds);
continue;
}
if (state <= 0) {
- dout("mdsmap_decode got incorrect state(%s)\n",
- ceph_mds_state_name(state));
+ doutc(cl, "got incorrect state(%s)\n",
+ ceph_mds_state_name(state));
continue;
}
@@ -385,16 +386,16 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
m->m_max_xattr_size = 0;
}
bad_ext:
- dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
- !!m->m_enabled, !!m->m_damaged, m->m_num_laggy);
+ doutc(cl, "m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
+ !!m->m_enabled, !!m->m_damaged, m->m_num_laggy);
*p = end;
- dout("mdsmap_decode success epoch %u\n", m->m_epoch);
+ doutc(cl, "success epoch %u\n", m->m_epoch);
return m;
nomem:
err = -ENOMEM;
goto out_err;
corrupt:
- pr_err("corrupt mdsmap\n");
+ pr_err_client(cl, "corrupt mdsmap\n");
print_hex_dump(KERN_DEBUG, "mdsmap: ",
DUMP_PREFIX_OFFSET, 16, 1,
start, end - start, true);
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..89f1931f1ba6
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_MDSMAP_H
+#define _FS_CEPH_MDSMAP_H
+
+#include <linux/bug.h>
+#include <linux/ceph/types.h>
+
+struct ceph_mds_client;
+
+/*
+ * mds map - describe servers in the mds cluster.
+ *
+ * we limit fields to those the client actually cares about
+ */
+struct ceph_mds_info {
+ u64 global_id;
+ struct ceph_entity_addr addr;
+ s32 state;
+ int num_export_targets;
+ bool laggy;
+ u32 *export_targets;
+};
+
+struct ceph_mdsmap {
+ u32 m_epoch, m_client_epoch, m_last_failure;
+ u32 m_root;
+ u32 m_session_timeout; /* seconds */
+ u32 m_session_autoclose; /* seconds */
+ u64 m_max_file_size;
+ u64 m_max_xattr_size; /* maximum size for xattrs blob */
+ u32 m_max_mds; /* expected up:active mds number */
+ u32 m_num_active_mds; /* actual up:active mds number */
+ u32 possible_max_rank; /* possible max rank index */
+ struct ceph_mds_info *m_info;
+
+ /* which object pools file data can be stored in */
+ int m_num_data_pg_pools;
+ u64 *m_data_pg_pools;
+ u64 m_cas_pg_pool;
+
+ bool m_enabled;
+ bool m_damaged;
+ int m_num_laggy;
+};
+
+static inline struct ceph_entity_addr *
+ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
+{
+ if (w >= m->possible_max_rank)
+ return NULL;
+ return &m->m_info[w].addr;
+}
+
+static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
+{
+ BUG_ON(w < 0);
+ if (w >= m->possible_max_rank)
+ return CEPH_MDS_STATE_DNE;
+ return m->m_info[w].state;
+}
+
+static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
+{
+ if (w >= 0 && w < m->possible_max_rank)
+ return m->m_info[w].laggy;
+ return false;
+}
+
+extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
+struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p,
+ void *end, bool msgr2);
+extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
+extern bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m);
+
+#endif
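
/*
 * Illustration only: a stripped-down, user-space model of the
 * bounds-checked rank accessors that the new mdsmap.h above provides
 * (ceph_mdsmap_get_state(), ceph_mdsmap_is_laggy()).  The structs and
 * MDS_STATE_* values here are simplified stand-ins, not the kernel
 * definitions; the point is only the out-of-range behaviour.
 */
#include <stdio.h>

#define MDS_STATE_DNE     0	/* stand-in: rank does not exist */
#define MDS_STATE_ACTIVE 13	/* stand-in for an active rank */

struct mdsmap_info {
	int state;
	int laggy;
};

struct mdsmap {
	unsigned int possible_max_rank;
	struct mdsmap_info *info;
};

/* Mirror of ceph_mdsmap_get_state(): out-of-range ranks read as DNE. */
static int mdsmap_get_state(const struct mdsmap *m, unsigned int rank)
{
	if (rank >= m->possible_max_rank)
		return MDS_STATE_DNE;
	return m->info[rank].state;
}

/* Mirror of ceph_mdsmap_is_laggy(): out-of-range ranks are never laggy. */
static int mdsmap_is_laggy(const struct mdsmap *m, unsigned int rank)
{
	if (rank >= m->possible_max_rank)
		return 0;
	return m->info[rank].laggy;
}

int main(void)
{
	struct mdsmap_info info[2] = {
		{ .state = MDS_STATE_ACTIVE, .laggy = 0 },
		{ .state = MDS_STATE_ACTIVE, .laggy = 1 },
	};
	struct mdsmap m = { .possible_max_rank = 2, .info = info };

	printf("rank 0: state=%d laggy=%d\n",
	       mdsmap_get_state(&m, 0), mdsmap_is_laggy(&m, 0));
	printf("rank 5: state=%d laggy=%d\n",	/* out of range */
	       mdsmap_get_state(&m, 5), mdsmap_is_laggy(&m, 5));
	return 0;
}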
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
index 6d3584f16f9a..871c1090e520 100644
--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -31,6 +31,7 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
struct ceph_client_metric *m = &mdsc->metric;
u64 nr_caps = atomic64_read(&m->total_caps);
u32 header_len = sizeof(struct ceph_metric_header);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
s64 sum;
s32 items = 0;
@@ -51,8 +52,8 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
if (!msg) {
- pr_err("send metrics to mds%d, failed to allocate message\n",
- s->s_mds);
+ pr_err_client(cl, "to mds%d, failed to allocate message\n",
+ s->s_mds);
return false;
}
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index f7fcf7f08ec6..9d36c3532de1 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -43,6 +43,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
{
struct super_block *sb = mdsc->fsc->sb;
struct ceph_mds_quota *h = msg->front.iov_base;
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_vino vino;
struct inode *inode;
struct ceph_inode_info *ci;
@@ -51,8 +52,8 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
return;
if (msg->front.iov_len < sizeof(*h)) {
- pr_err("%s corrupt message mds%d len %d\n", __func__,
- session->s_mds, (int)msg->front.iov_len);
+ pr_err_client(cl, "corrupt message mds%d len %d\n",
+ session->s_mds, (int)msg->front.iov_len);
ceph_msg_dump(msg);
goto out;
}
@@ -62,7 +63,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
vino.snap = CEPH_NOSNAP;
inode = ceph_find_inode(sb, vino);
if (!inode) {
- pr_warn("Failed to find inode %llu\n", vino.ino);
+ pr_warn_client(cl, "failed to find inode %llx\n", vino.ino);
goto out;
}
ci = ceph_inode(inode);
@@ -85,6 +86,7 @@ find_quotarealm_inode(struct ceph_mds_client *mdsc, u64 ino)
{
struct ceph_quotarealm_inode *qri = NULL;
struct rb_node **node, *parent = NULL;
+ struct ceph_client *cl = mdsc->fsc->client;
mutex_lock(&mdsc->quotarealms_inodes_mutex);
node = &(mdsc->quotarealms_inodes.rb_node);
@@ -110,7 +112,7 @@ find_quotarealm_inode(struct ceph_mds_client *mdsc, u64 ino)
rb_link_node(&qri->node, parent, node);
rb_insert_color(&qri->node, &mdsc->quotarealms_inodes);
} else
- pr_warn("Failed to alloc quotarealms_inode\n");
+ pr_warn_client(cl, "Failed to alloc quotarealms_inode\n");
}
mutex_unlock(&mdsc->quotarealms_inodes_mutex);
@@ -129,6 +131,7 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
struct super_block *sb,
struct ceph_snap_realm *realm)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_quotarealm_inode *qri;
struct inode *in;
@@ -161,8 +164,8 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
}
if (IS_ERR(in)) {
- dout("Can't lookup inode %llx (err: %ld)\n",
- realm->ino, PTR_ERR(in));
+ doutc(cl, "Can't lookup inode %llx (err: %ld)\n", realm->ino,
+ PTR_ERR(in));
qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */
} else {
qri->timeout = 0;
@@ -213,6 +216,7 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
enum quota_get_realm which_quota,
bool retry)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci = NULL;
struct ceph_snap_realm *realm, *next;
struct inode *in;
@@ -226,8 +230,9 @@ restart:
if (realm)
ceph_get_snap_realm(mdsc, realm);
else
- pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) "
- "null i_snap_realm\n", ceph_vinop(inode));
+ pr_err_ratelimited_client(cl,
+ "%p %llx.%llx null i_snap_realm\n",
+ inode, ceph_vinop(inode));
while (realm) {
bool has_inode;
@@ -317,6 +322,7 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
loff_t delta)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct ceph_snap_realm *realm, *next;
struct inode *in;
@@ -332,8 +338,9 @@ restart:
if (realm)
ceph_get_snap_realm(mdsc, realm);
else
- pr_err_ratelimited("check_quota_exceeded: ino (%llx.%llx) "
- "null i_snap_realm\n", ceph_vinop(inode));
+ pr_err_ratelimited_client(cl,
+ "%p %llx.%llx null i_snap_realm\n",
+ inode, ceph_vinop(inode));
while (realm) {
bool has_inode;
@@ -383,7 +390,7 @@ restart:
break;
default:
/* Shouldn't happen */
- pr_warn("Invalid quota check op (%d)\n", op);
+ pr_warn_client(cl, "Invalid quota check op (%d)\n", op);
exceeded = true; /* Just break the loop */
}
iput(in);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 813f21add992..c65f2b202b2b 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -138,7 +138,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
__insert_snap_realm(&mdsc->snap_realms, realm);
mdsc->num_snap_realms++;
- dout("%s %llx %p\n", __func__, realm->ino, realm);
+ doutc(mdsc->fsc->client, "%llx %p\n", realm->ino, realm);
return realm;
}
@@ -150,6 +150,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
u64 ino)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct rb_node *n = mdsc->snap_realms.rb_node;
struct ceph_snap_realm *r;
@@ -162,7 +163,7 @@ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
else if (ino > r->ino)
n = n->rb_right;
else {
- dout("%s %llx %p\n", __func__, r->ino, r);
+ doutc(cl, "%llx %p\n", r->ino, r);
return r;
}
}
@@ -188,9 +189,10 @@ static void __put_snap_realm(struct ceph_mds_client *mdsc,
static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm)
{
+ struct ceph_client *cl = mdsc->fsc->client;
lockdep_assert_held_write(&mdsc->snap_rwsem);
- dout("%s %p %llx\n", __func__, realm, realm->ino);
+ doutc(cl, "%p %llx\n", realm, realm->ino);
rb_erase(&realm->node, &mdsc->snap_realms);
mdsc->num_snap_realms--;
@@ -290,6 +292,7 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm,
u64 parentino)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_snap_realm *parent;
lockdep_assert_held_write(&mdsc->snap_rwsem);
@@ -303,8 +306,8 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
if (IS_ERR(parent))
return PTR_ERR(parent);
}
- dout("%s %llx %p: %llx %p -> %llx %p\n", __func__, realm->ino,
- realm, realm->parent_ino, realm->parent, parentino, parent);
+ doutc(cl, "%llx %p: %llx %p -> %llx %p\n", realm->ino, realm,
+ realm->parent_ino, realm->parent, parentino, parent);
if (realm->parent) {
list_del_init(&realm->child_item);
ceph_put_snap_realm(mdsc, realm->parent);
@@ -329,10 +332,12 @@ static int cmpu64_rev(const void *a, const void *b)
/*
* build the snap context for a given realm.
*/
-static int build_snap_context(struct ceph_snap_realm *realm,
+static int build_snap_context(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm,
struct list_head *realm_queue,
struct list_head *dirty_realms)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_snap_realm *parent = realm->parent;
struct ceph_snap_context *snapc;
int err = 0;
@@ -360,10 +365,10 @@ static int build_snap_context(struct ceph_snap_realm *realm,
realm->cached_context->seq == realm->seq &&
(!parent ||
realm->cached_context->seq >= parent->cached_context->seq)) {
- dout("%s %llx %p: %p seq %lld (%u snaps) (unchanged)\n",
- __func__, realm->ino, realm, realm->cached_context,
- realm->cached_context->seq,
- (unsigned int)realm->cached_context->num_snaps);
+ doutc(cl, "%llx %p: %p seq %lld (%u snaps) (unchanged)\n",
+ realm->ino, realm, realm->cached_context,
+ realm->cached_context->seq,
+ (unsigned int)realm->cached_context->num_snaps);
return 0;
}
@@ -400,8 +405,8 @@ static int build_snap_context(struct ceph_snap_realm *realm,
sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
snapc->num_snaps = num;
- dout("%s %llx %p: %p seq %lld (%u snaps)\n", __func__, realm->ino,
- realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps);
+ doutc(cl, "%llx %p: %p seq %lld (%u snaps)\n", realm->ino, realm,
+ snapc, snapc->seq, (unsigned int) snapc->num_snaps);
ceph_put_snap_context(realm->cached_context);
realm->cached_context = snapc;
@@ -418,16 +423,18 @@ fail:
ceph_put_snap_context(realm->cached_context);
realm->cached_context = NULL;
}
- pr_err("%s %llx %p fail %d\n", __func__, realm->ino, realm, err);
+ pr_err_client(cl, "%llx %p fail %d\n", realm->ino, realm, err);
return err;
}
/*
* rebuild snap context for the given realm and all of its children.
*/
-static void rebuild_snap_realms(struct ceph_snap_realm *realm,
+static void rebuild_snap_realms(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm,
struct list_head *dirty_realms)
{
+ struct ceph_client *cl = mdsc->fsc->client;
LIST_HEAD(realm_queue);
int last = 0;
bool skip = false;
@@ -451,9 +458,10 @@ static void rebuild_snap_realms(struct ceph_snap_realm *realm,
continue;
}
- last = build_snap_context(_realm, &realm_queue, dirty_realms);
- dout("%s %llx %p, %s\n", __func__, _realm->ino, _realm,
- last > 0 ? "is deferred" : !last ? "succeeded" : "failed");
+ last = build_snap_context(mdsc, _realm, &realm_queue,
+ dirty_realms);
+ doutc(cl, "%llx %p, %s\n", realm->ino, realm,
+ last > 0 ? "is deferred" : !last ? "succeeded" : "failed");
/* is any child in the list ? */
list_for_each_entry(child, &_realm->children, child_item) {
@@ -523,6 +531,7 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap **pcapsnap)
{
struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_snap_context *old_snapc, *new_snapc;
struct ceph_cap_snap *capsnap = *pcapsnap;
struct ceph_buffer *old_blob = NULL;
@@ -548,14 +557,14 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
as no new writes are allowed to start when pending, so any
writes in progress now were started before the previous
cap_snap. lucky us. */
- dout("%s %p %llx.%llx already pending\n",
- __func__, inode, ceph_vinop(inode));
+ doutc(cl, "%p %llx.%llx already pending\n", inode,
+ ceph_vinop(inode));
goto update_snapc;
}
if (ci->i_wrbuffer_ref_head == 0 &&
!(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) {
- dout("%s %p %llx.%llx nothing dirty|writing\n",
- __func__, inode, ceph_vinop(inode));
+ doutc(cl, "%p %llx.%llx nothing dirty|writing\n", inode,
+ ceph_vinop(inode));
goto update_snapc;
}
@@ -575,15 +584,15 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
} else {
if (!(used & CEPH_CAP_FILE_WR) &&
ci->i_wrbuffer_ref_head == 0) {
- dout("%s %p %llx.%llx no new_snap|dirty_page|writing\n",
- __func__, inode, ceph_vinop(inode));
+ doutc(cl, "%p %llx.%llx no new_snap|dirty_page|writing\n",
+ inode, ceph_vinop(inode));
goto update_snapc;
}
}
- dout("%s %p %llx.%llx cap_snap %p queuing under %p %s %s\n",
- __func__, inode, ceph_vinop(inode), capsnap, old_snapc,
- ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush");
+ doutc(cl, "%p %llx.%llx cap_snap %p queuing under %p %s %s\n",
+ inode, ceph_vinop(inode), capsnap, old_snapc,
+ ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush");
ihold(inode);
capsnap->follows = old_snapc->seq;
@@ -615,9 +624,9 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
if (used & CEPH_CAP_FILE_WR) {
- dout("%s %p %llx.%llx cap_snap %p snapc %p seq %llu used WR,"
- " now pending\n", __func__, inode, ceph_vinop(inode),
- capsnap, old_snapc, old_snapc->seq);
+ doutc(cl, "%p %llx.%llx cap_snap %p snapc %p seq %llu used WR,"
+ " now pending\n", inode, ceph_vinop(inode), capsnap,
+ old_snapc, old_snapc->seq);
capsnap->writing = 1;
} else {
/* note mtime, size NOW. */
@@ -634,7 +643,7 @@ update_snapc:
ci->i_head_snapc = NULL;
} else {
ci->i_head_snapc = ceph_get_snap_context(new_snapc);
- dout(" new snapc is %p\n", new_snapc);
+ doutc(cl, " new snapc is %p\n", new_snapc);
}
spin_unlock(&ci->i_ceph_lock);
@@ -655,11 +664,12 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
{
struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_client *cl = mdsc->fsc->client;
BUG_ON(capsnap->writing);
capsnap->size = i_size_read(inode);
- capsnap->mtime = inode->i_mtime;
- capsnap->atime = inode->i_atime;
+ capsnap->mtime = inode_get_mtime(inode);
+ capsnap->atime = inode_get_atime(inode);
capsnap->ctime = inode_get_ctime(inode);
capsnap->btime = ci->i_btime;
capsnap->change_attr = inode_peek_iversion_raw(inode);
@@ -667,11 +677,12 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->truncate_size = ci->i_truncate_size;
capsnap->truncate_seq = ci->i_truncate_seq;
if (capsnap->dirty_pages) {
- dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu "
- "still has %d dirty pages\n", __func__, inode,
- ceph_vinop(inode), capsnap, capsnap->context,
- capsnap->context->seq, ceph_cap_string(capsnap->dirty),
- capsnap->size, capsnap->dirty_pages);
+ doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s "
+ "s=%llu still has %d dirty pages\n", inode,
+ ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq,
+ ceph_cap_string(capsnap->dirty),
+ capsnap->size, capsnap->dirty_pages);
return 0;
}
@@ -680,20 +691,20 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
* And trigger to flush the buffer immediately.
*/
if (ci->i_wrbuffer_ref) {
- dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu "
- "used WRBUFFER, delaying\n", __func__, inode,
- ceph_vinop(inode), capsnap, capsnap->context,
- capsnap->context->seq, ceph_cap_string(capsnap->dirty),
- capsnap->size);
+ doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s "
+ "s=%llu used WRBUFFER, delaying\n", inode,
+ ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+ capsnap->size);
ceph_queue_writeback(inode);
return 0;
}
ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
- dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n",
- __func__, inode, ceph_vinop(inode), capsnap, capsnap->context,
- capsnap->context->seq, ceph_cap_string(capsnap->dirty),
- capsnap->size);
+ doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n",
+ inode, ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+ capsnap->size);
spin_lock(&mdsc->snap_flush_lock);
if (list_empty(&ci->i_snap_flush_item)) {
@@ -708,13 +719,15 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
* Queue cap_snaps for snap writeback for this realm and its children.
* Called under snap_rwsem, so realm topology won't change.
*/
-static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
+static void queue_realm_cap_snaps(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct inode *lastinode = NULL;
struct ceph_cap_snap *capsnap = NULL;
- dout("%s %p %llx inode\n", __func__, realm, realm->ino);
+ doutc(cl, "%p %llx inode\n", realm, realm->ino);
spin_lock(&realm->inodes_with_caps_lock);
list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) {
@@ -733,8 +746,9 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
if (!capsnap) {
capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS);
if (!capsnap) {
- pr_err("ENOMEM allocating ceph_cap_snap on %p\n",
- inode);
+ pr_err_client(cl,
+ "ENOMEM allocating ceph_cap_snap on %p\n",
+ inode);
return;
}
}
@@ -752,7 +766,7 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
if (capsnap)
kmem_cache_free(ceph_cap_snap_cachep, capsnap);
- dout("%s %p %llx done\n", __func__, realm, realm->ino);
+ doutc(cl, "%p %llx done\n", realm, realm->ino);
}
/*
@@ -766,6 +780,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
void *p, void *e, bool deletion,
struct ceph_snap_realm **realm_ret)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_snap_realm *ri; /* encoded */
__le64 *snaps; /* encoded */
__le64 *prior_parent_snaps; /* encoded */
@@ -780,7 +795,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
lockdep_assert_held_write(&mdsc->snap_rwsem);
- dout("%s deletion=%d\n", __func__, deletion);
+ doutc(cl, "deletion=%d\n", deletion);
more:
realm = NULL;
rebuild_snapcs = 0;
@@ -810,8 +825,8 @@ more:
rebuild_snapcs += err;
if (le64_to_cpu(ri->seq) > realm->seq) {
- dout("%s updating %llx %p %lld -> %lld\n", __func__,
- realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
+ doutc(cl, "updating %llx %p %lld -> %lld\n", realm->ino,
+ realm, realm->seq, le64_to_cpu(ri->seq));
/* update realm parameters, snap lists */
realm->seq = le64_to_cpu(ri->seq);
realm->created = le64_to_cpu(ri->created);
@@ -834,16 +849,16 @@ more:
rebuild_snapcs = 1;
} else if (!realm->cached_context) {
- dout("%s %llx %p seq %lld new\n", __func__,
- realm->ino, realm, realm->seq);
+ doutc(cl, "%llx %p seq %lld new\n", realm->ino, realm,
+ realm->seq);
rebuild_snapcs = 1;
} else {
- dout("%s %llx %p seq %lld unchanged\n", __func__,
- realm->ino, realm, realm->seq);
+ doutc(cl, "%llx %p seq %lld unchanged\n", realm->ino, realm,
+ realm->seq);
}
- dout("done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino,
- realm, rebuild_snapcs, p, e);
+ doutc(cl, "done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino,
+ realm, rebuild_snapcs, p, e);
/*
* this will always track the uppest parent realm from which
@@ -855,7 +870,7 @@ more:
/* rebuild_snapcs when we reach the _end_ (root) of the trace */
if (realm_to_rebuild && p >= e)
- rebuild_snap_realms(realm_to_rebuild, &dirty_realms);
+ rebuild_snap_realms(mdsc, realm_to_rebuild, &dirty_realms);
if (!first_realm)
first_realm = realm;
@@ -873,7 +888,7 @@ more:
realm = list_first_entry(&dirty_realms, struct ceph_snap_realm,
dirty_item);
list_del_init(&realm->dirty_item);
- queue_realm_cap_snaps(realm);
+ queue_realm_cap_snaps(mdsc, realm);
}
if (realm_ret)
@@ -891,7 +906,7 @@ fail:
ceph_put_snap_realm(mdsc, realm);
if (first_realm)
ceph_put_snap_realm(mdsc, first_realm);
- pr_err("%s error %d\n", __func__, err);
+ pr_err_client(cl, "error %d\n", err);
/*
* When receiving a corrupted snap trace we don't know what
@@ -905,11 +920,12 @@ fail:
WRITE_ONCE(mdsc->fsc->mount_state, CEPH_MOUNT_FENCE_IO);
ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr);
if (ret)
- pr_err("%s failed to blocklist %s: %d\n", __func__,
- ceph_pr_addr(&client->msgr.inst.addr), ret);
+ pr_err_client(cl, "failed to blocklist %s: %d\n",
+ ceph_pr_addr(&client->msgr.inst.addr), ret);
- WARN(1, "%s: %s%sdo remount to continue%s",
- __func__, ret ? "" : ceph_pr_addr(&client->msgr.inst.addr),
+ WARN(1, "[client.%lld] %s %s%sdo remount to continue%s",
+ client->monc.auth->global_id, __func__,
+ ret ? "" : ceph_pr_addr(&client->msgr.inst.addr),
ret ? "" : " was blocklisted, ",
err == -EIO ? " after corrupted snaptrace is fixed" : "");
@@ -925,11 +941,12 @@ fail:
*/
static void flush_snaps(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci;
struct inode *inode;
struct ceph_mds_session *session = NULL;
- dout("%s\n", __func__);
+ doutc(cl, "begin\n");
spin_lock(&mdsc->snap_flush_lock);
while (!list_empty(&mdsc->snap_flush_list)) {
ci = list_first_entry(&mdsc->snap_flush_list,
@@ -944,7 +961,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
spin_unlock(&mdsc->snap_flush_lock);
ceph_put_mds_session(session);
- dout("%s done\n", __func__);
+ doutc(cl, "done\n");
}
/**
@@ -960,7 +977,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
struct ceph_snap_realm *oldrealm = ci->i_snap_realm;
lockdep_assert_held(&ci->i_ceph_lock);
@@ -1000,6 +1017,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct super_block *sb = mdsc->fsc->sb;
int mds = session->s_mds;
u64 split;
@@ -1030,8 +1048,8 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
trace_len = le32_to_cpu(h->trace_len);
p += sizeof(*h);
- dout("%s from mds%d op %s split %llx tracelen %d\n", __func__,
- mds, ceph_snap_op_name(op), split, trace_len);
+ doutc(cl, "from mds%d op %s split %llx tracelen %d\n", mds,
+ ceph_snap_op_name(op), split, trace_len);
down_write(&mdsc->snap_rwsem);
locked_rwsem = 1;
@@ -1062,7 +1080,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
goto out;
}
- dout("splitting snap_realm %llx %p\n", realm->ino, realm);
+ doutc(cl, "splitting snap_realm %llx %p\n", realm->ino, realm);
for (i = 0; i < num_split_inos; i++) {
struct ceph_vino vino = {
.ino = le64_to_cpu(split_inos[i]),
@@ -1087,13 +1105,13 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
*/
if (ci->i_snap_realm->created >
le64_to_cpu(ri->created)) {
- dout(" leaving %p %llx.%llx in newer realm %llx %p\n",
- inode, ceph_vinop(inode), ci->i_snap_realm->ino,
- ci->i_snap_realm);
+ doutc(cl, " leaving %p %llx.%llx in newer realm %llx %p\n",
+ inode, ceph_vinop(inode), ci->i_snap_realm->ino,
+ ci->i_snap_realm);
goto skip_inode;
}
- dout(" will move %p %llx.%llx to split realm %llx %p\n",
- inode, ceph_vinop(inode), realm->ino, realm);
+ doutc(cl, " will move %p %llx.%llx to split realm %llx %p\n",
+ inode, ceph_vinop(inode), realm->ino, realm);
ceph_get_snap_realm(mdsc, realm);
ceph_change_snap_realm(inode, realm);
@@ -1154,7 +1172,7 @@ skip_inode:
return;
bad:
- pr_err("%s corrupt snap message from mds%d\n", __func__, mds);
+ pr_err_client(cl, "corrupt snap message from mds%d\n", mds);
ceph_msg_dump(msg);
out:
if (locked_rwsem)
@@ -1170,6 +1188,7 @@ out:
struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
u64 snap)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_snapid_map *sm, *exist;
struct rb_node **p, *parent;
int ret;
@@ -1192,8 +1211,8 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
}
spin_unlock(&mdsc->snapid_map_lock);
if (exist) {
- dout("%s found snapid map %llx -> %x\n", __func__,
- exist->snap, exist->dev);
+ doutc(cl, "found snapid map %llx -> %x\n", exist->snap,
+ exist->dev);
return exist;
}
@@ -1237,13 +1256,12 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
if (exist) {
free_anon_bdev(sm->dev);
kfree(sm);
- dout("%s found snapid map %llx -> %x\n", __func__,
- exist->snap, exist->dev);
+ doutc(cl, "found snapid map %llx -> %x\n", exist->snap,
+ exist->dev);
return exist;
}
- dout("%s create snapid map %llx -> %x\n", __func__,
- sm->snap, sm->dev);
+ doutc(cl, "create snapid map %llx -> %x\n", sm->snap, sm->dev);
return sm;
}
@@ -1268,6 +1286,7 @@ void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_snapid_map *sm;
unsigned long now;
LIST_HEAD(to_free);
@@ -1289,7 +1308,7 @@ void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
while (!list_empty(&to_free)) {
sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
list_del(&sm->lru);
- dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev);
+ doutc(cl, "trim snapid map %llx -> %x\n", sm->snap, sm->dev);
free_anon_bdev(sm->dev);
kfree(sm);
}
@@ -1297,6 +1316,7 @@ void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
{
+ struct ceph_client *cl = mdsc->fsc->client;
struct ceph_snapid_map *sm;
struct rb_node *p;
LIST_HEAD(to_free);
@@ -1315,8 +1335,8 @@ void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
list_del(&sm->lru);
free_anon_bdev(sm->dev);
if (WARN_ON_ONCE(atomic_read(&sm->ref))) {
- pr_err("snapid map %llx -> %x still in use\n",
- sm->snap, sm->dev);
+ pr_err_client(cl, "snapid map %llx -> %x still in use\n",
+ sm->snap, sm->dev);
}
kfree(sm);
}
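
/*
 * Illustration only: the snapid-map trim path above detaches expired
 * entries onto a private list while holding the lock, then frees them
 * after dropping it.  This is a minimal user-space sketch of that
 * "collect under the lock, free outside it" pattern; the singly linked
 * list and types are stand-ins, not the kernel's list_head machinery.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct snapid_map {
	unsigned long long snap;
	long last_used;			/* pretend timestamp */
	struct snapid_map *next;
};

static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;
static struct snapid_map *lru_head;	/* global LRU, protected by map_lock */

static void trim_snapid_map(long now, long timeout)
{
	struct snapid_map *to_free = NULL, *sm, **pp;

	/* Phase 1: under the lock, unlink expired entries onto to_free. */
	pthread_mutex_lock(&map_lock);
	pp = &lru_head;
	while ((sm = *pp) != NULL) {
		if (now - sm->last_used > timeout) {
			*pp = sm->next;		/* unlink from the LRU */
			sm->next = to_free;
			to_free = sm;
		} else {
			pp = &sm->next;
		}
	}
	pthread_mutex_unlock(&map_lock);

	/* Phase 2: free outside the lock, where blocking would be safe. */
	while ((sm = to_free) != NULL) {
		to_free = sm->next;
		printf("trim snapid map %llx\n", sm->snap);
		free(sm);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct snapid_map *sm = calloc(1, sizeof(*sm));

		if (!sm)
			break;
		sm->snap = 0x100 + i;
		sm->last_used = i;	/* the oldest two will look stale */
		sm->next = lru_head;
		lru_head = sm;
	}
	trim_snapid_map(/* now */ 10, /* timeout */ 8);
	return 0;
}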
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 2d7f5a8d4a92..5ec102f6b1ac 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -44,28 +44,29 @@ static LIST_HEAD(ceph_fsc_list);
*/
static void ceph_put_super(struct super_block *s)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(s);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(s);
- dout("put_super\n");
+ doutc(fsc->client, "begin\n");
ceph_fscrypt_free_dummy_policy(fsc);
ceph_mdsc_close_sessions(fsc->mdsc);
+ doutc(fsc->client, "done\n");
}
static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry));
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(d_inode(dentry));
struct ceph_mon_client *monc = &fsc->client->monc;
struct ceph_statfs st;
int i, err;
u64 data_pool;
+ doutc(fsc->client, "begin\n");
if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) {
data_pool = fsc->mdsc->mdsmap->m_data_pg_pools[0];
} else {
data_pool = CEPH_NOPOOL;
}
- dout("statfs\n");
err = ceph_monc_do_statfs(monc, data_pool, &st);
if (err < 0)
return err;
@@ -113,24 +114,26 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
/* fold the fs_cluster_id into the upper bits */
buf->f_fsid.val[1] = monc->fs_cluster_id;
+ doutc(fsc->client, "done\n");
return 0;
}
static int ceph_sync_fs(struct super_block *sb, int wait)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
+ struct ceph_client *cl = fsc->client;
if (!wait) {
- dout("sync_fs (non-blocking)\n");
+ doutc(cl, "(non-blocking)\n");
ceph_flush_dirty_caps(fsc->mdsc);
- dout("sync_fs (non-blocking) done\n");
+ doutc(cl, "(non-blocking) done\n");
return 0;
}
- dout("sync_fs (blocking)\n");
+ doutc(cl, "(blocking)\n");
ceph_osdc_sync(&fsc->client->osdc);
ceph_mdsc_sync(fsc->mdsc);
- dout("sync_fs (blocking) done\n");
+ doutc(cl, "(blocking) done\n");
return 0;
}
@@ -341,7 +344,7 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc)
char *dev_name = param->string, *dev_name_end;
int ret;
- dout("%s '%s'\n", __func__, dev_name);
+ dout("'%s'\n", dev_name);
if (!dev_name || !*dev_name)
return invalfc(fc, "Empty source");
@@ -413,7 +416,7 @@ static int ceph_parse_mount_param(struct fs_context *fc,
return ret;
token = fs_parse(fc, ceph_mount_parameters, param, &result);
- dout("%s fs_parse '%s' token %d\n", __func__, param->key, token);
+ dout("%s: fs_parse '%s' token %d\n",__func__, param->key, token);
if (token < 0)
return token;
@@ -684,7 +687,7 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
*/
static int ceph_show_options(struct seq_file *m, struct dentry *root)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(root->d_sb);
struct ceph_mount_options *fsopt = fsc->mount_options;
size_t pos;
int ret;
@@ -881,7 +884,7 @@ static void flush_fs_workqueues(struct ceph_fs_client *fsc)
static void destroy_fs_client(struct ceph_fs_client *fsc)
{
- dout("destroy_fs_client %p\n", fsc);
+ doutc(fsc->client, "%p\n", fsc);
spin_lock(&ceph_fsc_lock);
list_del(&fsc->metric_wakeup);
@@ -896,7 +899,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
ceph_destroy_client(fsc->client);
kfree(fsc);
- dout("destroy_fs_client %p done\n", fsc);
+ dout("%s: %p done\n", __func__, fsc);
}
/*
@@ -1015,9 +1018,9 @@ static void __ceph_umount_begin(struct ceph_fs_client *fsc)
*/
void ceph_umount_begin(struct super_block *sb)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
- dout("ceph_umount_begin - starting forced umount\n");
+ doutc(fsc->client, "starting forced umount\n");
if (!fsc)
return;
fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
@@ -1045,13 +1048,14 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
const char *path,
unsigned long started)
{
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req = NULL;
int err;
struct dentry *root;
/* open dir */
- dout("open_root_inode opening '%s'\n", path);
+ doutc(cl, "opening '%s'\n", path);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
if (IS_ERR(req))
return ERR_CAST(req);
@@ -1071,13 +1075,13 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
if (err == 0) {
struct inode *inode = req->r_target_inode;
req->r_target_inode = NULL;
- dout("open_root_inode success\n");
+ doutc(cl, "success\n");
root = d_make_root(inode);
if (!root) {
root = ERR_PTR(-ENOMEM);
goto out;
}
- dout("open_root_inode success, root dentry is %p\n", root);
+ doutc(cl, "success, root dentry is %p\n", root);
} else {
root = ERR_PTR(err);
}
@@ -1136,11 +1140,12 @@ static int ceph_apply_test_dummy_encryption(struct super_block *sb,
static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
struct fs_context *fc)
{
+ struct ceph_client *cl = fsc->client;
int err;
unsigned long started = jiffies; /* note the start time */
struct dentry *root;
- dout("mount start %p\n", fsc);
+ doutc(cl, "mount start %p\n", fsc);
mutex_lock(&fsc->client->mount_mutex);
if (!fsc->sb->s_root) {
@@ -1163,7 +1168,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
if (err)
goto out;
- dout("mount opening path '%s'\n", path);
+ doutc(cl, "mount opening path '%s'\n", path);
ceph_fs_debugfs_init(fsc);
@@ -1178,7 +1183,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
}
fsc->mount_state = CEPH_MOUNT_MOUNTED;
- dout("mount success\n");
+ doutc(cl, "mount success\n");
mutex_unlock(&fsc->client->mount_mutex);
return root;
@@ -1191,9 +1196,10 @@ out:
static int ceph_set_super(struct super_block *s, struct fs_context *fc)
{
struct ceph_fs_client *fsc = s->s_fs_info;
+ struct ceph_client *cl = fsc->client;
int ret;
- dout("set_super %p\n", s);
+ doutc(cl, "%p\n", s);
s->s_maxbytes = MAX_LFS_FILESIZE;
@@ -1226,31 +1232,32 @@ static int ceph_compare_super(struct super_block *sb, struct fs_context *fc)
struct ceph_fs_client *new = fc->s_fs_info;
struct ceph_mount_options *fsopt = new->mount_options;
struct ceph_options *opt = new->client->options;
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
+ struct ceph_client *cl = fsc->client;
- dout("ceph_compare_super %p\n", sb);
+ doutc(cl, "%p\n", sb);
if (compare_mount_options(fsopt, opt, fsc)) {
- dout("monitor(s)/mount options don't match\n");
+ doutc(cl, "monitor(s)/mount options don't match\n");
return 0;
}
if ((opt->flags & CEPH_OPT_FSID) &&
ceph_fsid_compare(&opt->fsid, &fsc->client->fsid)) {
- dout("fsid doesn't match\n");
+ doutc(cl, "fsid doesn't match\n");
return 0;
}
if (fc->sb_flags != (sb->s_flags & ~SB_BORN)) {
- dout("flags differ\n");
+ doutc(cl, "flags differ\n");
return 0;
}
if (fsc->blocklisted && !ceph_test_mount_opt(fsc, CLEANRECOVER)) {
- dout("client is blocklisted (and CLEANRECOVER is not set)\n");
+ doutc(cl, "client is blocklisted (and CLEANRECOVER is not set)\n");
return 0;
}
if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
- dout("client has been forcibly unmounted\n");
+ doutc(cl, "client has been forcibly unmounted\n");
return 0;
}
@@ -1322,9 +1329,9 @@ static int ceph_get_tree(struct fs_context *fc)
goto out;
}
- if (ceph_sb_to_client(sb) != fsc) {
+ if (ceph_sb_to_fs_client(sb) != fsc) {
destroy_fs_client(fsc);
- fsc = ceph_sb_to_client(sb);
+ fsc = ceph_sb_to_fs_client(sb);
dout("get_sb got existing client %p\n", fsc);
} else {
dout("get_sb using new client %p\n", fsc);
@@ -1338,8 +1345,9 @@ static int ceph_get_tree(struct fs_context *fc)
err = PTR_ERR(res);
goto out_splat;
}
- dout("root %p inode %p ino %llx.%llx\n", res,
- d_inode(res), ceph_vinop(d_inode(res)));
+
+ doutc(fsc->client, "root %p inode %p ino %llx.%llx\n", res,
+ d_inode(res), ceph_vinop(d_inode(res)));
fc->root = fsc->sb->s_root;
return 0;
@@ -1377,7 +1385,7 @@ static int ceph_reconfigure_fc(struct fs_context *fc)
struct ceph_parse_opts_ctx *pctx = fc->fs_private;
struct ceph_mount_options *fsopt = pctx->opts;
struct super_block *sb = fc->root->d_sb;
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
err = ceph_apply_test_dummy_encryption(sb, fc, fsopt);
if (err)
@@ -1397,7 +1405,8 @@ static int ceph_reconfigure_fc(struct fs_context *fc)
kfree(fsc->mount_options->mon_addr);
fsc->mount_options->mon_addr = fsopt->mon_addr;
fsopt->mon_addr = NULL;
- pr_notice("ceph: monitor addresses recorded, but not used for reconnection");
+ pr_notice_client(fsc->client,
+ "monitor addresses recorded, but not used for reconnection");
}
sync_filesystem(sb);
@@ -1516,11 +1525,12 @@ void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc)
static void ceph_kill_sb(struct super_block *s)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(s);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(s);
+ struct ceph_client *cl = fsc->client;
struct ceph_mds_client *mdsc = fsc->mdsc;
bool wait;
- dout("kill_sb %p\n", s);
+ doutc(cl, "%p\n", s);
ceph_mdsc_pre_umount(mdsc);
flush_fs_workqueues(fsc);
@@ -1551,9 +1561,9 @@ static void ceph_kill_sb(struct super_block *s)
&mdsc->stopping_waiter,
fsc->client->options->mount_timeout);
if (!timeleft) /* timed out */
- pr_warn("umount timed out, %ld\n", timeleft);
+ pr_warn_client(cl, "umount timed out, %ld\n", timeleft);
else if (timeleft < 0) /* killed */
- pr_warn("umount was killed, %ld\n", timeleft);
+ pr_warn_client(cl, "umount was killed, %ld\n", timeleft);
}
mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED;
@@ -1572,13 +1582,13 @@ static struct file_system_type ceph_fs_type = {
.name = "ceph",
.init_fs_context = ceph_init_fs_context,
.kill_sb = ceph_kill_sb,
- .fs_flags = FS_RENAME_DOES_D_MOVE,
+ .fs_flags = FS_RENAME_DOES_D_MOVE | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("ceph");
int ceph_force_reconnect(struct super_block *sb)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
int err = 0;
fsc->mount_state = CEPH_MOUNT_RECOVER;
@@ -1671,6 +1681,11 @@ static const struct kernel_param_ops param_ops_mount_syntax = {
module_param_cb(mount_syntax_v1, &param_ops_mount_syntax, &mount_support, 0444);
module_param_cb(mount_syntax_v2, &param_ops_mount_syntax, &mount_support, 0444);
+bool enable_unsafe_idmap = false;
+module_param(enable_unsafe_idmap, bool, 0644);
+MODULE_PARM_DESC(enable_unsafe_idmap,
+ "Allow to use idmapped mounts with MDS without CEPHFS_FEATURE_HAS_OWNER_UIDGID");
+
module_init(init_ceph);
module_exit(exit_ceph);
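
/*
 * Illustration only: the conversion above replaces bare dout()/pr_err()
 * calls with doutc()/pr_*_client() so each message is tagged with the
 * client that emitted it (the WARN hunk shows the "[client.%lld]" form).
 * A rough user-space model of that kind of wrapper might look like the
 * sketch below; the macro name and struct are made up for the sketch and
 * are not the libceph implementation.
 */
#include <stdio.h>

struct client {
	long long global_id;
};

#define doutc_demo(cl, fmt, ...) \
	printf("[client.%lld] %s: " fmt, (cl)->global_id, __func__, ##__VA_ARGS__)

static void sync_fs(struct client *cl, int wait)
{
	doutc_demo(cl, "(%s)\n", wait ? "blocking" : "non-blocking");
	/* ... do the actual flushing here ... */
	doutc_demo(cl, "(%s) done\n", wait ? "blocking" : "non-blocking");
}

int main(void)
{
	struct client cl = { .global_id = 4215 };

	sync_fs(&cl, 0);	/* prints "[client.4215] sync_fs: (non-blocking)" etc. */
	return 0;
}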
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 51c7f2b14f6f..fe0f64a0acb2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -488,13 +488,13 @@ ceph_inode(const struct inode *inode)
}
static inline struct ceph_fs_client *
-ceph_inode_to_client(const struct inode *inode)
+ceph_inode_to_fs_client(const struct inode *inode)
{
return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
}
static inline struct ceph_fs_client *
-ceph_sb_to_client(const struct super_block *sb)
+ceph_sb_to_fs_client(const struct super_block *sb)
{
return (struct ceph_fs_client *)sb->s_fs_info;
}
@@ -502,7 +502,13 @@ ceph_sb_to_client(const struct super_block *sb)
static inline struct ceph_mds_client *
ceph_sb_to_mdsc(const struct super_block *sb)
{
- return (struct ceph_mds_client *)ceph_sb_to_client(sb)->mdsc;
+ return (struct ceph_mds_client *)ceph_sb_to_fs_client(sb)->mdsc;
+}
+
+static inline struct ceph_client *
+ceph_inode_to_client(const struct inode *inode)
+{
+ return (struct ceph_client *)ceph_inode_to_fs_client(inode)->client;
}
static inline struct ceph_vino
@@ -558,7 +564,7 @@ static inline u64 ceph_snap(struct inode *inode)
*/
static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
{
- if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
+ if (unlikely(ceph_test_mount_opt(ceph_sb_to_fs_client(sb), INO32)))
return ceph_ino_to_ino32(ino);
return ino;
}
@@ -1094,8 +1100,8 @@ struct ceph_iattr {
struct ceph_fscrypt_auth *fscrypt_auth;
};
-extern int __ceph_setattr(struct inode *inode, struct iattr *attr,
- struct ceph_iattr *cia);
+extern int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
+ struct iattr *attr, struct ceph_iattr *cia);
extern int ceph_setattr(struct mnt_idmap *idmap,
struct dentry *dentry, struct iattr *attr);
extern int ceph_getattr(struct mnt_idmap *idmap,
@@ -1106,7 +1112,7 @@ void ceph_inode_shutdown(struct inode *inode);
static inline bool ceph_inode_is_shutdown(struct inode *inode)
{
unsigned long flags = READ_ONCE(ceph_inode(inode)->i_ceph_flags);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
int state = READ_ONCE(fsc->mount_state);
return (flags & CEPH_I_SHUTDOWN) || state >= CEPH_MOUNT_SHUTDOWN;
@@ -1119,7 +1125,7 @@ ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
-extern const struct xattr_handler *ceph_xattr_handlers[];
+extern const struct xattr_handler * const ceph_xattr_handlers[];
struct ceph_acl_sec_ctx {
#ifdef CONFIG_CEPH_FS_POSIX_ACL
@@ -1223,7 +1229,8 @@ extern void ceph_add_cap(struct inode *inode,
unsigned cap, unsigned seq, u64 realmino, int flags,
struct ceph_cap **new_cap);
extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
-extern void ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
+extern void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
+ bool queue_release);
extern void __ceph_remove_caps(struct ceph_inode_info *ci);
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
struct ceph_cap *cap);
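
/*
 * Illustration only: after the rename above, ceph_inode_to_fs_client()
 * still returns the filesystem-level client while the new
 * ceph_inode_to_client() reaches one level further to the libceph client
 * that doutc()/pr_*_client() take.  The stand-in structs below just model
 * that pointer chain; they are not the real kernel types.
 */
#include <stdio.h>

struct ceph_client_demo { long long global_id; };
struct fs_client_demo   { struct ceph_client_demo *client; };
struct super_block_demo { void *s_fs_info; };
struct inode_demo       { struct super_block_demo *i_sb; };

static struct fs_client_demo *inode_to_fs_client(const struct inode_demo *inode)
{
	return (struct fs_client_demo *)inode->i_sb->s_fs_info;
}

static struct ceph_client_demo *inode_to_client(const struct inode_demo *inode)
{
	return inode_to_fs_client(inode)->client;
}

int main(void)
{
	struct ceph_client_demo cl = { .global_id = 4215 };
	struct fs_client_demo fsc = { .client = &cl };
	struct super_block_demo sb = { .s_fs_info = &fsc };
	struct inode_demo inode = { .i_sb = &sb };

	printf("client.%lld\n", inode_to_client(&inode)->global_id);
	return 0;
}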
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 0deae4a0f5f1..e066a556eccb 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -57,7 +57,8 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
+ struct ceph_client *cl = fsc->client;
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_string *pool_ns;
s64 pool = ci->i_layout.pool_id;
@@ -69,7 +70,7 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
- dout("ceph_vxattrcb_layout %p\n", &ci->netfs.inode);
+ doutc(cl, "%p\n", &ci->netfs.inode);
down_read(&osdc->lock);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
if (pool_name) {
@@ -161,7 +162,7 @@ static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
char *val, size_t size)
{
ssize_t ret;
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
s64 pool = ci->i_layout.pool_id;
const char *pool_name;
@@ -313,7 +314,7 @@ static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val,
static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci,
char *val, size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
return ceph_fmt_xattr(val, size, "%pU", &fsc->client->fsid);
}
@@ -321,7 +322,7 @@ static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci,
static ssize_t ceph_vxattrcb_client_id(struct ceph_inode_info *ci,
char *val, size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb);
return ceph_fmt_xattr(val, size, "client%lld",
ceph_client_gid(fsc->client));
@@ -570,6 +571,8 @@ static int __set_xattr(struct ceph_inode_info *ci,
int flags, int update_xattr,
struct ceph_inode_xattr **newxattr)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct rb_node **p;
struct rb_node *parent = NULL;
struct ceph_inode_xattr *xattr = NULL;
@@ -626,7 +629,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
xattr->should_free_name = update_xattr;
ci->i_xattrs.count++;
- dout("%s count=%d\n", __func__, ci->i_xattrs.count);
+ doutc(cl, "count=%d\n", ci->i_xattrs.count);
} else {
kfree(*newxattr);
*newxattr = NULL;
@@ -654,13 +657,13 @@ static int __set_xattr(struct ceph_inode_info *ci,
if (new) {
rb_link_node(&xattr->node, parent, p);
rb_insert_color(&xattr->node, &ci->i_xattrs.index);
- dout("%s p=%p\n", __func__, p);
+ doutc(cl, "p=%p\n", p);
}
- dout("%s added %llx.%llx xattr %p %.*s=%.*s%s\n", __func__,
- ceph_vinop(&ci->netfs.inode), xattr, name_len, name,
- min(val_len, MAX_XATTR_VAL_PRINT_LEN), val,
- val_len > MAX_XATTR_VAL_PRINT_LEN ? "..." : "");
+ doutc(cl, "added %p %llx.%llx xattr %p %.*s=%.*s%s\n", inode,
+ ceph_vinop(inode), xattr, name_len, name, min(val_len,
+ MAX_XATTR_VAL_PRINT_LEN), val,
+ val_len > MAX_XATTR_VAL_PRINT_LEN ? "..." : "");
return 0;
}
@@ -668,6 +671,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
const char *name)
{
+ struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
struct rb_node **p;
struct rb_node *parent = NULL;
struct ceph_inode_xattr *xattr = NULL;
@@ -688,13 +692,13 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
else {
int len = min(xattr->val_len, MAX_XATTR_VAL_PRINT_LEN);
- dout("%s %s: found %.*s%s\n", __func__, name, len,
- xattr->val, xattr->val_len > len ? "..." : "");
+ doutc(cl, "%s found %.*s%s\n", name, len, xattr->val,
+ xattr->val_len > len ? "..." : "");
return xattr;
}
}
- dout("%s %s: not found\n", __func__, name);
+ doutc(cl, "%s not found\n", name);
return NULL;
}
@@ -735,19 +739,20 @@ static int __remove_xattr(struct ceph_inode_info *ci,
static char *__copy_xattr_names(struct ceph_inode_info *ci,
char *dest)
{
+ struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
struct rb_node *p;
struct ceph_inode_xattr *xattr = NULL;
p = rb_first(&ci->i_xattrs.index);
- dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
+ doutc(cl, "count=%d\n", ci->i_xattrs.count);
while (p) {
xattr = rb_entry(p, struct ceph_inode_xattr, node);
memcpy(dest, xattr->name, xattr->name_len);
dest[xattr->name_len] = '\0';
- dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
- xattr->name_len, ci->i_xattrs.names_size);
+ doutc(cl, "dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
+ xattr->name_len, ci->i_xattrs.names_size);
dest += xattr->name_len + 1;
p = rb_next(p);
@@ -758,19 +763,19 @@ static char *__copy_xattr_names(struct ceph_inode_info *ci,
void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
{
+ struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
struct rb_node *p, *tmp;
struct ceph_inode_xattr *xattr = NULL;
p = rb_first(&ci->i_xattrs.index);
- dout("__ceph_destroy_xattrs p=%p\n", p);
+ doutc(cl, "p=%p\n", p);
while (p) {
xattr = rb_entry(p, struct ceph_inode_xattr, node);
tmp = p;
p = rb_next(tmp);
- dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
- xattr->name_len, xattr->name);
+ doutc(cl, "next p=%p (%.*s)\n", p, xattr->name_len, xattr->name);
rb_erase(tmp, &ci->i_xattrs.index);
__free_xattr(xattr);
@@ -787,6 +792,7 @@ static int __build_xattrs(struct inode *inode)
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
u32 namelen;
u32 numattr = 0;
void *p, *end;
@@ -798,8 +804,8 @@ static int __build_xattrs(struct inode *inode)
int err = 0;
int i;
- dout("__build_xattrs() len=%d\n",
- ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
+ doutc(cl, "len=%d\n",
+ ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
return 0; /* already built */
@@ -874,6 +880,8 @@ bad:
static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
int val_size)
{
+ struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
+
/*
* 4 bytes for the length, and additional 4 bytes per each xattr name,
* 4 bytes per each value
@@ -881,9 +889,8 @@ static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
int size = 4 + ci->i_xattrs.count*(4 + 4) +
ci->i_xattrs.names_size +
ci->i_xattrs.vals_size;
- dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
- ci->i_xattrs.count, ci->i_xattrs.names_size,
- ci->i_xattrs.vals_size);
+ doutc(cl, "c=%d names.size=%d vals.size=%d\n", ci->i_xattrs.count,
+ ci->i_xattrs.names_size, ci->i_xattrs.vals_size);
if (name_size)
size += 4 + 4 + name_size + val_size;
@@ -899,12 +906,14 @@ static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
*/
struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci)
{
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct rb_node *p;
struct ceph_inode_xattr *xattr = NULL;
struct ceph_buffer *old_blob = NULL;
void *dest;
- dout("__build_xattrs_blob %p\n", &ci->netfs.inode);
+ doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
if (ci->i_xattrs.dirty) {
int need = __get_required_blob_size(ci, 0, 0);
@@ -962,6 +971,7 @@ static inline int __get_request_mask(struct inode *in) {
ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
size_t size)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_inode_xattr *xattr;
struct ceph_vxattr *vxattr;
@@ -1000,8 +1010,9 @@ handle_non_vxattrs:
req_mask = __get_request_mask(inode);
spin_lock(&ci->i_ceph_lock);
- dout("getxattr %p name '%s' ver=%lld index_ver=%lld\n", inode, name,
- ci->i_xattrs.version, ci->i_xattrs.index_version);
+ doutc(cl, "%p %llx.%llx name '%s' ver=%lld index_ver=%lld\n", inode,
+ ceph_vinop(inode), name, ci->i_xattrs.version,
+ ci->i_xattrs.index_version);
if (ci->i_xattrs.version == 0 ||
!((req_mask & CEPH_CAP_XATTR_SHARED) ||
@@ -1010,8 +1021,9 @@ handle_non_vxattrs:
/* security module gets xattr while filling trace */
if (current->journal_info) {
- pr_warn_ratelimited("sync getxattr %p "
- "during filling trace\n", inode);
+ pr_warn_ratelimited_client(cl,
+ "sync %p %llx.%llx during filling trace\n",
+ inode, ceph_vinop(inode));
return -EBUSY;
}
@@ -1053,14 +1065,16 @@ out:
ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
{
struct inode *inode = d_inode(dentry);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
bool len_only = (size == 0);
u32 namelen;
int err;
spin_lock(&ci->i_ceph_lock);
- dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
- ci->i_xattrs.version, ci->i_xattrs.index_version);
+ doutc(cl, "%p %llx.%llx ver=%lld index_ver=%lld\n", inode,
+ ceph_vinop(inode), ci->i_xattrs.version,
+ ci->i_xattrs.index_version);
if (ci->i_xattrs.version == 0 ||
!__ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 1)) {
@@ -1094,7 +1108,8 @@ out:
static int ceph_sync_setxattr(struct inode *inode, const char *name,
const char *value, size_t size, int flags)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -1119,7 +1134,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
flags |= CEPH_XATTR_REMOVE;
}
- dout("setxattr value size: %zu\n", size);
+ doutc(cl, "name %s value size %zu\n", name, size);
/* do request */
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -1148,10 +1163,10 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
req->r_num_caps = 1;
req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
- dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
+ doutc(cl, "xattr.ver (before): %lld\n", ci->i_xattrs.version);
err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
- dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
+ doutc(cl, "xattr.ver (after): %lld\n", ci->i_xattrs.version);
out:
if (pagelist)
@@ -1162,9 +1177,10 @@ out:
int __ceph_setxattr(struct inode *inode, const char *name,
const void *value, size_t size, int flags)
{
+ struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_vxattr *vxattr;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
struct ceph_cap_flush *prealloc_cf = NULL;
struct ceph_buffer *old_blob = NULL;
int issued;
@@ -1220,9 +1236,9 @@ retry:
required_blob_size = __get_required_blob_size(ci, name_len, val_len);
if ((ci->i_xattrs.version == 0) || !(issued & CEPH_CAP_XATTR_EXCL) ||
(required_blob_size > mdsc->mdsmap->m_max_xattr_size)) {
- dout("%s do sync setxattr: version: %llu size: %d max: %llu\n",
- __func__, ci->i_xattrs.version, required_blob_size,
- mdsc->mdsmap->m_max_xattr_size);
+ doutc(cl, "sync version: %llu size: %d max: %llu\n",
+ ci->i_xattrs.version, required_blob_size,
+ mdsc->mdsmap->m_max_xattr_size);
goto do_sync;
}
@@ -1236,8 +1252,8 @@ retry:
}
}
- dout("setxattr %p name '%s' issued %s\n", inode, name,
- ceph_cap_string(issued));
+ doutc(cl, "%p %llx.%llx name '%s' issued %s\n", inode,
+ ceph_vinop(inode), name, ceph_cap_string(issued));
__build_xattrs(inode);
if (!ci->i_xattrs.prealloc_blob ||
@@ -1246,7 +1262,8 @@ retry:
spin_unlock(&ci->i_ceph_lock);
ceph_buffer_put(old_blob); /* Shouldn't be required */
- dout(" pre-allocating new blob size=%d\n", required_blob_size);
+ doutc(cl, " pre-allocating new blob size=%d\n",
+ required_blob_size);
blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
if (!blob)
goto do_sync_unlocked;
@@ -1285,8 +1302,9 @@ do_sync_unlocked:
/* security module set xattr while filling trace */
if (current->journal_info) {
- pr_warn_ratelimited("sync setxattr %p "
- "during filling trace\n", inode);
+ pr_warn_ratelimited_client(cl,
+ "sync %p %llx.%llx during filling trace\n",
+ inode, ceph_vinop(inode));
err = -EBUSY;
} else {
err = ceph_sync_setxattr(inode, name, value, size, flags);
@@ -1446,7 +1464,7 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx)
* List of handlers for synthetic system.* attributes. Other
* attributes are handled directly.
*/
-const struct xattr_handler *ceph_xattr_handlers[] = {
+const struct xattr_handler * const ceph_xattr_handlers[] = {
&ceph_other_xattr_handler,
NULL,
};
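
The ceph hunks above all follow one conversion: each debug call now needs a struct ceph_client pointer, obtained via ceph_inode_to_client(), and most messages gain the inode's vino via ceph_vinop(). A minimal sketch of the pattern, assuming the doutc() and pr_warn_ratelimited_client() helpers introduced earlier in this series (the function below is hypothetical, not part of the patch):

static void example_xattr_debug(struct inode *inode)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);

	/* old: dout("getxattr %p ...\n", inode, ...); */
	/* new: message is tagged with the client and carries the vino */
	doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
}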
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 950b6919fb87..57cc096c498a 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -25,7 +25,7 @@
#include "internal.h"
-static struct kobj_map *cdev_map;
+static struct kobj_map *cdev_map __ro_after_init;
static DEFINE_MUTEX(chrdevs_lock);
@@ -350,7 +350,7 @@ static struct kobject *cdev_get(struct cdev *p)
struct module *owner = p->owner;
struct kobject *kobj;
- if (owner && !try_module_get(owner))
+ if (!try_module_get(owner))
return NULL;
kobj = kobject_get_unless_zero(&p->kobj);
if (!kobj)
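
The cdev_get() change relies on try_module_get() accepting a NULL module: built-in drivers have no owner, and try_module_get(NULL) succeeds, so the explicit owner check was redundant. A minimal sketch of that equivalence (hypothetical helper):

static bool example_cdev_owner_ref(struct module *owner)
{
	/* returns true immediately when owner == NULL (built-in code),
	 * otherwise takes a reference on the owning module */
	return try_module_get(owner);
}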
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index ae023853a98f..1d2dac95f86a 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -123,9 +123,11 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
if (attr->va_size != -1)
inode->i_blocks = (attr->va_size + 511) >> 9;
if (attr->va_atime.tv_sec != -1)
- inode->i_atime = coda_to_timespec64(attr->va_atime);
+ inode_set_atime_to_ts(inode,
+ coda_to_timespec64(attr->va_atime));
if (attr->va_mtime.tv_sec != -1)
- inode->i_mtime = coda_to_timespec64(attr->va_mtime);
+ inode_set_mtime_to_ts(inode,
+ coda_to_timespec64(attr->va_mtime));
if (attr->va_ctime.tv_sec != -1)
inode_set_ctime_to_ts(inode,
coda_to_timespec64(attr->va_ctime));
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index cb512b10473b..4e552ba7bd43 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -111,7 +111,7 @@ static inline void coda_dir_update_mtime(struct inode *dir)
/* optimistically we can also act as if our nose bleeds. The
* granularity of the mtime is coarse anyways so we might actually be
* right most of the time. Note: we only do this for directories. */
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
#endif
}
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 42346618b4ed..16acc58311ea 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -84,7 +84,7 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0);
coda_inode->i_size = file_inode(host_file)->i_size;
coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
- coda_inode->i_mtime = inode_set_ctime_current(coda_inode);
+ inode_set_mtime_to_ts(coda_inode, inode_set_ctime_current(coda_inode));
inode_unlock(coda_inode);
file_end_write(host_file);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index fbdcb3582926..dcc22f593e43 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -88,7 +88,7 @@ int configfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
static inline void set_default_inode_attr(struct inode * inode, umode_t mode)
{
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
@@ -96,8 +96,8 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
inode->i_mode = iattr->ia_mode;
inode->i_uid = iattr->ia_uid;
inode->i_gid = iattr->ia_gid;
- inode->i_atime = iattr->ia_atime;
- inode->i_mtime = iattr->ia_mtime;
+ inode_set_atime_to_ts(inode, iattr->ia_atime);
+ inode_set_mtime_to_ts(inode, iattr->ia_mtime);
inode_set_ctime_to_ts(inode, iattr->ia_ctime);
}
@@ -171,7 +171,7 @@ struct inode *configfs_create(struct dentry *dentry, umode_t mode)
return ERR_PTR(-ENOMEM);
p_inode = d_inode(dentry->d_parent);
- p_inode->i_mtime = inode_set_ctime_current(p_inode);
+ inode_set_mtime_to_ts(p_inode, inode_set_ctime_current(p_inode));
configfs_set_inode_lock_class(sd, inode);
return inode;
}
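
The coda and configfs hunks above are part of the tree-wide switch from assigning inode->i_atime/i_mtime directly to using accessor helpers. The nested calls seen in several hunks work because each setter returns the timespec64 it stored. A minimal sketch of the pattern, under that assumption (hypothetical function name):

static void example_touch_dir(struct inode *dir)
{
	/* old: dir->i_mtime = inode_set_ctime_current(dir); */
	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
}

For freshly initialized inodes that want atime = mtime = ctime = now, simple_inode_init_ts() replaces the old chained assignment, as in the configfs set_default_inode_attr() hunk.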
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 5ee7d7bbb361..60dbfa0f8805 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -133,8 +133,8 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
}
/* Struct copy intentional */
- inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode,
- zerotime);
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, zerotime)));
/* inode->i_nlink is left 1 - arguably wrong for directories,
but it's the best we can do without reading the directory
contents. 1 yields the right result in GNU find, even
@@ -495,7 +495,7 @@ static void cramfs_kill_sb(struct super_block *sb)
sb->s_mtd = NULL;
} else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) {
sync_blockdev(sb->s_bdev);
- blkdev_put(sb->s_bdev, sb);
+ bdev_release(sb->s_bdev_handle);
}
kfree(sbi);
}
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 62e1a3dd8357..0ad8c30b8fa5 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -111,10 +111,14 @@ out:
int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
sector_t pblk, unsigned int len)
{
- const unsigned int blockbits = inode->i_blkbits;
- const unsigned int blocksize = 1 << blockbits;
- const unsigned int blocks_per_page_bits = PAGE_SHIFT - blockbits;
- const unsigned int blocks_per_page = 1 << blocks_per_page_bits;
+ const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const unsigned int du_bits = ci->ci_data_unit_bits;
+ const unsigned int du_size = 1U << du_bits;
+ const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits;
+ const unsigned int du_per_page = 1U << du_per_page_bits;
+ u64 du_index = (u64)lblk << (inode->i_blkbits - du_bits);
+ u64 du_remaining = (u64)len << (inode->i_blkbits - du_bits);
+ sector_t sector = pblk << (inode->i_blkbits - SECTOR_SHIFT);
struct page *pages[16]; /* write up to 16 pages at a time */
unsigned int nr_pages;
unsigned int i;
@@ -130,8 +134,8 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
len);
BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_VECS);
- nr_pages = min_t(unsigned int, ARRAY_SIZE(pages),
- (len + blocks_per_page - 1) >> blocks_per_page_bits);
+ nr_pages = min_t(u64, ARRAY_SIZE(pages),
+ (du_remaining + du_per_page - 1) >> du_per_page_bits);
/*
* We need at least one page for ciphertext. Allocate the first one
@@ -154,21 +158,22 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
bio = bio_alloc(inode->i_sb->s_bdev, nr_pages, REQ_OP_WRITE, GFP_NOFS);
do {
- bio->bi_iter.bi_sector = pblk << (blockbits - 9);
+ bio->bi_iter.bi_sector = sector;
i = 0;
offset = 0;
do {
- err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk,
- ZERO_PAGE(0), pages[i],
- blocksize, offset, GFP_NOFS);
+ err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, du_index,
+ ZERO_PAGE(0), pages[i],
+ du_size, offset,
+ GFP_NOFS);
if (err)
goto out;
- lblk++;
- pblk++;
- len--;
- offset += blocksize;
- if (offset == PAGE_SIZE || len == 0) {
+ du_index++;
+ sector += 1U << (du_bits - SECTOR_SHIFT);
+ du_remaining--;
+ offset += du_size;
+ if (offset == PAGE_SIZE || du_remaining == 0) {
ret = bio_add_page(bio, pages[i++], offset, 0);
if (WARN_ON_ONCE(ret != offset)) {
err = -EIO;
@@ -176,13 +181,13 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
}
offset = 0;
}
- } while (i != nr_pages && len != 0);
+ } while (i != nr_pages && du_remaining != 0);
err = submit_bio_wait(bio);
if (err)
goto out;
bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE);
- } while (len != 0);
+ } while (du_remaining != 0);
err = 0;
out:
bio_put(bio);
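
The fscrypt_zeroout_range() rewrite above converts the block-based loop into a data-unit-based one. The shift amounts follow directly from the new fields: with 4096-byte filesystem blocks (i_blkbits = 12) and, say, 1024-byte data units (du_bits = 10), each block covers four data units, and each data unit advances the sector by 1U << (du_bits - SECTOR_SHIFT) = 2 sectors. A minimal sketch of that geometry, assuming those example sizes (hypothetical helper):

static void example_zeroout_geometry(const struct inode *inode, pgoff_t lblk,
				     sector_t pblk, unsigned int len,
				     unsigned int du_bits, u64 *du_index,
				     u64 *du_count, sector_t *sector)
{
	/* e.g. i_blkbits = 12 (4096-byte blocks), du_bits = 10 (1024-byte DUs) */
	*du_index = (u64)lblk << (inode->i_blkbits - du_bits);	/* lblk * 4 */
	*du_count = (u64)len << (inode->i_blkbits - du_bits);	/* len * 4 */
	*sector = pblk << (inode->i_blkbits - SECTOR_SHIFT);	/* pblk * 8 */
	/* each data unit then advances *sector by 1U << (du_bits - SECTOR_SHIFT) */
}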
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 6a837e4b80dc..328470d40dec 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -39,7 +39,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL;
static struct workqueue_struct *fscrypt_read_workqueue;
static DEFINE_MUTEX(fscrypt_init_mutex);
-struct kmem_cache *fscrypt_info_cachep;
+struct kmem_cache *fscrypt_inode_info_cachep;
void fscrypt_enqueue_decrypt_work(struct work_struct *work)
{
@@ -49,6 +49,13 @@ EXPORT_SYMBOL(fscrypt_enqueue_decrypt_work);
struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags)
{
+ if (WARN_ON_ONCE(!fscrypt_bounce_page_pool)) {
+ /*
+ * Oops, the filesystem called a function that uses the bounce
+ * page pool, but it didn't set needs_bounce_pages.
+ */
+ return NULL;
+ }
return mempool_alloc(fscrypt_bounce_page_pool, gfp_flags);
}
@@ -70,44 +77,44 @@ void fscrypt_free_bounce_page(struct page *bounce_page)
EXPORT_SYMBOL(fscrypt_free_bounce_page);
/*
- * Generate the IV for the given logical block number within the given file.
- * For filenames encryption, lblk_num == 0.
+ * Generate the IV for the given data unit index within the given file.
+ * For filenames encryption, index == 0.
*
* Keep this in sync with fscrypt_limit_io_blocks(). fscrypt_limit_io_blocks()
* needs to know about any IV generation methods where the low bits of IV don't
- * simply contain the lblk_num (e.g., IV_INO_LBLK_32).
+ * simply contain the data unit index (e.g., IV_INO_LBLK_32).
*/
-void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
- const struct fscrypt_info *ci)
+void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index,
+ const struct fscrypt_inode_info *ci)
{
u8 flags = fscrypt_policy_flags(&ci->ci_policy);
memset(iv, 0, ci->ci_mode->ivsize);
if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) {
- WARN_ON_ONCE(lblk_num > U32_MAX);
+ WARN_ON_ONCE(index > U32_MAX);
WARN_ON_ONCE(ci->ci_inode->i_ino > U32_MAX);
- lblk_num |= (u64)ci->ci_inode->i_ino << 32;
+ index |= (u64)ci->ci_inode->i_ino << 32;
} else if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) {
- WARN_ON_ONCE(lblk_num > U32_MAX);
- lblk_num = (u32)(ci->ci_hashed_ino + lblk_num);
+ WARN_ON_ONCE(index > U32_MAX);
+ index = (u32)(ci->ci_hashed_ino + index);
} else if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
memcpy(iv->nonce, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE);
}
- iv->lblk_num = cpu_to_le64(lblk_num);
+ iv->index = cpu_to_le64(index);
}
-/* Encrypt or decrypt a single filesystem block of file contents */
-int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
- u64 lblk_num, struct page *src_page,
- struct page *dest_page, unsigned int len,
- unsigned int offs, gfp_t gfp_flags)
+/* Encrypt or decrypt a single "data unit" of file contents. */
+int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
+ fscrypt_direction_t rw, u64 index,
+ struct page *src_page, struct page *dest_page,
+ unsigned int len, unsigned int offs,
+ gfp_t gfp_flags)
{
union fscrypt_iv iv;
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
struct scatterlist dst, src;
- struct fscrypt_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
int res = 0;
@@ -116,7 +123,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
if (WARN_ON_ONCE(len % FSCRYPT_CONTENTS_ALIGNMENT != 0))
return -EINVAL;
- fscrypt_generate_iv(&iv, lblk_num, ci);
+ fscrypt_generate_iv(&iv, index, ci);
req = skcipher_request_alloc(tfm, gfp_flags);
if (!req)
@@ -137,28 +144,29 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
skcipher_request_free(req);
if (res) {
- fscrypt_err(inode, "%scryption failed for block %llu: %d",
- (rw == FS_DECRYPT ? "De" : "En"), lblk_num, res);
+ fscrypt_err(ci->ci_inode,
+ "%scryption failed for data unit %llu: %d",
+ (rw == FS_DECRYPT ? "De" : "En"), index, res);
return res;
}
return 0;
}
/**
- * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a
- * pagecache page
- * @page: The locked pagecache page containing the block(s) to encrypt
- * @len: Total size of the block(s) to encrypt. Must be a nonzero
- * multiple of the filesystem's block size.
- * @offs: Byte offset within @page of the first block to encrypt. Must be
- * a multiple of the filesystem's block size.
- * @gfp_flags: Memory allocation flags. See details below.
+ * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache page
+ * @page: the locked pagecache page containing the data to encrypt
+ * @len: size of the data to encrypt, in bytes
+ * @offs: offset within @page of the data to encrypt, in bytes
+ * @gfp_flags: memory allocation flags; see details below
+ *
+ * This allocates a new bounce page and encrypts the given data into it. The
+ * length and offset of the data must be aligned to the file's crypto data unit
+ * size. Alignment to the filesystem block size fulfills this requirement, as
+ * the filesystem block size is always a multiple of the data unit size.
*
- * A new bounce page is allocated, and the specified block(s) are encrypted into
- * it. In the bounce page, the ciphertext block(s) will be located at the same
- * offsets at which the plaintext block(s) were located in the source page; any
- * other parts of the bounce page will be left uninitialized. However, normally
- * blocksize == PAGE_SIZE and the whole page is encrypted at once.
+ * In the bounce page, the ciphertext data will be located at the same offset at
+ * which the plaintext data was located in the source page. Any other parts of
+ * the bounce page will be left uninitialized.
*
* This is for use by the filesystem's ->writepages() method.
*
@@ -176,28 +184,29 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
{
const struct inode *inode = page->mapping->host;
- const unsigned int blockbits = inode->i_blkbits;
- const unsigned int blocksize = 1 << blockbits;
+ const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const unsigned int du_bits = ci->ci_data_unit_bits;
+ const unsigned int du_size = 1U << du_bits;
struct page *ciphertext_page;
- u64 lblk_num = ((u64)page->index << (PAGE_SHIFT - blockbits)) +
- (offs >> blockbits);
+ u64 index = ((u64)page->index << (PAGE_SHIFT - du_bits)) +
+ (offs >> du_bits);
unsigned int i;
int err;
if (WARN_ON_ONCE(!PageLocked(page)))
return ERR_PTR(-EINVAL);
- if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize)))
+ if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size)))
return ERR_PTR(-EINVAL);
ciphertext_page = fscrypt_alloc_bounce_page(gfp_flags);
if (!ciphertext_page)
return ERR_PTR(-ENOMEM);
- for (i = offs; i < offs + len; i += blocksize, lblk_num++) {
- err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk_num,
- page, ciphertext_page,
- blocksize, i, gfp_flags);
+ for (i = offs; i < offs + len; i += du_size, index++) {
+ err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, index,
+ page, ciphertext_page,
+ du_size, i, gfp_flags);
if (err) {
fscrypt_free_bounce_page(ciphertext_page);
return ERR_PTR(err);
@@ -224,30 +233,33 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
* arbitrary page, not necessarily in the original pagecache page. The @inode
* and @lblk_num must be specified, as they can't be determined from @page.
*
+ * This is not compatible with fscrypt_operations::supports_subblock_data_units.
+ *
* Return: 0 on success; -errno on failure
*/
int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
unsigned int len, unsigned int offs,
u64 lblk_num, gfp_t gfp_flags)
{
- return fscrypt_crypt_block(inode, FS_ENCRYPT, lblk_num, page, page,
- len, offs, gfp_flags);
+ if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
+ return -EOPNOTSUPP;
+ return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT,
+ lblk_num, page, page, len, offs,
+ gfp_flags);
}
EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
/**
- * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a
- * pagecache folio
- * @folio: The locked pagecache folio containing the block(s) to decrypt
- * @len: Total size of the block(s) to decrypt. Must be a nonzero
- * multiple of the filesystem's block size.
- * @offs: Byte offset within @folio of the first block to decrypt. Must be
- * a multiple of the filesystem's block size.
+ * fscrypt_decrypt_pagecache_blocks() - Decrypt data from a pagecache folio
+ * @folio: the pagecache folio containing the data to decrypt
+ * @len: size of the data to decrypt, in bytes
+ * @offs: offset within @folio of the data to decrypt, in bytes
*
- * The specified block(s) are decrypted in-place within the pagecache folio,
- * which must still be locked and not uptodate.
- *
- * This is for use by the filesystem's ->readahead() method.
+ * Decrypt data that has just been read from an encrypted file. The data must
+ * be located in a pagecache folio that is still locked and not yet uptodate.
+ * The length and offset of the data must be aligned to the file's crypto data
+ * unit size. Alignment to the filesystem block size fulfills this requirement,
+ * as the filesystem block size is always a multiple of the data unit size.
*
* Return: 0 on success; -errno on failure
*/
@@ -255,25 +267,26 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len,
size_t offs)
{
const struct inode *inode = folio->mapping->host;
- const unsigned int blockbits = inode->i_blkbits;
- const unsigned int blocksize = 1 << blockbits;
- u64 lblk_num = ((u64)folio->index << (PAGE_SHIFT - blockbits)) +
- (offs >> blockbits);
+ const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const unsigned int du_bits = ci->ci_data_unit_bits;
+ const unsigned int du_size = 1U << du_bits;
+ u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) +
+ (offs >> du_bits);
size_t i;
int err;
if (WARN_ON_ONCE(!folio_test_locked(folio)))
return -EINVAL;
- if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize)))
+ if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size)))
return -EINVAL;
- for (i = offs; i < offs + len; i += blocksize, lblk_num++) {
+ for (i = offs; i < offs + len; i += du_size, index++) {
struct page *page = folio_page(folio, i >> PAGE_SHIFT);
- err = fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page,
- page, blocksize, i & ~PAGE_MASK,
- GFP_NOFS);
+ err = fscrypt_crypt_data_unit(ci, FS_DECRYPT, index, page,
+ page, du_size, i & ~PAGE_MASK,
+ GFP_NOFS);
if (err)
return err;
}
@@ -295,14 +308,19 @@ EXPORT_SYMBOL(fscrypt_decrypt_pagecache_blocks);
* arbitrary page, not necessarily in the original pagecache page. The @inode
* and @lblk_num must be specified, as they can't be determined from @page.
*
+ * This is not compatible with fscrypt_operations::supports_subblock_data_units.
+ *
* Return: 0 on success; -errno on failure
*/
int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
unsigned int len, unsigned int offs,
u64 lblk_num)
{
- return fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page, page,
- len, offs, GFP_NOFS);
+ if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
+ return -EOPNOTSUPP;
+ return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT,
+ lblk_num, page, page, len, offs,
+ GFP_NOFS);
}
EXPORT_SYMBOL(fscrypt_decrypt_block_inplace);
@@ -325,7 +343,7 @@ int fscrypt_initialize(struct super_block *sb)
return 0;
/* No need to allocate a bounce page pool if this FS won't use it. */
- if (sb->s_cop->flags & FS_CFLG_OWN_PAGES)
+ if (!sb->s_cop->needs_bounce_pages)
return 0;
mutex_lock(&fscrypt_init_mutex);
@@ -391,18 +409,19 @@ static int __init fscrypt_init(void)
if (!fscrypt_read_workqueue)
goto fail;
- fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT);
- if (!fscrypt_info_cachep)
+ fscrypt_inode_info_cachep = KMEM_CACHE(fscrypt_inode_info,
+ SLAB_RECLAIM_ACCOUNT);
+ if (!fscrypt_inode_info_cachep)
goto fail_free_queue;
err = fscrypt_init_keyring();
if (err)
- goto fail_free_info;
+ goto fail_free_inode_info;
return 0;
-fail_free_info:
- kmem_cache_destroy(fscrypt_info_cachep);
+fail_free_inode_info:
+ kmem_cache_destroy(fscrypt_inode_info_cachep);
fail_free_queue:
destroy_workqueue(fscrypt_read_workqueue);
fail:
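
The crypto.c hunks above change the low-level primitive from fscrypt_crypt_block(), keyed by inode and logical block number, to fscrypt_crypt_data_unit(), keyed by the fscrypt_inode_info and a data unit index. A minimal sketch of how a pagecache-based caller derives that index, mirroring how the updated fscrypt_encrypt_pagecache_blocks() computes it (the real function encrypts into a bounce page and these are fscrypt-private interfaces; this in-place loop is only an illustration):

static int example_crypt_page_units(struct page *page, unsigned int len,
				    unsigned int offs, gfp_t gfp_flags)
{
	const struct inode *inode = page->mapping->host;
	const struct fscrypt_inode_info *ci = inode->i_crypt_info;
	const unsigned int du_bits = ci->ci_data_unit_bits;
	const unsigned int du_size = 1U << du_bits;
	/* index of the first data unit covered by [offs, offs + len) */
	u64 index = ((u64)page->index << (PAGE_SHIFT - du_bits)) +
		    (offs >> du_bits);
	unsigned int i;
	int err;

	for (i = offs; i < offs + len; i += du_size, index++) {
		err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, index, page,
					      page, du_size, i, gfp_flags);
		if (err)
			return err;
	}
	return 0;
}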
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 6eae3f12ad50..7b3fc189593a 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -100,7 +100,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
{
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
- const struct fscrypt_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
union fscrypt_iv iv;
struct scatterlist sg;
@@ -157,7 +157,7 @@ static int fname_decrypt(const struct inode *inode,
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
struct scatterlist src_sg, dst_sg;
- const struct fscrypt_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
union fscrypt_iv iv;
int res;
@@ -568,7 +568,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name);
*/
u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name)
{
- const struct fscrypt_info *ci = dir->i_crypt_info;
+ const struct fscrypt_inode_info *ci = dir->i_crypt_info;
WARN_ON_ONCE(!ci->ci_dirhash_key_initialized);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 2d63da48635a..1892356cf924 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -47,7 +47,8 @@ struct fscrypt_context_v2 {
u8 contents_encryption_mode;
u8 filenames_encryption_mode;
u8 flags;
- u8 __reserved[4];
+ u8 log2_data_unit_size;
+ u8 __reserved[3];
u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE];
u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
};
@@ -165,6 +166,26 @@ fscrypt_policy_flags(const union fscrypt_policy *policy)
BUG();
}
+static inline int
+fscrypt_policy_v2_du_bits(const struct fscrypt_policy_v2 *policy,
+ const struct inode *inode)
+{
+ return policy->log2_data_unit_size ?: inode->i_blkbits;
+}
+
+static inline int
+fscrypt_policy_du_bits(const union fscrypt_policy *policy,
+ const struct inode *inode)
+{
+ switch (policy->version) {
+ case FSCRYPT_POLICY_V1:
+ return inode->i_blkbits;
+ case FSCRYPT_POLICY_V2:
+ return fscrypt_policy_v2_du_bits(&policy->v2, inode);
+ }
+ BUG();
+}
+
/*
* For encrypted symlinks, the ciphertext length is stored at the beginning
* of the string in little-endian format.
@@ -189,18 +210,18 @@ struct fscrypt_prepared_key {
};
/*
- * fscrypt_info - the "encryption key" for an inode
+ * fscrypt_inode_info - the "encryption key" for an inode
*
* When an encrypted file's key is made available, an instance of this struct is
* allocated and stored in ->i_crypt_info. Once created, it remains until the
* inode is evicted.
*/
-struct fscrypt_info {
+struct fscrypt_inode_info {
/* The key in a form prepared for actual encryption/decryption */
struct fscrypt_prepared_key ci_enc_key;
- /* True if ci_enc_key should be freed when this fscrypt_info is freed */
+ /* True if ci_enc_key should be freed when this struct is freed */
bool ci_owns_key;
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
@@ -212,6 +233,16 @@ struct fscrypt_info {
#endif
/*
+ * log2 of the data unit size (granularity of contents encryption) of
+ * this file. This is computable from ci_policy and ci_inode but is
+ * cached here for efficiency. Only used for regular files.
+ */
+ u8 ci_data_unit_bits;
+
+ /* Cached value: log2 of number of data units per FS block */
+ u8 ci_data_units_per_block_bits;
+
+ /*
* Encryption mode used for this inode. It corresponds to either the
* contents or filenames encryption mode, depending on the inode type.
*/
@@ -263,12 +294,13 @@ typedef enum {
} fscrypt_direction_t;
/* crypto.c */
-extern struct kmem_cache *fscrypt_info_cachep;
+extern struct kmem_cache *fscrypt_inode_info_cachep;
int fscrypt_initialize(struct super_block *sb);
-int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
- u64 lblk_num, struct page *src_page,
- struct page *dest_page, unsigned int len,
- unsigned int offs, gfp_t gfp_flags);
+int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
+ fscrypt_direction_t rw, u64 index,
+ struct page *src_page, struct page *dest_page,
+ unsigned int len, unsigned int offs,
+ gfp_t gfp_flags);
struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags);
void __printf(3, 4) __cold
@@ -283,8 +315,8 @@ fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...);
union fscrypt_iv {
struct {
- /* logical block number within the file */
- __le64 lblk_num;
+ /* zero-based index of data unit within the file */
+ __le64 index;
/* per-file nonce; only set in DIRECT_KEY mode */
u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
@@ -293,8 +325,18 @@ union fscrypt_iv {
__le64 dun[FSCRYPT_MAX_IV_SIZE / sizeof(__le64)];
};
-void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
- const struct fscrypt_info *ci);
+void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index,
+ const struct fscrypt_inode_info *ci);
+
+/*
+ * Return the number of bits used by the maximum file data unit index that is
+ * possible on the given filesystem, using the given log2 data unit size.
+ */
+static inline int
+fscrypt_max_file_dun_bits(const struct super_block *sb, int du_bits)
+{
+ return fls64(sb->s_maxbytes - 1) - du_bits;
+}
/* fname.c */
bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
@@ -332,17 +374,17 @@ void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf);
/* inline_crypt.c */
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
-int fscrypt_select_encryption_impl(struct fscrypt_info *ci);
+int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci);
static inline bool
-fscrypt_using_inline_encryption(const struct fscrypt_info *ci)
+fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
{
return ci->ci_inlinecrypt;
}
int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
const u8 *raw_key,
- const struct fscrypt_info *ci);
+ const struct fscrypt_inode_info *ci);
void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
struct fscrypt_prepared_key *prep_key);
@@ -353,7 +395,7 @@ void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
*/
static inline bool
fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
- const struct fscrypt_info *ci)
+ const struct fscrypt_inode_info *ci)
{
/*
* The two smp_load_acquire()'s here pair with the smp_store_release()'s
@@ -370,13 +412,13 @@ fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
#else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */
-static inline int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
+static inline int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci)
{
return 0;
}
static inline bool
-fscrypt_using_inline_encryption(const struct fscrypt_info *ci)
+fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
{
return false;
}
@@ -384,7 +426,7 @@ fscrypt_using_inline_encryption(const struct fscrypt_info *ci)
static inline int
fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
const u8 *raw_key,
- const struct fscrypt_info *ci)
+ const struct fscrypt_inode_info *ci)
{
WARN_ON_ONCE(1);
return -EOPNOTSUPP;
@@ -398,7 +440,7 @@ fscrypt_destroy_inline_crypt_key(struct super_block *sb,
static inline bool
fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
- const struct fscrypt_info *ci)
+ const struct fscrypt_inode_info *ci)
{
return smp_load_acquire(&prep_key->tfm) != NULL;
}
@@ -433,8 +475,28 @@ struct fscrypt_master_key_secret {
* fscrypt_master_key - an in-use master key
*
* This represents a master encryption key which has been added to the
- * filesystem and can be used to "unlock" the encrypted files which were
- * encrypted with it.
+ * filesystem. There are three high-level states that a key can be in:
+ *
+ * FSCRYPT_KEY_STATUS_PRESENT
+ * Key is fully usable; it can be used to unlock inodes that are encrypted
+ * with it (this includes being able to create new inodes). ->mk_present
+ * indicates whether the key is in this state. ->mk_secret exists, the key
+ * is in the keyring, and ->mk_active_refs > 0 due to ->mk_present.
+ *
+ * FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED
+ * Removal of this key has been initiated, but some inodes that were
+ * unlocked with it are still in-use. Like ABSENT, ->mk_secret is wiped,
+ * and the key can no longer be used to unlock inodes. Unlike ABSENT, the
+ * key is still in the keyring; ->mk_decrypted_inodes is nonempty; and
+ * ->mk_active_refs > 0, being equal to the size of ->mk_decrypted_inodes.
+ *
+ * This state transitions to ABSENT if ->mk_decrypted_inodes becomes empty,
+ * or to PRESENT if FS_IOC_ADD_ENCRYPTION_KEY is called again for this key.
+ *
+ * FSCRYPT_KEY_STATUS_ABSENT
+ * Key is fully removed. The key is no longer in the keyring,
+ * ->mk_decrypted_inodes is empty, ->mk_active_refs == 0, ->mk_secret is
+ * wiped, and the key can no longer be used to unlock inodes.
*/
struct fscrypt_master_key {
@@ -444,7 +506,7 @@ struct fscrypt_master_key {
*/
struct hlist_node mk_node;
- /* Semaphore that protects ->mk_secret and ->mk_users */
+ /* Semaphore that protects ->mk_secret, ->mk_users, and ->mk_present */
struct rw_semaphore mk_sem;
/*
@@ -454,8 +516,8 @@ struct fscrypt_master_key {
* ->mk_direct_keys) that have been prepared continue to exist.
* A structural ref only guarantees that the struct continues to exist.
*
- * There is one active ref associated with ->mk_secret being present,
- * and one active ref for each inode in ->mk_decrypted_inodes.
+ * There is one active ref associated with ->mk_present being true, and
+ * one active ref for each inode in ->mk_decrypted_inodes.
*
* There is one structural ref associated with the active refcount being
* nonzero. Finding a key in the keyring also takes a structural ref,
@@ -467,17 +529,10 @@ struct fscrypt_master_key {
struct rcu_head mk_rcu_head;
/*
- * The secret key material. After FS_IOC_REMOVE_ENCRYPTION_KEY is
- * executed, this is wiped and no new inodes can be unlocked with this
- * key; however, there may still be inodes in ->mk_decrypted_inodes
- * which could not be evicted. As long as some inodes still remain,
- * FS_IOC_REMOVE_ENCRYPTION_KEY can be retried, or
- * FS_IOC_ADD_ENCRYPTION_KEY can add the secret again.
+ * The secret key material. Wiped as soon as it is no longer needed;
+ * for details, see the fscrypt_master_key struct comment.
*
- * While ->mk_secret is present, one ref in ->mk_active_refs is held.
- *
- * Locking: protected by ->mk_sem. The manipulation of ->mk_active_refs
- * associated with this field is protected by ->mk_sem as well.
+ * Locking: protected by ->mk_sem.
*/
struct fscrypt_master_key_secret mk_secret;
@@ -500,7 +555,7 @@ struct fscrypt_master_key {
*
* Locking: protected by ->mk_sem. (We don't just rely on the keyrings
* subsystem semaphore ->mk_users->sem, as we need support for atomic
- * search+insert along with proper synchronization with ->mk_secret.)
+ * search+insert along with proper synchronization with other fields.)
*/
struct key *mk_users;
@@ -523,20 +578,17 @@ struct fscrypt_master_key {
siphash_key_t mk_ino_hash_key;
bool mk_ino_hash_key_initialized;
-} __randomize_layout;
-
-static inline bool
-is_master_key_secret_present(const struct fscrypt_master_key_secret *secret)
-{
/*
- * The READ_ONCE() is only necessary for fscrypt_drop_inode().
- * fscrypt_drop_inode() runs in atomic context, so it can't take the key
- * semaphore and thus 'secret' can change concurrently which would be a
- * data race. But fscrypt_drop_inode() only need to know whether the
- * secret *was* present at the time of check, so READ_ONCE() suffices.
+ * Whether this key is in the "present" state, i.e. fully usable. For
+ * details, see the fscrypt_master_key struct comment.
+ *
+ * Locking: protected by ->mk_sem, but can be read locklessly using
+ * READ_ONCE(). Writers must use WRITE_ONCE() when concurrent readers
+ * are possible.
*/
- return READ_ONCE(secret->size) != 0;
-}
+ bool mk_present;
+
+} __randomize_layout;
static inline const char *master_key_spec_type(
const struct fscrypt_key_specifier *spec)
@@ -598,17 +650,18 @@ struct fscrypt_mode {
extern struct fscrypt_mode fscrypt_modes[];
int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
- const u8 *raw_key, const struct fscrypt_info *ci);
+ const u8 *raw_key, const struct fscrypt_inode_info *ci);
void fscrypt_destroy_prepared_key(struct super_block *sb,
struct fscrypt_prepared_key *prep_key);
-int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key);
+int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci,
+ const u8 *raw_key);
-int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
+int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
const struct fscrypt_master_key *mk);
-void fscrypt_hash_inode_number(struct fscrypt_info *ci,
+void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci,
const struct fscrypt_master_key *mk);
int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported);
@@ -643,10 +696,11 @@ static inline int fscrypt_require_key(struct inode *inode)
void fscrypt_put_direct_key(struct fscrypt_direct_key *dk);
-int fscrypt_setup_v1_file_key(struct fscrypt_info *ci,
+int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci,
const u8 *raw_master_key);
-int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci);
+int fscrypt_setup_v1_file_key_via_subscribed_keyrings(
+ struct fscrypt_inode_info *ci);
/* policy.c */
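
The fscrypt_private.h hunks above show where the data unit size comes from: the v2 policy (and on-disk context) gains a log2_data_unit_size byte, with 0 meaning "one data unit per filesystem block", and the result is cached in the new ci_data_unit_bits and ci_data_units_per_block_bits fields. A worked sketch of the derivation, assuming 4096-byte filesystem blocks (hypothetical helper):

static void example_du_bits(const struct fscrypt_policy_v2 *policy,
			    const struct inode *inode,
			    u8 *du_bits, u8 *du_per_block_bits)
{
	/* log2_data_unit_size == 0  -> du_bits = i_blkbits = 12 (4096-byte DUs)
	 * log2_data_unit_size == 10 -> du_bits = 10          (1024-byte DUs) */
	*du_bits = policy->log2_data_unit_size ?: inode->i_blkbits;

	/* cached shift: data units per filesystem block (0 or 2 above) */
	*du_per_block_bits = inode->i_blkbits - *du_bits;
}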
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 6238dbcadcad..52504dd478d3 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -169,7 +169,7 @@ EXPORT_SYMBOL_GPL(__fscrypt_prepare_setattr);
int fscrypt_prepare_setflags(struct inode *inode,
unsigned int oldflags, unsigned int flags)
{
- struct fscrypt_info *ci;
+ struct fscrypt_inode_info *ci;
struct fscrypt_master_key *mk;
int err;
@@ -187,7 +187,7 @@ int fscrypt_prepare_setflags(struct inode *inode,
return -EINVAL;
mk = ci->ci_master_key;
down_read(&mk->mk_sem);
- if (is_master_key_secret_present(&mk->mk_secret))
+ if (mk->mk_present)
err = fscrypt_derive_dirhash_key(ci, mk);
else
err = -ENOKEY;
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 8bfb3ce86476..b4002aea7cdb 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -39,11 +39,11 @@ static struct block_device **fscrypt_get_devices(struct super_block *sb,
return devs;
}
-static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci)
+static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_inode_info *ci)
{
- struct super_block *sb = ci->ci_inode->i_sb;
+ const struct super_block *sb = ci->ci_inode->i_sb;
unsigned int flags = fscrypt_policy_flags(&ci->ci_policy);
- int ino_bits = 64, lblk_bits = 64;
+ int dun_bits;
if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY)
return offsetofend(union fscrypt_iv, nonce);
@@ -54,10 +54,9 @@ static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci)
if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)
return sizeof(__le32);
- /* Default case: IVs are just the file logical block number */
- if (sb->s_cop->get_ino_and_lblk_bits)
- sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits);
- return DIV_ROUND_UP(lblk_bits, 8);
+ /* Default case: IVs are just the file data unit index */
+ dun_bits = fscrypt_max_file_dun_bits(sb, ci->ci_data_unit_bits);
+ return DIV_ROUND_UP(dun_bits, 8);
}
/*
@@ -90,7 +89,7 @@ static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode,
}
/* Enable inline encryption for this file if supported. */
-int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
+int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci)
{
const struct inode *inode = ci->ci_inode;
struct super_block *sb = inode->i_sb;
@@ -129,7 +128,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
* crypto configuration that the file would use.
*/
crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode;
- crypto_cfg.data_unit_size = sb->s_blocksize;
+ crypto_cfg.data_unit_size = 1U << ci->ci_data_unit_bits;
crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci);
devs = fscrypt_get_devices(sb, &num_devs);
@@ -152,7 +151,7 @@ out_free_devs:
int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
const u8 *raw_key,
- const struct fscrypt_info *ci)
+ const struct fscrypt_inode_info *ci)
{
const struct inode *inode = ci->ci_inode;
struct super_block *sb = inode->i_sb;
@@ -168,7 +167,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
return -ENOMEM;
err = blk_crypto_init_key(blk_key, raw_key, crypto_mode,
- fscrypt_get_dun_bytes(ci), sb->s_blocksize);
+ fscrypt_get_dun_bytes(ci),
+ 1U << ci->ci_data_unit_bits);
if (err) {
fscrypt_err(inode, "error %d initializing blk-crypto key", err);
goto fail;
@@ -232,13 +232,15 @@ bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode)
}
EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto);
-static void fscrypt_generate_dun(const struct fscrypt_info *ci, u64 lblk_num,
+static void fscrypt_generate_dun(const struct fscrypt_inode_info *ci,
+ u64 lblk_num,
u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE])
{
+ u64 index = lblk_num << ci->ci_data_units_per_block_bits;
union fscrypt_iv iv;
int i;
- fscrypt_generate_iv(&iv, lblk_num, ci);
+ fscrypt_generate_iv(&iv, index, ci);
BUILD_BUG_ON(FSCRYPT_MAX_IV_SIZE > BLK_CRYPTO_MAX_IV_SIZE);
memset(dun, 0, BLK_CRYPTO_MAX_IV_SIZE);
@@ -265,7 +267,7 @@ static void fscrypt_generate_dun(const struct fscrypt_info *ci, u64 lblk_num,
void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode,
u64 first_lblk, gfp_t gfp_mask)
{
- const struct fscrypt_info *ci;
+ const struct fscrypt_inode_info *ci;
u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
if (!fscrypt_inode_uses_inline_crypto(inode))
@@ -456,7 +458,7 @@ EXPORT_SYMBOL_GPL(fscrypt_dio_supported);
*/
u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks)
{
- const struct fscrypt_info *ci;
+ const struct fscrypt_inode_info *ci;
u32 dun;
if (!fscrypt_inode_uses_inline_crypto(inode))
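
For inline encryption the same data unit size now feeds blk-crypto: the DUN is the data unit index rather than the logical block number, so fscrypt_generate_dun() scales lblk_num by the cached data-units-per-block shift. With 4096-byte blocks and 1024-byte data units that shift is 2, so logical block 5 starts at DUN 20. A minimal sketch of the scaling, under those example sizes (hypothetical helper):

static u64 example_lblk_to_dun_index(const struct fscrypt_inode_info *ci,
				     u64 lblk_num)
{
	/* e.g. ci_data_units_per_block_bits == 2: block 5 -> index 20 */
	return lblk_num << ci->ci_data_units_per_block_bits;
}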
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 7cbb1fd872ac..f34a9b0b9e92 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -99,10 +99,10 @@ void fscrypt_put_master_key_activeref(struct super_block *sb,
spin_unlock(&sb->s_master_keys->lock);
/*
- * ->mk_active_refs == 0 implies that ->mk_secret is not present and
- * that ->mk_decrypted_inodes is empty.
+ * ->mk_active_refs == 0 implies that ->mk_present is false and
+ * ->mk_decrypted_inodes is empty.
*/
- WARN_ON_ONCE(is_master_key_secret_present(&mk->mk_secret));
+ WARN_ON_ONCE(mk->mk_present);
WARN_ON_ONCE(!list_empty(&mk->mk_decrypted_inodes));
for (i = 0; i <= FSCRYPT_MODE_MAX; i++) {
@@ -121,6 +121,18 @@ void fscrypt_put_master_key_activeref(struct super_block *sb,
fscrypt_put_master_key(mk);
}
+/*
+ * This transitions the key state from present to incompletely removed, and then
+ * potentially to absent (depending on whether inodes remain).
+ */
+static void fscrypt_initiate_key_removal(struct super_block *sb,
+ struct fscrypt_master_key *mk)
+{
+ WRITE_ONCE(mk->mk_present, false);
+ wipe_master_key_secret(&mk->mk_secret);
+ fscrypt_put_master_key_activeref(sb, mk);
+}
+
static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec)
{
if (spec->__reserved)
@@ -234,14 +246,13 @@ void fscrypt_destroy_keyring(struct super_block *sb)
* evicted, every key remaining in the keyring should
* have an empty inode list, and should only still be in
* the keyring due to the single active ref associated
- * with ->mk_secret. There should be no structural refs
- * beyond the one associated with the active ref.
+ * with ->mk_present. There should be no structural
+ * refs beyond the one associated with the active ref.
*/
WARN_ON_ONCE(refcount_read(&mk->mk_active_refs) != 1);
WARN_ON_ONCE(refcount_read(&mk->mk_struct_refs) != 1);
- WARN_ON_ONCE(!is_master_key_secret_present(&mk->mk_secret));
- wipe_master_key_secret(&mk->mk_secret);
- fscrypt_put_master_key_activeref(sb, mk);
+ WARN_ON_ONCE(!mk->mk_present);
+ fscrypt_initiate_key_removal(sb, mk);
}
}
kfree_sensitive(keyring);
@@ -439,7 +450,8 @@ static int add_new_master_key(struct super_block *sb,
}
move_master_key_secret(&mk->mk_secret, secret);
- refcount_set(&mk->mk_active_refs, 1); /* ->mk_secret is present */
+ mk->mk_present = true;
+ refcount_set(&mk->mk_active_refs, 1); /* ->mk_present is true */
spin_lock(&keyring->lock);
hlist_add_head_rcu(&mk->mk_node,
@@ -478,11 +490,18 @@ static int add_existing_master_key(struct fscrypt_master_key *mk,
return err;
}
- /* Re-add the secret if needed. */
- if (!is_master_key_secret_present(&mk->mk_secret)) {
- if (!refcount_inc_not_zero(&mk->mk_active_refs))
+ /* If the key is incompletely removed, make it present again. */
+ if (!mk->mk_present) {
+ if (!refcount_inc_not_zero(&mk->mk_active_refs)) {
+ /*
+ * Raced with the last active ref being dropped, so the
+ * key has become, or is about to become, "absent".
+ * Therefore, we need to allocate a new key struct.
+ */
return KEY_DEAD;
+ }
move_master_key_secret(&mk->mk_secret, secret);
+ WRITE_ONCE(mk->mk_present, true);
}
return 0;
@@ -506,8 +525,8 @@ static int do_add_master_key(struct super_block *sb,
err = add_new_master_key(sb, secret, mk_spec);
} else {
/*
- * Found the key in ->s_master_keys. Re-add the secret if
- * needed, and add the user to ->mk_users if needed.
+ * Found the key in ->s_master_keys. Add the user to ->mk_users
+ * if needed, and make the key "present" again if possible.
*/
down_write(&mk->mk_sem);
err = add_existing_master_key(mk, secret);
@@ -867,7 +886,7 @@ static void shrink_dcache_inode(struct inode *inode)
static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk)
{
- struct fscrypt_info *ci;
+ struct fscrypt_inode_info *ci;
struct inode *inode;
struct inode *toput_inode = NULL;
@@ -917,7 +936,7 @@ static int check_for_busy_inodes(struct super_block *sb,
/* select an example file to show for debugging purposes */
struct inode *inode =
list_first_entry(&mk->mk_decrypted_inodes,
- struct fscrypt_info,
+ struct fscrypt_inode_info,
ci_master_key_link)->ci_inode;
ino = inode->i_ino;
}
@@ -989,9 +1008,8 @@ static int try_to_lock_encrypted_files(struct super_block *sb,
*
* If all inodes were evicted, then we unlink the fscrypt_master_key from the
* keyring. Otherwise it remains in the keyring in the "incompletely removed"
- * state (without the actual secret key) where it tracks the list of remaining
- * inodes. Userspace can execute the ioctl again later to retry eviction, or
- * alternatively can re-add the secret key again.
+ * state where it tracks the list of remaining inodes. Userspace can execute
+ * the ioctl again later to retry eviction, or alternatively can re-add the key.
*
* For more details, see the "Removing keys" section of
* Documentation/filesystems/fscrypt.rst.
@@ -1053,11 +1071,10 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
}
}
- /* No user claims remaining. Go ahead and wipe the secret. */
+ /* No user claims remaining. Initiate removal of the key. */
err = -ENOKEY;
- if (is_master_key_secret_present(&mk->mk_secret)) {
- wipe_master_key_secret(&mk->mk_secret);
- fscrypt_put_master_key_activeref(sb, mk);
+ if (mk->mk_present) {
+ fscrypt_initiate_key_removal(sb, mk);
err = 0;
}
inodes_remain = refcount_read(&mk->mk_active_refs) > 0;
@@ -1074,9 +1091,9 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
}
/*
* We return 0 if we successfully did something: removed a claim to the
- * key, wiped the secret, or tried locking the files again. Users need
- * to check the informational status flags if they care whether the key
- * has been fully removed including all files locked.
+ * key, initiated removal of the key, or tried locking the files again.
+ * Users need to check the informational status flags if they care
+ * whether the key has been fully removed including all files locked.
*/
out_put_key:
fscrypt_put_master_key(mk);
@@ -1103,12 +1120,11 @@ EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key_all_users);
* Retrieve the status of an fscrypt master encryption key.
*
* We set ->status to indicate whether the key is absent, present, or
- * incompletely removed. "Incompletely removed" means that the master key
- * secret has been removed, but some files which had been unlocked with it are
- * still in use. This field allows applications to easily determine the state
- * of an encrypted directory without using a hack such as trying to open a
- * regular file in it (which can confuse the "incompletely removed" state with
- * absent or present).
+ * incompletely removed. (For an explanation of what these statuses mean and
+ * how they are represented internally, see struct fscrypt_master_key.) This
+ * field allows applications to easily determine the status of an encrypted
+ * directory without using a hack such as trying to open a regular file in it
+ * (which can confuse the "incompletely removed" status with absent or present).
*
* In addition, for v2 policy keys we allow applications to determine, via
* ->status_flags and ->user_count, whether the key has been added by the
@@ -1150,7 +1166,7 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg)
}
down_read(&mk->mk_sem);
- if (!is_master_key_secret_present(&mk->mk_secret)) {
+ if (!mk->mk_present) {
arg.status = refcount_read(&mk->mk_active_refs) > 0 ?
FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED :
FSCRYPT_KEY_STATUS_ABSENT /* raced with full removal */;
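
The keyring.c hunks above replace checks on the key secret with the new mk_present flag and centralize removal in fscrypt_initiate_key_removal(). The three key states then map onto the struct fields as in the status logic of the final hunk; a sketch of that mapping (the helper name is hypothetical):

static int example_key_status(const struct fscrypt_master_key *mk)
{
	if (READ_ONCE(mk->mk_present))
		return FSCRYPT_KEY_STATUS_PRESENT;
	if (refcount_read(&mk->mk_active_refs) > 0)
		/* secret wiped, but unlocked inodes still reference the key */
		return FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED;
	return FSCRYPT_KEY_STATUS_ABSENT;
}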
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 361f41ef46c7..d71f7c799e79 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -148,7 +148,7 @@ err_free_tfm:
* and IV generation method (@ci->ci_policy.flags).
*/
int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
- const u8 *raw_key, const struct fscrypt_info *ci)
+ const u8 *raw_key, const struct fscrypt_inode_info *ci)
{
struct crypto_skcipher *tfm;
@@ -178,13 +178,14 @@ void fscrypt_destroy_prepared_key(struct super_block *sb,
}
/* Given a per-file encryption key, set up the file's crypto transform object */
-int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key)
+int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci,
+ const u8 *raw_key)
{
ci->ci_owns_key = true;
return fscrypt_prepare_key(&ci->ci_enc_key, raw_key, ci);
}
-static int setup_per_mode_enc_key(struct fscrypt_info *ci,
+static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci,
struct fscrypt_master_key *mk,
struct fscrypt_prepared_key *keys,
u8 hkdf_context, bool include_fs_uuid)
@@ -265,7 +266,7 @@ static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk,
return 0;
}
-int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
+int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
const struct fscrypt_master_key *mk)
{
int err;
@@ -279,7 +280,7 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
return 0;
}
-void fscrypt_hash_inode_number(struct fscrypt_info *ci,
+void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci,
const struct fscrypt_master_key *mk)
{
WARN_ON_ONCE(ci->ci_inode->i_ino == 0);
@@ -289,7 +290,7 @@ void fscrypt_hash_inode_number(struct fscrypt_info *ci,
&mk->mk_ino_hash_key);
}
-static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci,
+static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_inode_info *ci,
struct fscrypt_master_key *mk)
{
int err;
@@ -329,7 +330,7 @@ unlock:
return 0;
}
-static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
+static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci,
struct fscrypt_master_key *mk,
bool need_dirhash_key)
{
@@ -404,7 +405,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
* still allow 512-bit master keys if the user chooses to use them, though.)
*/
static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk,
- const struct fscrypt_info *ci)
+ const struct fscrypt_inode_info *ci)
{
unsigned int min_keysize;
@@ -430,11 +431,12 @@ static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk,
*
* If the master key is found in the filesystem-level keyring, then it is
* returned in *mk_ret with its semaphore read-locked. This is needed to ensure
- * that only one task links the fscrypt_info into ->mk_decrypted_inodes (as
- * multiple tasks may race to create an fscrypt_info for the same inode), and to
- * synchronize the master key being removed with a new inode starting to use it.
+ * that only one task links the fscrypt_inode_info into ->mk_decrypted_inodes
+ * (as multiple tasks may race to create an fscrypt_inode_info for the same
+ * inode), and to synchronize the master key being removed with a new inode
+ * starting to use it.
*/
-static int setup_file_encryption_key(struct fscrypt_info *ci,
+static int setup_file_encryption_key(struct fscrypt_inode_info *ci,
bool need_dirhash_key,
struct fscrypt_master_key **mk_ret)
{
@@ -484,8 +486,8 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
}
down_read(&mk->mk_sem);
- /* Has the secret been removed (via FS_IOC_REMOVE_ENCRYPTION_KEY)? */
- if (!is_master_key_secret_present(&mk->mk_secret)) {
+ if (!mk->mk_present) {
+ /* FS_IOC_REMOVE_ENCRYPTION_KEY has been executed on this key */
err = -ENOKEY;
goto out_release_key;
}
@@ -519,7 +521,7 @@ out_release_key:
return err;
}
-static void put_crypt_info(struct fscrypt_info *ci)
+static void put_crypt_info(struct fscrypt_inode_info *ci)
{
struct fscrypt_master_key *mk;
@@ -537,8 +539,8 @@ static void put_crypt_info(struct fscrypt_info *ci)
/*
* Remove this inode from the list of inodes that were unlocked
* with the master key. In addition, if we're removing the last
- * inode from a master key struct that already had its secret
- * removed, then complete the full removal of the struct.
+ * inode from an incompletely removed key, then complete the
+ * full removal of the key.
*/
spin_lock(&mk->mk_decrypted_inodes_lock);
list_del(&ci->ci_master_key_link);
@@ -546,7 +548,7 @@ static void put_crypt_info(struct fscrypt_info *ci)
fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk);
}
memzero_explicit(ci, sizeof(*ci));
- kmem_cache_free(fscrypt_info_cachep, ci);
+ kmem_cache_free(fscrypt_inode_info_cachep, ci);
}
static int
@@ -555,7 +557,7 @@ fscrypt_setup_encryption_info(struct inode *inode,
const u8 nonce[FSCRYPT_FILE_NONCE_SIZE],
bool need_dirhash_key)
{
- struct fscrypt_info *crypt_info;
+ struct fscrypt_inode_info *crypt_info;
struct fscrypt_mode *mode;
struct fscrypt_master_key *mk = NULL;
int res;
@@ -564,7 +566,7 @@ fscrypt_setup_encryption_info(struct inode *inode,
if (res)
return res;
- crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_KERNEL);
+ crypt_info = kmem_cache_zalloc(fscrypt_inode_info_cachep, GFP_KERNEL);
if (!crypt_info)
return -ENOMEM;
@@ -580,6 +582,11 @@ fscrypt_setup_encryption_info(struct inode *inode,
WARN_ON_ONCE(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
crypt_info->ci_mode = mode;
+ crypt_info->ci_data_unit_bits =
+ fscrypt_policy_du_bits(&crypt_info->ci_policy, inode);
+ crypt_info->ci_data_units_per_block_bits =
+ inode->i_blkbits - crypt_info->ci_data_unit_bits;
+
res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk);
if (res)
goto out;
@@ -587,8 +594,8 @@ fscrypt_setup_encryption_info(struct inode *inode,
/*
* For existing inodes, multiple tasks may race to set ->i_crypt_info.
* So use cmpxchg_release(). This pairs with the smp_load_acquire() in
- * fscrypt_get_info(). I.e., here we publish ->i_crypt_info with a
- * RELEASE barrier so that other tasks can ACQUIRE it.
+ * fscrypt_get_inode_info(). I.e., here we publish ->i_crypt_info with
+ * a RELEASE barrier so that other tasks can ACQUIRE it.
*/
if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) {
/*
@@ -735,8 +742,8 @@ EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode);
* fscrypt_put_encryption_info() - free most of an inode's fscrypt data
* @inode: an inode being evicted
*
- * Free the inode's fscrypt_info. Filesystems must call this when the inode is
- * being evicted. An RCU grace period need not have elapsed yet.
+ * Free the inode's fscrypt_inode_info. Filesystems must call this when the
+ * inode is being evicted. An RCU grace period need not have elapsed yet.
*/
void fscrypt_put_encryption_info(struct inode *inode)
{
@@ -773,7 +780,7 @@ EXPORT_SYMBOL(fscrypt_free_inode);
*/
int fscrypt_drop_inode(struct inode *inode)
{
- const struct fscrypt_info *ci = fscrypt_get_info(inode);
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info(inode);
/*
* If ci is NULL, then the inode doesn't have an encryption key set up
@@ -794,13 +801,14 @@ int fscrypt_drop_inode(struct inode *inode)
return 0;
/*
- * Note: since we aren't holding the key semaphore, the result here can
+ * We can't take ->mk_sem here, since this runs in atomic context.
+ * Therefore, ->mk_present can change concurrently, and our result may
* immediately become outdated. But there's no correctness problem with
* unnecessarily evicting. Nor is there a correctness problem with not
* evicting while iput() is racing with the key being removed, since
* then the thread removing the key will either evict the inode itself
* or will correctly detect that it wasn't evicted due to the race.
*/
- return !is_master_key_secret_present(&ci->ci_master_key->mk_secret);
+ return !READ_ONCE(ci->ci_master_key->mk_present);
}
EXPORT_SYMBOL_GPL(fscrypt_drop_inode);
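
The keysetup.c comment above describes how ->i_crypt_info is published: the writer uses cmpxchg_release() and readers pair with it through an acquire load in fscrypt_get_inode_info(). A minimal sketch of the reader side, assuming fscrypt_get_inode_info() is simply the renamed acquire-load accessor referred to in that hunk:

static struct fscrypt_inode_info *
example_get_inode_info(const struct inode *inode)
{
	/* pairs with the cmpxchg_release() in fscrypt_setup_encryption_info():
	 * once the pointer is observed, the fully initialized struct is too */
	return smp_load_acquire(&inode->i_crypt_info);
}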
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index 75dabd9b27f9..cf3b58ec32cc 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -20,8 +20,8 @@
* managed alongside the master keys in the filesystem-level keyring)
*/
-#include <crypto/algapi.h>
#include <crypto/skcipher.h>
+#include <crypto/utils.h>
#include <keys/user-type.h>
#include <linux/hashtable.h>
#include <linux/scatterlist.h>
@@ -178,7 +178,8 @@ void fscrypt_put_direct_key(struct fscrypt_direct_key *dk)
*/
static struct fscrypt_direct_key *
find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
- const u8 *raw_key, const struct fscrypt_info *ci)
+ const u8 *raw_key,
+ const struct fscrypt_inode_info *ci)
{
unsigned long hash_key;
struct fscrypt_direct_key *dk;
@@ -218,7 +219,7 @@ find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
/* Prepare to encrypt directly using the master key in the given mode */
static struct fscrypt_direct_key *
-fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key)
+fscrypt_get_direct_key(const struct fscrypt_inode_info *ci, const u8 *raw_key)
{
struct fscrypt_direct_key *dk;
int err;
@@ -250,7 +251,7 @@ err_free_dk:
}
/* v1 policy, DIRECT_KEY: use the master key directly */
-static int setup_v1_file_key_direct(struct fscrypt_info *ci,
+static int setup_v1_file_key_direct(struct fscrypt_inode_info *ci,
const u8 *raw_master_key)
{
struct fscrypt_direct_key *dk;
@@ -264,7 +265,7 @@ static int setup_v1_file_key_direct(struct fscrypt_info *ci,
}
/* v1 policy, !DIRECT_KEY: derive the file's encryption key */
-static int setup_v1_file_key_derived(struct fscrypt_info *ci,
+static int setup_v1_file_key_derived(struct fscrypt_inode_info *ci,
const u8 *raw_master_key)
{
u8 *derived_key;
@@ -289,7 +290,8 @@ out:
return err;
}
-int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key)
+int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci,
+ const u8 *raw_master_key)
{
if (ci->ci_policy.v1.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY)
return setup_v1_file_key_direct(ci, raw_master_key);
@@ -297,8 +299,10 @@ int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key)
return setup_v1_file_key_derived(ci, raw_master_key);
}
-int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci)
+int
+fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_inode_info *ci)
{
+ const struct super_block *sb = ci->ci_inode->i_sb;
struct key *key;
const struct fscrypt_key *payload;
int err;
@@ -306,8 +310,8 @@ int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci)
key = find_and_lock_process_key(FSCRYPT_KEY_DESC_PREFIX,
ci->ci_policy.v1.master_key_descriptor,
ci->ci_mode->keysize, &payload);
- if (key == ERR_PTR(-ENOKEY) && ci->ci_inode->i_sb->s_cop->key_prefix) {
- key = find_and_lock_process_key(ci->ci_inode->i_sb->s_cop->key_prefix,
+ if (key == ERR_PTR(-ENOKEY) && sb->s_cop->legacy_key_prefix) {
+ key = find_and_lock_process_key(sb->s_cop->legacy_key_prefix,
ci->ci_policy.v1.master_key_descriptor,
ci->ci_mode->keysize, &payload);
}
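
The keysetup_v1 change above routes the legacy key-description prefix through the new s_cop->legacy_key_prefix hook instead of an open-coded field access. A minimal userspace sketch of the same lookup-with-fallback pattern; find_key() and the prefixes below are stand-ins, not kernel APIs:

#include <stdio.h>
#include <string.h>

/* pretend only the old fs-specific description exists in the keyring */
static const char *find_key(const char *desc)
{
	return strcmp(desc, "ext4:0123456789abcdef") == 0 ? "found" : NULL;
}

static const char *lookup(const char *descriptor, const char *legacy_prefix)
{
	char desc[64];
	const char *key;

	snprintf(desc, sizeof(desc), "fscrypt:%s", descriptor);
	key = find_key(desc);
	if (!key && legacy_prefix) {	/* fall back to the fs-specific prefix */
		snprintf(desc, sizeof(desc), "%s%s", legacy_prefix, descriptor);
		key = find_key(desc);
	}
	return key;
}

int main(void)
{
	printf("%s\n", lookup("0123456789abcdef", "ext4:") ? "found" : "missing");
	return 0;
}
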
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index f4456ecb3f87..701259991277 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -118,12 +118,11 @@ static bool supported_direct_key_modes(const struct inode *inode,
}
static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy,
- const struct inode *inode,
- const char *type,
- int max_ino_bits, int max_lblk_bits)
+ const struct inode *inode)
{
+ const char *type = (policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64)
+ ? "IV_INO_LBLK_64" : "IV_INO_LBLK_32";
struct super_block *sb = inode->i_sb;
- int ino_bits = 64, lblk_bits = 64;
/*
* IV_INO_LBLK_* exist only because of hardware limitations, and
@@ -150,17 +149,29 @@ static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy,
type, sb->s_id);
return false;
}
- if (sb->s_cop->get_ino_and_lblk_bits)
- sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits);
- if (ino_bits > max_ino_bits) {
+
+ /*
+ * IV_INO_LBLK_64 and IV_INO_LBLK_32 both require that inode numbers fit
+ * in 32 bits. In principle, IV_INO_LBLK_32 could support longer inode
+ * numbers because it hashes the inode number; however, currently the
+	 * inode number is taken from inode::i_ino, which is 'unsigned long'.
+ * So for now the implementation limit is 32 bits.
+ */
+ if (!sb->s_cop->has_32bit_inodes) {
fscrypt_warn(inode,
"Can't use %s policy on filesystem '%s' because its inode numbers are too long",
type, sb->s_id);
return false;
}
- if (lblk_bits > max_lblk_bits) {
+
+ /*
+ * IV_INO_LBLK_64 and IV_INO_LBLK_32 both require that file data unit
+ * indices fit in 32 bits.
+ */
+ if (fscrypt_max_file_dun_bits(sb,
+ fscrypt_policy_v2_du_bits(policy, inode)) > 32) {
fscrypt_warn(inode,
- "Can't use %s policy on filesystem '%s' because its block numbers are too long",
+ "Can't use %s policy on filesystem '%s' because its maximum file size is too large",
type, sb->s_id);
return false;
}
@@ -233,25 +244,39 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy,
return false;
}
+ if (policy->log2_data_unit_size) {
+ if (!inode->i_sb->s_cop->supports_subblock_data_units) {
+ fscrypt_warn(inode,
+ "Filesystem does not support configuring crypto data unit size");
+ return false;
+ }
+ if (policy->log2_data_unit_size > inode->i_blkbits ||
+ policy->log2_data_unit_size < SECTOR_SHIFT /* 9 */) {
+ fscrypt_warn(inode,
+ "Unsupported log2_data_unit_size in encryption policy: %d",
+ policy->log2_data_unit_size);
+ return false;
+ }
+ if (policy->log2_data_unit_size != inode->i_blkbits &&
+ (policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) {
+ /*
+ * Not safe to enable yet, as we need to ensure that DUN
+ * wraparound can only occur on a FS block boundary.
+ */
+ fscrypt_warn(inode,
+ "Sub-block data units not yet supported with IV_INO_LBLK_32");
+ return false;
+ }
+ }
+
if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) &&
!supported_direct_key_modes(inode, policy->contents_encryption_mode,
policy->filenames_encryption_mode))
return false;
- if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) &&
- !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_64",
- 32, 32))
- return false;
-
- /*
- * IV_INO_LBLK_32 hashes the inode number, so in principle it can
- * support any ino_bits. However, currently the inode number is gotten
- * from inode::i_ino which is 'unsigned long'. So for now the
- * implementation limit is 32 bits.
- */
- if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) &&
- !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32",
- 32, 32))
+ if ((policy->flags & (FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 |
+ FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) &&
+ !supported_iv_ino_lblk_policy(policy, inode))
return false;
if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) {
@@ -330,6 +355,7 @@ static int fscrypt_new_context(union fscrypt_context *ctx_u,
ctx->filenames_encryption_mode =
policy->filenames_encryption_mode;
ctx->flags = policy->flags;
+ ctx->log2_data_unit_size = policy->log2_data_unit_size;
memcpy(ctx->master_key_identifier,
policy->master_key_identifier,
sizeof(ctx->master_key_identifier));
@@ -390,6 +416,7 @@ int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
policy->filenames_encryption_mode =
ctx->filenames_encryption_mode;
policy->flags = ctx->flags;
+ policy->log2_data_unit_size = ctx->log2_data_unit_size;
memcpy(policy->__reserved, ctx->__reserved,
sizeof(policy->__reserved));
memcpy(policy->master_key_identifier,
@@ -405,11 +432,11 @@ int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
/* Retrieve an inode's encryption policy */
static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy)
{
- const struct fscrypt_info *ci;
+ const struct fscrypt_inode_info *ci;
union fscrypt_context ctx;
int ret;
- ci = fscrypt_get_info(inode);
+ ci = fscrypt_get_inode_info(inode);
if (ci) {
/* key available, use the cached policy */
*policy = ci->ci_policy;
@@ -647,7 +674,7 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
/*
* Both parent and child are encrypted, so verify they use the same
- * encryption policy. Compare the fscrypt_info structs if the keys are
+ * encryption policy. Compare the cached policies if the keys are
* available, otherwise retrieve and compare the fscrypt_contexts.
*
* Note that the fscrypt_context retrieval will be required frequently
@@ -717,7 +744,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
*/
int fscrypt_context_for_new_inode(void *ctx, struct inode *inode)
{
- struct fscrypt_info *ci = inode->i_crypt_info;
+ struct fscrypt_inode_info *ci = inode->i_crypt_info;
BUILD_BUG_ON(sizeof(union fscrypt_context) !=
FSCRYPT_SET_CONTEXT_MAX_SIZE);
@@ -742,7 +769,7 @@ EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode);
*/
int fscrypt_set_context(struct inode *inode, void *fs_data)
{
- struct fscrypt_info *ci = inode->i_crypt_info;
+ struct fscrypt_inode_info *ci = inode->i_crypt_info;
union fscrypt_context ctx;
int ctxsize;
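
The policy.c hunks above accept a log2_data_unit_size between SECTOR_SHIFT and the filesystem block size, and refuse sub-block data units when IV_INO_LBLK_32 is set. A self-contained sketch of just those range checks (assuming a 4K block size in the example calls, not the kernel helper itself):

#include <stdbool.h>
#include <stdio.h>

#define SECTOR_SHIFT 9

static bool du_size_ok(unsigned int log2_dus, unsigned int blkbits,
		       bool iv_ino_lblk_32)
{
	if (!log2_dus)			/* 0 = default: one data unit per block */
		return true;
	if (log2_dus < SECTOR_SHIFT || log2_dus > blkbits)
		return false;		/* must be 512 bytes .. FS block size */
	if (iv_ino_lblk_32 && log2_dus != blkbits)
		return false;		/* sub-block DUs not allowed here yet */
	return true;
}

int main(void)
{
	printf("%d %d %d\n",
	       du_size_ok(9, 12, false),	/* 1: 512-byte DUs on 4K blocks */
	       du_size_ok(8, 12, false),	/* 0: below the 512-byte floor */
	       du_size_ok(9, 12, true));	/* 0: IV_INO_LBLK_32 restriction */
	return 0;
}
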
diff --git a/fs/dax.c b/fs/dax.c
index 8fafecbe42b1..3380b43cb6bb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -412,23 +412,23 @@ static struct page *dax_busy_page(void *entry)
return NULL;
}
-/*
- * dax_lock_page - Lock the DAX entry corresponding to a page
- * @page: The page whose entry we want to lock
+/**
+ * dax_lock_folio - Lock the DAX entry corresponding to a folio
+ * @folio: The folio whose entry we want to lock
*
* Context: Process context.
- * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
+ * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could
* not be locked.
*/
-dax_entry_t dax_lock_page(struct page *page)
+dax_entry_t dax_lock_folio(struct folio *folio)
{
XA_STATE(xas, NULL, 0);
void *entry;
- /* Ensure page->mapping isn't freed while we look at it */
+ /* Ensure folio->mapping isn't freed while we look at it */
rcu_read_lock();
for (;;) {
- struct address_space *mapping = READ_ONCE(page->mapping);
+ struct address_space *mapping = READ_ONCE(folio->mapping);
entry = NULL;
if (!mapping || !dax_mapping(mapping))
@@ -447,11 +447,11 @@ dax_entry_t dax_lock_page(struct page *page)
xas.xa = &mapping->i_pages;
xas_lock_irq(&xas);
- if (mapping != page->mapping) {
+ if (mapping != folio->mapping) {
xas_unlock_irq(&xas);
continue;
}
- xas_set(&xas, page->index);
+ xas_set(&xas, folio->index);
entry = xas_load(&xas);
if (dax_is_locked(entry)) {
rcu_read_unlock();
@@ -467,10 +467,10 @@ dax_entry_t dax_lock_page(struct page *page)
return (dax_entry_t)entry;
}
-void dax_unlock_page(struct page *page, dax_entry_t cookie)
+void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
{
- struct address_space *mapping = page->mapping;
- XA_STATE(xas, &mapping->i_pages, page->index);
+ struct address_space *mapping = folio->mapping;
+ XA_STATE(xas, &mapping->i_pages, folio->index);
if (S_ISCHR(mapping->host->i_mode))
return;
diff --git a/fs/dcache.c b/fs/dcache.c
index 25ac74d30bff..c82ae731df9a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -78,7 +78,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
EXPORT_SYMBOL(rename_lock);
-static struct kmem_cache *dentry_cache __read_mostly;
+static struct kmem_cache *dentry_cache __ro_after_init;
const struct qstr empty_name = QSTR_INIT("", 0);
EXPORT_SYMBOL(empty_name);
@@ -96,9 +96,9 @@ EXPORT_SYMBOL(dotdot_name);
* information, yet avoid using a prime hash-size or similar.
*/
-static unsigned int d_hash_shift __read_mostly;
+static unsigned int d_hash_shift __ro_after_init;
-static struct hlist_bl_head *dentry_hashtable __read_mostly;
+static struct hlist_bl_head *dentry_hashtable __ro_after_init;
static inline struct hlist_bl_head *d_hash(unsigned int hash)
{
@@ -3246,11 +3246,10 @@ void d_genocide(struct dentry *parent)
d_walk(parent, parent, d_genocide_kill);
}
-void d_tmpfile(struct file *file, struct inode *inode)
+void d_mark_tmpfile(struct file *file, struct inode *inode)
{
struct dentry *dentry = file->f_path.dentry;
- inode_dec_link_count(inode);
BUG_ON(dentry->d_name.name != dentry->d_iname ||
!hlist_unhashed(&dentry->d_u.d_alias) ||
!d_unlinked(dentry));
@@ -3260,6 +3259,15 @@ void d_tmpfile(struct file *file, struct inode *inode)
(unsigned long long)inode->i_ino);
spin_unlock(&dentry->d_lock);
spin_unlock(&dentry->d_parent->d_lock);
+}
+EXPORT_SYMBOL(d_mark_tmpfile);
+
+void d_tmpfile(struct file *file, struct inode *inode)
+{
+ struct dentry *dentry = file->f_path.dentry;
+
+ inode_dec_link_count(inode);
+ d_mark_tmpfile(file, inode);
d_instantiate(dentry, inode);
}
EXPORT_SYMBOL(d_tmpfile);
@@ -3324,7 +3332,7 @@ static void __init dcache_init(void)
}
/* SLAB cache for __getname() consumers */
-struct kmem_cache *names_cachep __read_mostly;
+struct kmem_cache *names_cachep __ro_after_init;
EXPORT_SYMBOL(names_cachep);
void __init vfs_caches_init_early(void)
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 87b3753aa4b1..6d7c1a49581f 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -84,6 +84,14 @@ int debugfs_file_get(struct dentry *dentry)
struct debugfs_fsdata *fsd;
void *d_fsd;
+ /*
+ * This could only happen if some debugfs user erroneously calls
+	 * debugfs_file_get() on a dentry that isn't even a file; let
+	 * them know about it.
+ */
+ if (WARN_ON(!d_is_reg(dentry)))
+ return -EINVAL;
+
d_fsd = READ_ONCE(dentry->d_fsdata);
if (!((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) {
fsd = d_fsd;
@@ -96,7 +104,11 @@ int debugfs_file_get(struct dentry *dentry)
~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT);
refcount_set(&fsd->active_users, 1);
init_completion(&fsd->active_users_drained);
+ INIT_LIST_HEAD(&fsd->cancellations);
+ mutex_init(&fsd->cancellations_mtx);
+
if (cmpxchg(&dentry->d_fsdata, d_fsd, fsd) != d_fsd) {
+ mutex_destroy(&fsd->cancellations_mtx);
kfree(fsd);
fsd = READ_ONCE(dentry->d_fsdata);
}
@@ -138,6 +150,86 @@ void debugfs_file_put(struct dentry *dentry)
}
EXPORT_SYMBOL_GPL(debugfs_file_put);
+/**
+ * debugfs_enter_cancellation - enter a debugfs cancellation
+ * @file: the file being accessed
+ * @cancellation: the cancellation object, the cancel callback
+ * inside of it must be initialized
+ *
+ * When a debugfs file is removed it needs to wait for all active
+ * operations to complete. However, the operation itself may need
+ * to wait for hardware or completion of some asynchronous process
+ * or similar. As such, it may need to be cancelled to avoid long
+ * waits or even deadlocks.
+ *
+ * This function can be used inside a debugfs handler that may
+ * need to be cancelled. As soon as this function is called, the
+ * cancellation's 'cancel' callback may be called, at which point
+ * the caller should proceed to call debugfs_leave_cancellation()
+ * and leave the debugfs handler function as soon as possible.
+ * Note that the 'cancel' callback is only ever called in the
+ * context of some kind of debugfs_remove().
+ *
+ * This function must be paired with debugfs_leave_cancellation().
+ */
+void debugfs_enter_cancellation(struct file *file,
+ struct debugfs_cancellation *cancellation)
+{
+ struct debugfs_fsdata *fsd;
+ struct dentry *dentry = F_DENTRY(file);
+
+ INIT_LIST_HEAD(&cancellation->list);
+
+ if (WARN_ON(!d_is_reg(dentry)))
+ return;
+
+ if (WARN_ON(!cancellation->cancel))
+ return;
+
+ fsd = READ_ONCE(dentry->d_fsdata);
+ if (WARN_ON(!fsd ||
+ ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)))
+ return;
+
+ mutex_lock(&fsd->cancellations_mtx);
+ list_add(&cancellation->list, &fsd->cancellations);
+ mutex_unlock(&fsd->cancellations_mtx);
+
+	/* if we're already removing, wake it up to cancel */
+ if (d_unlinked(dentry))
+ complete(&fsd->active_users_drained);
+}
+EXPORT_SYMBOL_GPL(debugfs_enter_cancellation);
+
+/**
+ * debugfs_leave_cancellation - leave cancellation section
+ * @file: the file being accessed
+ * @cancellation: the cancellation previously registered with
+ * debugfs_enter_cancellation()
+ *
+ * See the documentation of debugfs_enter_cancellation().
+ */
+void debugfs_leave_cancellation(struct file *file,
+ struct debugfs_cancellation *cancellation)
+{
+ struct debugfs_fsdata *fsd;
+ struct dentry *dentry = F_DENTRY(file);
+
+ if (WARN_ON(!d_is_reg(dentry)))
+ return;
+
+ fsd = READ_ONCE(dentry->d_fsdata);
+ if (WARN_ON(!fsd ||
+ ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)))
+ return;
+
+ mutex_lock(&fsd->cancellations_mtx);
+ if (!list_empty(&cancellation->list))
+ list_del(&cancellation->list);
+ mutex_unlock(&fsd->cancellations_mtx);
+}
+EXPORT_SYMBOL_GPL(debugfs_leave_cancellation);
+
/*
* Only permit access to world-readable files when the kernel is locked down.
* We also need to exclude any file that has ways to write or alter it as root
@@ -939,7 +1031,7 @@ static ssize_t debugfs_write_file_str(struct file *file, const char __user *user
new[pos + count] = '\0';
strim(new);
- rcu_assign_pointer(*(char **)file->private_data, new);
+ rcu_assign_pointer(*(char __rcu **)file->private_data, new);
synchronize_rcu();
kfree(old);
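
The new debugfs cancellation API above keeps per-file cancellation objects on a mutex-protected list so a pending remove can invoke their cancel callbacks. A single-threaded userspace model of that enter/cancel/leave flow, with no locking and purely illustrative types:

#include <stdio.h>

struct cancellation {
	void (*cancel)(void *data);
	void *data;
	struct cancellation *next;
};

static struct cancellation *pending;	/* stands in for fsd->cancellations */

static void enter_cancellation(struct cancellation *c)
{
	c->next = pending;
	pending = c;
}

static void leave_cancellation(struct cancellation *c)
{
	struct cancellation **p;

	for (p = &pending; *p; p = &(*p)->next) {
		if (*p == c) {
			*p = c->next;
			break;
		}
	}
}

static void cancel_all(void)	/* what the remover does while draining users */
{
	while (pending) {
		struct cancellation *c = pending;

		pending = c->next;
		c->cancel(c->data);
	}
}

static void my_cancel(void *data)
{
	printf("cancelled %s\n", (const char *)data);
}

int main(void)
{
	struct cancellation c = { .cancel = my_cancel, .data = "read" };

	enter_cancellation(&c);
	cancel_all();		/* would normally come from debugfs_remove() */
	leave_cancellation(&c);	/* safe: already removed from the list */
	return 0;
}
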
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 83e57e9f9fa0..034a617cb1a5 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -72,7 +72,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb)
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
return inode;
}
@@ -236,17 +236,25 @@ static const struct super_operations debugfs_super_operations = {
static void debugfs_release_dentry(struct dentry *dentry)
{
- void *fsd = dentry->d_fsdata;
+ struct debugfs_fsdata *fsd = dentry->d_fsdata;
- if (!((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))
- kfree(dentry->d_fsdata);
+ if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)
+ return;
+
+ /* check it wasn't a dir (no fsdata) or automount (no real_fops) */
+ if (fsd && fsd->real_fops) {
+ WARN_ON(!list_empty(&fsd->cancellations));
+ mutex_destroy(&fsd->cancellations_mtx);
+ }
+
+ kfree(fsd);
}
static struct vfsmount *debugfs_automount(struct path *path)
{
- debugfs_automount_t f;
- f = (debugfs_automount_t)path->dentry->d_fsdata;
- return f(path->dentry, d_inode(path->dentry)->i_private);
+ struct debugfs_fsdata *fsd = path->dentry->d_fsdata;
+
+ return fsd->automount(path->dentry, d_inode(path->dentry)->i_private);
}
static const struct dentry_operations debugfs_dops = {
@@ -634,13 +642,23 @@ struct dentry *debugfs_create_automount(const char *name,
void *data)
{
struct dentry *dentry = start_creating(name, parent);
+ struct debugfs_fsdata *fsd;
struct inode *inode;
if (IS_ERR(dentry))
return dentry;
+ fsd = kzalloc(sizeof(*fsd), GFP_KERNEL);
+ if (!fsd) {
+ failed_creating(dentry);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ fsd->automount = f;
+
if (!(debugfs_allow & DEBUGFS_ALLOW_API)) {
failed_creating(dentry);
+ kfree(fsd);
return ERR_PTR(-EPERM);
}
@@ -648,13 +666,14 @@ struct dentry *debugfs_create_automount(const char *name,
if (unlikely(!inode)) {
pr_err("out of free dentries, can not create automount '%s'\n",
name);
+ kfree(fsd);
return failed_creating(dentry);
}
make_empty_dir_inode(inode);
inode->i_flags |= S_AUTOMOUNT;
inode->i_private = data;
- dentry->d_fsdata = (void *)f;
+ dentry->d_fsdata = fsd;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
d_instantiate(dentry, inode);
@@ -731,8 +750,37 @@ static void __debugfs_file_removed(struct dentry *dentry)
fsd = READ_ONCE(dentry->d_fsdata);
if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)
return;
- if (!refcount_dec_and_test(&fsd->active_users))
+
+ /* if we hit zero, just wait for all to finish */
+ if (!refcount_dec_and_test(&fsd->active_users)) {
wait_for_completion(&fsd->active_users_drained);
+ return;
+ }
+
+ /* if we didn't hit zero, try to cancel any we can */
+ while (refcount_read(&fsd->active_users)) {
+ struct debugfs_cancellation *c;
+
+ /*
+ * Lock the cancellations. Note that the cancellations
+ * structs are meant to be on the stack, so we need to
+ * ensure we either use them here or don't touch them,
+ * and debugfs_leave_cancellation() will wait for this
+ * to be finished processing before exiting one. It may
+ * of course win and remove the cancellation, but then
+	 * chances are we never even got into this bit; we only
+	 * do so if the refcount isn't already zero.
+ */
+ mutex_lock(&fsd->cancellations_mtx);
+ while ((c = list_first_entry_or_null(&fsd->cancellations,
+ typeof(*c), list))) {
+ list_del_init(&c->list);
+ c->cancel(dentry, c->cancel_data);
+ }
+ mutex_unlock(&fsd->cancellations_mtx);
+
+ wait_for_completion(&fsd->active_users_drained);
+ }
}
static void remove_one(struct dentry *victim)
diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h
index 92af8ae31313..dae80c2a469e 100644
--- a/fs/debugfs/internal.h
+++ b/fs/debugfs/internal.h
@@ -7,6 +7,7 @@
#ifndef _DEBUGFS_INTERNAL_H_
#define _DEBUGFS_INTERNAL_H_
+#include <linux/list.h>
struct file_operations;
@@ -17,8 +18,18 @@ extern const struct file_operations debugfs_full_proxy_file_operations;
struct debugfs_fsdata {
const struct file_operations *real_fops;
- refcount_t active_users;
- struct completion active_users_drained;
+ union {
+ /* automount_fn is used when real_fops is NULL */
+ debugfs_automount_t automount;
+ struct {
+ refcount_t active_users;
+ struct completion active_users_drained;
+
+ /* protect cancellations */
+ struct mutex cancellations_mtx;
+ struct list_head cancellations;
+ };
+ };
};
/*
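
The internal.h change above overlays the automount callback and the per-file bookkeeping in one union, discriminated by whether real_fops is set. A cut-down standalone model of that discrimination; nothing here is the real debugfs structure:

#include <stdio.h>

struct fsdata {
	const void *real_fops;
	union {
		void (*automount)(void);	/* valid when real_fops == NULL */
		struct {
			int active_users;	/* valid for regular files */
		};
	};
};

static void automount_cb(void) { }

static void show(const struct fsdata *fsd)
{
	if (!fsd->real_fops)
		printf("automount entry, callback %s\n",
		       fsd->automount ? "set" : "unset");
	else
		printf("file entry, %d active users\n", fsd->active_users);
}

int main(void)
{
	struct fsdata automnt = { 0 }, file = { 0 };

	automnt.automount = automount_cb;
	file.real_fops = "real fops";
	file.active_users = 1;

	show(&automnt);
	show(&file);
	return 0;
}
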
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 299c295a27a0..c830261aa883 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -338,7 +338,7 @@ static int mknod_ptmx(struct super_block *sb)
}
inode->i_ino = 2;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
mode = S_IFCHR|opts->ptmxmode;
init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
@@ -451,7 +451,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
if (!inode)
goto fail;
inode->i_ino = 1;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
@@ -560,7 +560,7 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv)
inode->i_ino = index + 3;
inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
init_special_inode(inode, S_IFCHR|opts->mode, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index));
sprintf(s, "%d", index);
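
Several hunks above (debugfs, devpts, efivarfs) replace the open-coded atime/mtime/ctime initialization with simple_inode_init_ts(). Roughly, that helper stamps one current time and uses it for all three fields; a toy userspace equivalent, assuming a plain struct with three timespecs rather than the kernel inode:

#include <stdio.h>
#include <time.h>

struct toy_inode {
	struct timespec atime, mtime, ctime;
};

/* illustrative stand-in for simple_inode_init_ts() */
static void toy_inode_init_ts(struct toy_inode *inode)
{
	struct timespec now;

	clock_gettime(CLOCK_REALTIME, &now);
	inode->atime = now;
	inode->mtime = now;
	inode->ctime = now;
}

int main(void)
{
	struct toy_inode inode;

	toy_inode_init_ts(&inode);
	printf("ctime: %lld\n", (long long)inode.ctime.tv_sec);
	return 0;
}
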
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7bc494ee56b9..20533266ade6 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -151,7 +151,7 @@ struct dio {
};
} ____cacheline_aligned_in_smp;
-static struct kmem_cache *dio_cache __read_mostly;
+static struct kmem_cache *dio_cache __ro_after_init;
/*
* How many pages are in the queue?
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 5aabcb6f0f15..42f332f46359 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -973,7 +973,8 @@ void dlm_delete_debug_comms_file(void *ctx)
void dlm_create_debug_file(struct dlm_ls *ls)
{
- char name[DLM_LOCKSPACE_LEN + 8];
+ /* Reserve enough space for the longest file name */
+ char name[DLM_LOCKSPACE_LEN + sizeof("_queued_asts")];
/* format 1 */
@@ -985,8 +986,7 @@ void dlm_create_debug_file(struct dlm_ls *ls)
/* format 2 */
- memset(name, 0, sizeof(name));
- snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_locks", ls->ls_name);
+ snprintf(name, sizeof(name), "%s_locks", ls->ls_name);
ls->ls_debug_locks_dentry = debugfs_create_file(name,
0644,
@@ -996,8 +996,7 @@ void dlm_create_debug_file(struct dlm_ls *ls)
/* format 3 */
- memset(name, 0, sizeof(name));
- snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_all", ls->ls_name);
+ snprintf(name, sizeof(name), "%s_all", ls->ls_name);
ls->ls_debug_all_dentry = debugfs_create_file(name,
S_IFREG | S_IRUGO,
@@ -1007,8 +1006,7 @@ void dlm_create_debug_file(struct dlm_ls *ls)
/* format 4 */
- memset(name, 0, sizeof(name));
- snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_toss", ls->ls_name);
+ snprintf(name, sizeof(name), "%s_toss", ls->ls_name);
ls->ls_debug_toss_dentry = debugfs_create_file(name,
S_IFREG | S_IRUGO,
@@ -1016,8 +1014,7 @@ void dlm_create_debug_file(struct dlm_ls *ls)
ls,
&format4_fops);
- memset(name, 0, sizeof(name));
- snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_waiters", ls->ls_name);
+ snprintf(name, sizeof(name), "%s_waiters", ls->ls_name);
ls->ls_debug_waiters_dentry = debugfs_create_file(name,
0644,
@@ -1027,8 +1024,7 @@ void dlm_create_debug_file(struct dlm_ls *ls)
/* format 5 */
- memset(name, 0, sizeof(name));
- snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_queued_asts", ls->ls_name);
+ snprintf(name, sizeof(name), "%s_queued_asts", ls->ls_name);
ls->ls_debug_queued_asts_dentry = debugfs_create_file(name,
0644,
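
The dlm debugfs fix above sizes the name buffer from the longest suffix literal and lets snprintf() bound the write, which removes the memset() calls and the magic '+ 8'. A standalone sketch of the sizing trick; DLM_LOCKSPACE_LEN here is an illustrative value, not necessarily the kernel's:

#include <stdio.h>

#define DLM_LOCKSPACE_LEN 64	/* illustrative value only */

int main(void)
{
	const char *ls_name = "myls";
	/*
	 * sizeof("_queued_asts") counts the trailing NUL, so the buffer is
	 * guaranteed to hold the longest base name plus suffix plus
	 * terminator.
	 */
	char name[DLM_LOCKSPACE_LEN + sizeof("_queued_asts")];

	snprintf(name, sizeof(name), "%s%s", ls_name, "_queued_asts");
	printf("%s (buffer %zu bytes)\n", name, sizeof(name));
	return 0;
}
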
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index f7bc22e74db2..67f8dd8a05ef 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -63,6 +63,7 @@
#include "config.h"
#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(5000)
+#define DLM_MAX_PROCESS_BUFFERS 24
#define NEEDED_RMEM (4*1024*1024)
struct connection {
@@ -194,6 +195,7 @@ static const struct dlm_proto_ops *dlm_proto_ops;
#define DLM_IO_END 1
#define DLM_IO_EOF 2
#define DLM_IO_RESCHED 3
+#define DLM_IO_FLUSH 4
static void process_recv_sockets(struct work_struct *work);
static void process_send_sockets(struct work_struct *work);
@@ -202,6 +204,7 @@ static void process_dlm_messages(struct work_struct *work);
static DECLARE_WORK(process_work, process_dlm_messages);
static DEFINE_SPINLOCK(processqueue_lock);
static bool process_dlm_messages_pending;
+static atomic_t processqueue_count;
static LIST_HEAD(processqueue);
bool dlm_lowcomms_is_running(void)
@@ -874,6 +877,7 @@ static void process_dlm_messages(struct work_struct *work)
}
list_del(&pentry->list);
+ atomic_dec(&processqueue_count);
spin_unlock(&processqueue_lock);
for (;;) {
@@ -891,6 +895,7 @@ static void process_dlm_messages(struct work_struct *work)
}
list_del(&pentry->list);
+ atomic_dec(&processqueue_count);
spin_unlock(&processqueue_lock);
}
}
@@ -962,6 +967,7 @@ again:
con->rx_leftover);
spin_lock(&processqueue_lock);
+ ret = atomic_inc_return(&processqueue_count);
list_add_tail(&pentry->list, &processqueue);
if (!process_dlm_messages_pending) {
process_dlm_messages_pending = true;
@@ -969,6 +975,9 @@ again:
}
spin_unlock(&processqueue_lock);
+ if (ret > DLM_MAX_PROCESS_BUFFERS)
+ return DLM_IO_FLUSH;
+
return DLM_IO_SUCCESS;
}
@@ -1503,6 +1512,9 @@ static void process_recv_sockets(struct work_struct *work)
wake_up(&con->shutdown_wait);
/* CF_RECV_PENDING cleared */
break;
+ case DLM_IO_FLUSH:
+ flush_workqueue(process_workqueue);
+ fallthrough;
case DLM_IO_RESCHED:
cond_resched();
queue_work(io_workqueue, &con->rwork);
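
The lowcomms change above counts queued receive buffers and returns DLM_IO_FLUSH once DLM_MAX_PROCESS_BUFFERS is exceeded, so the receive worker flushes the processing workqueue before queueing more. A minimal userspace model of that backpressure pattern using C11 atomics (names and the flush handling are stand-ins):

#include <stdatomic.h>
#include <stdio.h>

#define MAX_PROCESS_BUFFERS 24	/* mirrors DLM_MAX_PROCESS_BUFFERS above */

static atomic_int processqueue_count;

enum io_ret { IO_SUCCESS, IO_FLUSH };

static enum io_ret queue_buffer(void)
{
	int count = atomic_fetch_add(&processqueue_count, 1) + 1;

	/* ... list_add_tail() and queue_work() would happen here ... */
	return count > MAX_PROCESS_BUFFERS ? IO_FLUSH : IO_SUCCESS;
}

static void process_one(void)
{
	atomic_fetch_sub(&processqueue_count, 1);
	/* ... actual message processing ... */
}

int main(void)
{
	int flushes = 0;

	for (int i = 0; i < 30; i++)
		if (queue_buffer() == IO_FLUSH)
			flushes++;	/* caller would flush the workqueue */
	for (int i = 0; i < 30; i++)
		process_one();
	printf("flush requested %d times\n", flushes);
	return 0;
}
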
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index f641b36a36db..2247ebb61be1 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -337,13 +337,21 @@ static struct midcomms_node *nodeid2node(int nodeid)
int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
{
- int ret, r = nodeid_hash(nodeid);
+ int ret, idx, r = nodeid_hash(nodeid);
struct midcomms_node *node;
ret = dlm_lowcomms_addr(nodeid, addr, len);
if (ret)
return ret;
+ idx = srcu_read_lock(&nodes_srcu);
+ node = __find_node(nodeid, r);
+ if (node) {
+ srcu_read_unlock(&nodes_srcu, idx);
+ return 0;
+ }
+ srcu_read_unlock(&nodes_srcu, idx);
+
node = kmalloc(sizeof(*node), GFP_NOFS);
if (!node)
return -ENOMEM;
@@ -1030,15 +1038,15 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
break;
case DLM_VERSION_3_2:
+ /* send ack back if necessary */
+ dlm_send_ack_threshold(node, DLM_SEND_ACK_BACK_MSG_THRESHOLD);
+
msg = dlm_midcomms_get_msg_3_2(mh, nodeid, len, allocation,
ppc);
if (!msg) {
dlm_free_mhandle(mh);
goto err;
}
-
- /* send ack back if necessary */
- dlm_send_ack_threshold(node, DLM_SEND_ACK_BACK_MSG_THRESHOLD);
break;
default:
dlm_free_mhandle(mh);
@@ -1260,12 +1268,23 @@ void dlm_midcomms_remove_member(int nodeid)
idx = srcu_read_lock(&nodes_srcu);
node = nodeid2node(nodeid);
- if (WARN_ON_ONCE(!node)) {
+	/* in case dlm_midcomms_close() already removed the node */
+ if (!node) {
srcu_read_unlock(&nodes_srcu, idx);
return;
}
spin_lock(&node->state_lock);
+	/* handle the case where dlm_midcomms_addr() created the node
+	 * but it was never added as a member because
+	 * dlm_midcomms_close() removed it
+ */
+ if (!node->users) {
+ spin_unlock(&node->state_lock);
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+ }
+
node->users--;
pr_debug("node %d users dec count %d\n", nodeid, node->users);
@@ -1386,10 +1405,16 @@ void dlm_midcomms_shutdown(void)
midcomms_shutdown(node);
}
}
- srcu_read_unlock(&nodes_srcu, idx);
- mutex_unlock(&close_lock);
dlm_lowcomms_shutdown();
+
+ for (i = 0; i < CONN_HASH_SIZE; i++) {
+ hlist_for_each_entry_rcu(node, &node_hash[i], hlist) {
+ midcomms_node_reset(node);
+ }
+ }
+ srcu_read_unlock(&nodes_srcu, idx);
+ mutex_unlock(&close_lock);
}
int dlm_midcomms_close(int nodeid)
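
The midcomms change above makes dlm_midcomms_addr() idempotent by looking the nodeid up before allocating a new node. A simple userspace sketch of the lookup-before-insert pattern on a linked list (no SRCU or hashing, purely illustrative):

#include <stdio.h>
#include <stdlib.h>

struct node {
	int nodeid;
	struct node *next;
};

static struct node *nodes;

static struct node *find_node(int nodeid)
{
	for (struct node *n = nodes; n; n = n->next)
		if (n->nodeid == nodeid)
			return n;
	return NULL;
}

static int add_node(int nodeid)
{
	struct node *n;

	if (find_node(nodeid))
		return 0;	/* already known: nothing to allocate */

	n = malloc(sizeof(*n));
	if (!n)
		return -1;
	n->nodeid = nodeid;
	n->next = nodes;
	nodes = n;
	return 0;
}

int main(void)
{
	add_node(1);
	add_node(1);	/* second call is a no-op instead of a duplicate */
	printf("node 1 %s\n", find_node(1) ? "present" : "missing");
	return 0;
}
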
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index f2ed0c0266cb..c586c5db18b5 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -702,6 +702,6 @@ int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
loff_t offset);
-extern const struct xattr_handler *ecryptfs_xattr_handlers[];
+extern const struct xattr_handler * const ecryptfs_xattr_handlers[];
#endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 992d9c7e64ae..b0e8774c435a 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -998,6 +998,14 @@ static int ecryptfs_getattr_link(struct mnt_idmap *idmap,
return rc;
}
+static int ecryptfs_do_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags)
+{
+ if (flags & AT_GETATTR_NOSEC)
+ return vfs_getattr_nosec(path, stat, request_mask, flags);
+ return vfs_getattr(path, stat, request_mask, flags);
+}
+
static int ecryptfs_getattr(struct mnt_idmap *idmap,
const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags)
@@ -1006,8 +1014,8 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap,
struct kstat lower_stat;
int rc;
- rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat,
- request_mask, flags);
+ rc = ecryptfs_do_getattr(ecryptfs_dentry_to_lower_path(dentry),
+ &lower_stat, request_mask, flags);
if (!rc) {
fsstack_copy_attr_all(d_inode(dentry),
ecryptfs_inode_to_lower(d_inode(dentry)));
@@ -1210,7 +1218,7 @@ static const struct xattr_handler ecryptfs_xattr_handler = {
.set = ecryptfs_xattr_set,
};
-const struct xattr_handler *ecryptfs_xattr_handlers[] = {
+const struct xattr_handler * const ecryptfs_xattr_handlers[] = {
&ecryptfs_xattr_handler,
NULL
};
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 59b52718a3a2..7e9961639802 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -51,7 +51,7 @@ static ssize_t efivarfs_file_write(struct file *file,
} else {
inode_lock(inode);
i_size_write(inode, datasize + sizeof(attributes));
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
inode_unlock(inode);
}
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index db9231f0e77b..91290fe4a70b 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -21,11 +21,15 @@ struct inode *efivarfs_get_inode(struct super_block *sb,
dev_t dev, bool is_removable)
{
struct inode *inode = new_inode(sb);
+ struct efivarfs_fs_info *fsi = sb->s_fs_info;
+ struct efivarfs_mount_opts *opts = &fsi->mount_opts;
if (inode) {
+ inode->i_uid = opts->uid;
+ inode->i_gid = opts->gid;
inode->i_ino = get_next_ino();
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_flags = is_removable ? 0 : S_IMMUTABLE;
switch (mode & S_IFMT) {
case S_IFREG:
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
index 8ebf3a6a8aa2..c66647f5c0bd 100644
--- a/fs/efivarfs/internal.h
+++ b/fs/efivarfs/internal.h
@@ -9,6 +9,15 @@
#include <linux/list.h>
#include <linux/efi.h>
+struct efivarfs_mount_opts {
+ kuid_t uid;
+ kgid_t gid;
+};
+
+struct efivarfs_fs_info {
+ struct efivarfs_mount_opts mount_opts;
+};
+
struct efi_variable {
efi_char16_t VariableName[EFI_VAR_NAME_LEN/sizeof(efi_char16_t)];
efi_guid_t VendorGuid;
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 996271473609..77240953a92e 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -8,6 +8,7 @@
#include <linux/efi.h>
#include <linux/fs.h>
#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/ucs2_string.h>
@@ -24,12 +25,28 @@ static void efivarfs_evict_inode(struct inode *inode)
clear_inode(inode);
}
+static int efivarfs_show_options(struct seq_file *m, struct dentry *root)
+{
+ struct super_block *sb = root->d_sb;
+ struct efivarfs_fs_info *sbi = sb->s_fs_info;
+ struct efivarfs_mount_opts *opts = &sbi->mount_opts;
+
+ if (!uid_eq(opts->uid, GLOBAL_ROOT_UID))
+ seq_printf(m, ",uid=%u",
+ from_kuid_munged(&init_user_ns, opts->uid));
+ if (!gid_eq(opts->gid, GLOBAL_ROOT_GID))
+ seq_printf(m, ",gid=%u",
+ from_kgid_munged(&init_user_ns, opts->gid));
+ return 0;
+}
+
static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
const u32 attr = EFI_VARIABLE_NON_VOLATILE |
EFI_VARIABLE_BOOTSERVICE_ACCESS |
EFI_VARIABLE_RUNTIME_ACCESS;
u64 storage_space, remaining_space, max_variable_size;
+ u64 id = huge_encode_dev(dentry->d_sb->s_dev);
efi_status_t status;
/* Some UEFI firmware does not implement QueryVariableInfo() */
@@ -53,6 +70,7 @@ static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_blocks = storage_space;
buf->f_bfree = remaining_space;
buf->f_type = dentry->d_sb->s_magic;
+ buf->f_fsid = u64_to_fsid(id);
/*
* In f_bavail we declare the free space that the kernel will allow writing
@@ -70,6 +88,7 @@ static const struct super_operations efivarfs_ops = {
.statfs = efivarfs_statfs,
.drop_inode = generic_delete_inode,
.evict_inode = efivarfs_evict_inode,
+ .show_options = efivarfs_show_options,
};
/*
@@ -231,6 +250,45 @@ static int efivarfs_destroy(struct efivar_entry *entry, void *data)
return 0;
}
+enum {
+ Opt_uid, Opt_gid,
+};
+
+static const struct fs_parameter_spec efivarfs_parameters[] = {
+ fsparam_u32("uid", Opt_uid),
+ fsparam_u32("gid", Opt_gid),
+ {},
+};
+
+static int efivarfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct efivarfs_fs_info *sbi = fc->s_fs_info;
+ struct efivarfs_mount_opts *opts = &sbi->mount_opts;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, efivarfs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_uid:
+ opts->uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(opts->uid))
+ return -EINVAL;
+ break;
+ case Opt_gid:
+ opts->gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(opts->gid))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct inode *inode = NULL;
@@ -277,10 +335,21 @@ static int efivarfs_get_tree(struct fs_context *fc)
static const struct fs_context_operations efivarfs_context_ops = {
.get_tree = efivarfs_get_tree,
+ .parse_param = efivarfs_parse_param,
};
static int efivarfs_init_fs_context(struct fs_context *fc)
{
+ struct efivarfs_fs_info *sfi;
+
+ sfi = kzalloc(sizeof(*sfi), GFP_KERNEL);
+ if (!sfi)
+ return -ENOMEM;
+
+ sfi->mount_opts.uid = GLOBAL_ROOT_UID;
+ sfi->mount_opts.gid = GLOBAL_ROOT_GID;
+
+ fc->s_fs_info = sfi;
fc->ops = &efivarfs_context_ops;
return 0;
}
@@ -301,6 +370,7 @@ static struct file_system_type efivarfs_type = {
.name = "efivarfs",
.init_fs_context = efivarfs_init_fs_context,
.kill_sb = efivarfs_kill_sb,
+ .parameters = efivarfs_parameters,
};
static __init int efivarfs_init(void)
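
The efivarfs changes above add uid= and gid= mount options parsed through fs_parse(), defaulting to GLOBAL_ROOT_UID/GID. A rough userspace sketch of equivalent option parsing with the same defaults (the string handling below is a stand-in for the fs_context machinery). With the real patch, a mount invocation along the lines of 'mount -t efivarfs -o uid=1000,gid=1000 efivarfs /sys/firmware/efi/efivars' should hand ownership of the variable files to that user.

#include <stdio.h>
#include <string.h>

struct mount_opts {
	unsigned int uid, gid;
};

static int parse_options(char *options, struct mount_opts *opts)
{
	char *opt;

	opts->uid = 0;	/* defaults correspond to GLOBAL_ROOT_UID/GID */
	opts->gid = 0;
	for (opt = strtok(options, ","); opt; opt = strtok(NULL, ",")) {
		if (sscanf(opt, "uid=%u", &opts->uid) == 1)
			continue;
		if (sscanf(opt, "gid=%u", &opts->gid) == 1)
			continue;
		return -1;	/* unknown option */
	}
	return 0;
}

int main(void)
{
	char options[] = "uid=1000,gid=1000";
	struct mount_opts opts;

	if (!parse_options(options, &opts))
		printf("uid=%u gid=%u\n", opts.uid, opts.gid);
	return 0;
}
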
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 3789d22ba501..7844ab24b813 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -103,10 +103,9 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
i_uid_write(inode, (uid_t)be16_to_cpu(efs_inode->di_uid));
i_gid_write(inode, (gid_t)be16_to_cpu(efs_inode->di_gid));
inode->i_size = be32_to_cpu(efs_inode->di_size);
- inode->i_atime.tv_sec = be32_to_cpu(efs_inode->di_atime);
- inode->i_mtime.tv_sec = be32_to_cpu(efs_inode->di_mtime);
+ inode_set_atime(inode, be32_to_cpu(efs_inode->di_atime), 0);
+ inode_set_mtime(inode, be32_to_cpu(efs_inode->di_mtime), 0);
inode_set_ctime(inode, be32_to_cpu(efs_inode->di_ctime), 0);
- inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
/* this is the number of blocks in the file */
if (inode->i_size == 0) {
diff --git a/fs/efs/super.c b/fs/efs/super.c
index b287f47c165b..f17fdac76b2e 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -123,6 +123,7 @@ static const struct super_operations efs_superblock_operations = {
};
static const struct export_operations efs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = efs_fh_to_dentry,
.fh_to_parent = efs_fh_to_parent,
.get_parent = efs_get_parent,
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index f6dc961e6c2b..1d318f85232d 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -21,7 +21,7 @@ config EROFS_FS
performance under extremely memory pressure without extra cost.
See the documentation at <file:Documentation/filesystems/erofs.rst>
- for more details.
+ and the web pages at <https://erofs.docs.kernel.org> for more details.
If unsure, say N.
@@ -91,13 +91,10 @@ config EROFS_FS_ZIP_LZMA
select XZ_DEC_MICROLZMA
help
Saying Y here includes support for reading EROFS file systems
- containing LZMA compressed data, specifically called microLZMA. it
- gives better compression ratios than the LZ4 algorithm, at the
+ containing LZMA compressed data, specifically called microLZMA. It
+ gives better compression ratios than the default LZ4 format, at the
expense of more CPU overhead.
- LZMA support is an experimental feature for now and so most file
- systems will be readable without selecting this option.
-
If unsure, say N.
config EROFS_FS_ZIP_DEFLATE
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 349c3316ae6b..279933e007d2 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -21,6 +21,8 @@ struct z_erofs_decompress_req {
};
struct z_erofs_decompressor {
+ int (*config)(struct super_block *sb, struct erofs_super_block *dsb,
+ void *data, int size);
int (*decompress)(struct z_erofs_decompress_req *rq,
struct page **pagepool);
char *name;
@@ -92,6 +94,10 @@ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
extern const struct z_erofs_decompressor erofs_decompressors[];
/* prototypes for specific algorithms */
+int z_erofs_load_lzma_config(struct super_block *sb,
+ struct erofs_super_block *dsb, void *data, int size);
+int z_erofs_load_deflate_config(struct super_block *sb,
+ struct erofs_super_block *dsb, void *data, int size);
int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
struct page **pagepool);
int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 0c2c99c58b5e..c98aeda8abb2 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -5,9 +5,7 @@
* Copyright (C) 2021, Alibaba Cloud
*/
#include "internal.h"
-#include <linux/prefetch.h>
#include <linux/sched/mm.h>
-#include <linux/dax.h>
#include <trace/events/erofs.h>
void erofs_unmap_metabuf(struct erofs_buf *buf)
@@ -222,7 +220,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
up_read(&devs->rwsem);
return 0;
}
- map->m_bdev = dif->bdev;
+ map->m_bdev = dif->bdev_handle ? dif->bdev_handle->bdev : NULL;
map->m_daxdev = dif->dax_dev;
map->m_dax_part_off = dif->dax_part_off;
map->m_fscache = dif->fscache;
@@ -240,7 +238,8 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
if (map->m_pa >= startoff &&
map->m_pa < startoff + length) {
map->m_pa -= startoff;
- map->m_bdev = dif->bdev;
+ map->m_bdev = dif->bdev_handle ?
+ dif->bdev_handle->bdev : NULL;
map->m_daxdev = dif->dax_dev;
map->m_dax_part_off = dif->dax_part_off;
map->m_fscache = dif->fscache;
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 332ec5f74002..021be5feb1bc 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -4,7 +4,6 @@
* https://www.huawei.com/
*/
#include "compress.h"
-#include <linux/module.h>
#include <linux/lz4.h>
#ifndef LZ4_DISTANCE_MAX /* history window size */
@@ -24,11 +23,11 @@ struct z_erofs_lz4_decompress_ctx {
unsigned int oend;
};
-int z_erofs_load_lz4_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_lz4_cfgs *lz4, int size)
+static int z_erofs_load_lz4_config(struct super_block *sb,
+ struct erofs_super_block *dsb, void *data, int size)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct z_erofs_lz4_cfgs *lz4 = data;
u16 distance;
if (lz4) {
@@ -370,19 +369,75 @@ const struct z_erofs_decompressor erofs_decompressors[] = {
.name = "interlaced"
},
[Z_EROFS_COMPRESSION_LZ4] = {
+ .config = z_erofs_load_lz4_config,
.decompress = z_erofs_lz4_decompress,
.name = "lz4"
},
#ifdef CONFIG_EROFS_FS_ZIP_LZMA
[Z_EROFS_COMPRESSION_LZMA] = {
+ .config = z_erofs_load_lzma_config,
.decompress = z_erofs_lzma_decompress,
.name = "lzma"
},
#endif
#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE
[Z_EROFS_COMPRESSION_DEFLATE] = {
+ .config = z_erofs_load_deflate_config,
.decompress = z_erofs_deflate_decompress,
.name = "deflate"
},
#endif
};
+
+int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ unsigned int algs, alg;
+ erofs_off_t offset;
+ int size, ret = 0;
+
+ if (!erofs_sb_has_compr_cfgs(sbi)) {
+ sbi->available_compr_algs = Z_EROFS_COMPRESSION_LZ4;
+ return z_erofs_load_lz4_config(sb, dsb, NULL, 0);
+ }
+
+ sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs);
+ if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) {
+ erofs_err(sb, "unidentified algorithms %x, please upgrade kernel",
+ sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS);
+ return -EOPNOTSUPP;
+ }
+
+ erofs_init_metabuf(&buf, sb);
+ offset = EROFS_SUPER_OFFSET + sbi->sb_size;
+ alg = 0;
+ for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) {
+ void *data;
+
+ if (!(algs & 1))
+ continue;
+
+ data = erofs_read_metadata(sb, &buf, &offset, &size);
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
+ break;
+ }
+
+ if (alg >= ARRAY_SIZE(erofs_decompressors) ||
+ !erofs_decompressors[alg].config) {
+ erofs_err(sb, "algorithm %d isn't enabled on this kernel",
+ alg);
+ ret = -EOPNOTSUPP;
+ } else {
+ ret = erofs_decompressors[alg].config(sb,
+ dsb, data, size);
+ }
+
+ kfree(data);
+ if (ret)
+ break;
+ }
+ erofs_put_metabuf(&buf);
+ return ret;
+}
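
z_erofs_parse_cfgs() above walks the on-disk bitmap of available algorithms and dispatches each per-algorithm blob to the matching .config hook in erofs_decompressors[], failing cleanly when an algorithm is not built in. A compact userspace sketch of that table-driven dispatch (the table entries and payload size are made up):

#include <stdio.h>

struct decompressor {
	int (*config)(int size);
	const char *name;
};

static int lz4_config(int size)  { printf("lz4 cfg (%d bytes)\n", size);  return 0; }
static int lzma_config(int size) { printf("lzma cfg (%d bytes)\n", size); return 0; }

static const struct decompressor decompressors[] = {
	[0] = { .config = lz4_config,  .name = "lz4" },
	[1] = { .config = lzma_config, .name = "lzma" },
	/* an algorithm compiled out simply has .config == NULL */
	[2] = { .name = "deflate" },
};

static int parse_cfgs(unsigned int available_algs)
{
	unsigned int algs, alg = 0;

	for (algs = available_algs; algs; algs >>= 1, alg++) {
		if (!(algs & 1))
			continue;
		if (alg >= sizeof(decompressors) / sizeof(decompressors[0]) ||
		    !decompressors[alg].config)
			return -1;	/* not enabled in this build */
		if (decompressors[alg].config(8))
			return -1;
	}
	return 0;
}

int main(void)
{
	printf("ret=%d\n", parse_cfgs(0x3));	/* lz4 + lzma present */
	return 0;
}
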
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
index 19e5bdeb30b6..daf3c1bdeab8 100644
--- a/fs/erofs/decompressor_deflate.c
+++ b/fs/erofs/decompressor_deflate.c
@@ -1,5 +1,4 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-#include <linux/module.h>
#include <linux/zlib.h>
#include "compress.h"
@@ -77,9 +76,10 @@ out_failed:
}
int z_erofs_load_deflate_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_deflate_cfgs *dfl, int size)
+ struct erofs_super_block *dsb, void *data, int size)
{
+ struct z_erofs_deflate_cfgs *dfl = data;
+
if (!dfl || size < sizeof(struct z_erofs_deflate_cfgs)) {
erofs_err(sb, "invalid deflate cfgs, size=%u", size);
return -EINVAL;
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index dee10d22ada9..2dd14f99c1dc 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/xz.h>
-#include <linux/module.h>
#include "compress.h"
struct z_erofs_lzma {
@@ -72,10 +71,10 @@ int __init z_erofs_lzma_init(void)
}
int z_erofs_load_lzma_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_lzma_cfgs *lzma, int size)
+ struct erofs_super_block *dsb, void *data, int size)
{
static DEFINE_MUTEX(lzma_resize_mutex);
+ struct z_erofs_lzma_cfgs *lzma = data;
unsigned int dict_size, i;
struct z_erofs_lzma *strm, *head = NULL;
int err;
@@ -96,8 +95,6 @@ int z_erofs_load_lzma_config(struct super_block *sb,
return -EINVAL;
}
- erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!");
-
/* in case 2 z_erofs_load_lzma_config() race to avoid deadlock */
mutex_lock(&lzma_resize_mutex);
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index edc8ec7581b8..14a79d3226ab 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -15,11 +15,11 @@ static void *erofs_read_inode(struct erofs_buf *buf,
struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_inode *vi = EROFS_I(inode);
const erofs_off_t inode_loc = erofs_iloc(inode);
-
erofs_blk_t blkaddr, nblks = 0;
void *kaddr;
struct erofs_inode_compact *dic;
struct erofs_inode_extended *die, *copied = NULL;
+ union erofs_inode_i_u iu;
unsigned int ifmt;
int err;
@@ -35,9 +35,8 @@ static void *erofs_read_inode(struct erofs_buf *buf,
dic = kaddr + *ofs;
ifmt = le16_to_cpu(dic->i_format);
-
if (ifmt & ~EROFS_I_ALL) {
- erofs_err(inode->i_sb, "unsupported i_format %u of nid %llu",
+ erofs_err(sb, "unsupported i_format %u of nid %llu",
ifmt, vi->nid);
err = -EOPNOTSUPP;
goto err_out;
@@ -45,7 +44,7 @@ static void *erofs_read_inode(struct erofs_buf *buf,
vi->datalayout = erofs_inode_datalayout(ifmt);
if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) {
- erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu",
+ erofs_err(sb, "unsupported datalayout %u of nid %llu",
vi->datalayout, vi->nid);
err = -EOPNOTSUPP;
goto err_out;
@@ -82,40 +81,15 @@ static void *erofs_read_inode(struct erofs_buf *buf,
vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount);
inode->i_mode = le16_to_cpu(die->i_mode);
- switch (inode->i_mode & S_IFMT) {
- case S_IFREG:
- case S_IFDIR:
- case S_IFLNK:
- vi->raw_blkaddr = le32_to_cpu(die->i_u.raw_blkaddr);
- break;
- case S_IFCHR:
- case S_IFBLK:
- inode->i_rdev =
- new_decode_dev(le32_to_cpu(die->i_u.rdev));
- break;
- case S_IFIFO:
- case S_IFSOCK:
- inode->i_rdev = 0;
- break;
- default:
- goto bogusimode;
- }
+ iu = die->i_u;
i_uid_write(inode, le32_to_cpu(die->i_uid));
i_gid_write(inode, le32_to_cpu(die->i_gid));
set_nlink(inode, le32_to_cpu(die->i_nlink));
-
- /* extended inode has its own timestamp */
+ /* each extended inode has its own timestamp */
inode_set_ctime(inode, le64_to_cpu(die->i_mtime),
le32_to_cpu(die->i_mtime_nsec));
inode->i_size = le64_to_cpu(die->i_size);
-
- /* total blocks for compressed files */
- if (erofs_inode_is_data_compressed(vi->datalayout))
- nblks = le32_to_cpu(die->i_u.compressed_blocks);
- else if (vi->datalayout == EROFS_INODE_CHUNK_BASED)
- /* fill chunked inode summary info */
- vi->chunkformat = le16_to_cpu(die->i_u.c.format);
kfree(copied);
copied = NULL;
break;
@@ -125,49 +99,51 @@ static void *erofs_read_inode(struct erofs_buf *buf,
vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount);
inode->i_mode = le16_to_cpu(dic->i_mode);
- switch (inode->i_mode & S_IFMT) {
- case S_IFREG:
- case S_IFDIR:
- case S_IFLNK:
- vi->raw_blkaddr = le32_to_cpu(dic->i_u.raw_blkaddr);
- break;
- case S_IFCHR:
- case S_IFBLK:
- inode->i_rdev =
- new_decode_dev(le32_to_cpu(dic->i_u.rdev));
- break;
- case S_IFIFO:
- case S_IFSOCK:
- inode->i_rdev = 0;
- break;
- default:
- goto bogusimode;
- }
+ iu = dic->i_u;
i_uid_write(inode, le16_to_cpu(dic->i_uid));
i_gid_write(inode, le16_to_cpu(dic->i_gid));
set_nlink(inode, le16_to_cpu(dic->i_nlink));
-
/* use build time for compact inodes */
inode_set_ctime(inode, sbi->build_time, sbi->build_time_nsec);
inode->i_size = le32_to_cpu(dic->i_size);
- if (erofs_inode_is_data_compressed(vi->datalayout))
- nblks = le32_to_cpu(dic->i_u.compressed_blocks);
- else if (vi->datalayout == EROFS_INODE_CHUNK_BASED)
- vi->chunkformat = le16_to_cpu(dic->i_u.c.format);
break;
default:
- erofs_err(inode->i_sb,
- "unsupported on-disk inode version %u of nid %llu",
+ erofs_err(sb, "unsupported on-disk inode version %u of nid %llu",
erofs_inode_version(ifmt), vi->nid);
err = -EOPNOTSUPP;
goto err_out;
}
- if (vi->datalayout == EROFS_INODE_CHUNK_BASED) {
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFLNK:
+ vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr);
+ break;
+ case S_IFCHR:
+ case S_IFBLK:
+ inode->i_rdev = new_decode_dev(le32_to_cpu(iu.rdev));
+ break;
+ case S_IFIFO:
+ case S_IFSOCK:
+ inode->i_rdev = 0;
+ break;
+ default:
+ erofs_err(sb, "bogus i_mode (%o) @ nid %llu", inode->i_mode,
+ vi->nid);
+ err = -EFSCORRUPTED;
+ goto err_out;
+ }
+
+ /* total blocks for compressed files */
+ if (erofs_inode_is_data_compressed(vi->datalayout)) {
+ nblks = le32_to_cpu(iu.compressed_blocks);
+ } else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) {
+ /* fill chunked inode summary info */
+ vi->chunkformat = le16_to_cpu(iu.c.format);
if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) {
- erofs_err(inode->i_sb,
- "unsupported chunk format %x of nid %llu",
+ erofs_err(sb, "unsupported chunk format %x of nid %llu",
vi->chunkformat, vi->nid);
err = -EOPNOTSUPP;
goto err_out;
@@ -175,7 +151,8 @@ static void *erofs_read_inode(struct erofs_buf *buf,
vi->chunkbits = sb->s_blocksize_bits +
(vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK);
}
- inode->i_mtime = inode->i_atime = inode_get_ctime(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_get_ctime(inode)));
inode->i_flags &= ~S_DAX;
if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
@@ -190,10 +167,6 @@ static void *erofs_read_inode(struct erofs_buf *buf,
inode->i_blocks = nblks << (sb->s_blocksize_bits - 9);
return kaddr;
-bogusimode:
- erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu",
- inode->i_mode, vi->nid);
- err = -EFSCORRUPTED;
err_out:
DBG_BUGON(1);
kfree(copied);
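
The erofs_read_inode() refactor above captures the on-disk i_u union once and moves the mode-type handling into a single switch after the version-specific branches. For reference, a tiny standalone example of classifying a mode with the same S_IFMT switch shape:

#include <stdio.h>
#include <sys/stat.h>

static const char *classify(unsigned int mode)
{
	switch (mode & S_IFMT) {
	case S_IFREG:
	case S_IFDIR:
	case S_IFLNK:
		return "uses a raw block address";
	case S_IFCHR:
	case S_IFBLK:
		return "uses a device number";
	case S_IFIFO:
	case S_IFSOCK:
		return "has no payload";
	default:
		return "bogus i_mode";
	}
}

int main(void)
{
	printf("%s\n", classify(S_IFREG | 0644));
	printf("%s\n", classify(0));	/* corrupted mode -> rejected */
	return 0;
}
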
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 4ff88d0dd980..b0409badb017 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -8,8 +8,10 @@
#define __EROFS_INTERNAL_H
#include <linux/fs.h>
+#include <linux/dax.h>
#include <linux/dcache.h>
#include <linux/mm.h>
+#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/magic.h>
@@ -47,7 +49,7 @@ typedef u32 erofs_blk_t;
struct erofs_device_info {
char *path;
struct erofs_fscache *fscache;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
struct dax_device *dax_dev;
u64 dax_part_off;
@@ -228,8 +230,6 @@ struct erofs_buf {
};
#define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL })
-#define ROOT_NID(sb) ((sb)->root_nid)
-
#define erofs_blknr(sb, addr) ((addr) >> (sb)->s_blocksize_bits)
#define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1))
#define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits)
@@ -469,9 +469,6 @@ int __init z_erofs_init_zip_subsystem(void);
void z_erofs_exit_zip_subsystem(void);
int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
struct erofs_workgroup *egrp);
-int z_erofs_load_lz4_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_lz4_cfgs *lz4, int len);
int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
int flags);
void *erofs_get_pcpubuf(unsigned int requiredpages);
@@ -480,6 +477,7 @@ int erofs_pcpubuf_growsize(unsigned int nrpages);
void __init erofs_pcpubuf_init(void);
void erofs_pcpubuf_exit(void);
int erofs_init_managed_cache(struct super_block *sb);
+int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb);
#else
static inline void erofs_shrinker_register(struct super_block *sb) {}
static inline void erofs_shrinker_unregister(struct super_block *sb) {}
@@ -487,16 +485,6 @@ static inline int erofs_init_shrinker(void) { return 0; }
static inline void erofs_exit_shrinker(void) {}
static inline int z_erofs_init_zip_subsystem(void) { return 0; }
static inline void z_erofs_exit_zip_subsystem(void) {}
-static inline int z_erofs_load_lz4_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_lz4_cfgs *lz4, int len)
-{
- if (lz4 || dsb->u1.lz4_max_distance) {
- erofs_err(sb, "lz4 algorithm isn't enabled");
- return -EINVAL;
- }
- return 0;
-}
static inline void erofs_pcpubuf_init(void) {}
static inline void erofs_pcpubuf_exit(void) {}
static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; }
@@ -505,41 +493,17 @@ static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; }
#ifdef CONFIG_EROFS_FS_ZIP_LZMA
int __init z_erofs_lzma_init(void);
void z_erofs_lzma_exit(void);
-int z_erofs_load_lzma_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_lzma_cfgs *lzma, int size);
#else
static inline int z_erofs_lzma_init(void) { return 0; }
static inline int z_erofs_lzma_exit(void) { return 0; }
-static inline int z_erofs_load_lzma_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_lzma_cfgs *lzma, int size) {
- if (lzma) {
- erofs_err(sb, "lzma algorithm isn't enabled");
- return -EINVAL;
- }
- return 0;
-}
#endif /* !CONFIG_EROFS_FS_ZIP_LZMA */
#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE
int __init z_erofs_deflate_init(void);
void z_erofs_deflate_exit(void);
-int z_erofs_load_deflate_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_deflate_cfgs *dfl, int size);
#else
static inline int z_erofs_deflate_init(void) { return 0; }
static inline int z_erofs_deflate_exit(void) { return 0; }
-static inline int z_erofs_load_deflate_config(struct super_block *sb,
- struct erofs_super_block *dsb,
- struct z_erofs_deflate_cfgs *dfl, int size) {
- if (dfl) {
- erofs_err(sb, "deflate algorithm isn't enabled");
- return -EINVAL;
- }
- return 0;
-}
#endif /* !CONFIG_EROFS_FS_ZIP_DEFLATE */
#ifdef CONFIG_EROFS_FS_ONDEMAND
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 3700af9ee173..3789d6224513 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -4,14 +4,11 @@
* https://www.huawei.com/
* Copyright (C) 2021, Alibaba Cloud
*/
-#include <linux/module.h>
#include <linux/statfs.h>
-#include <linux/parser.h>
#include <linux/seq_file.h>
#include <linux/crc32c.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
-#include <linux/dax.h>
#include <linux/exportfs.h>
#include "xattr.h"
@@ -156,68 +153,15 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
return buffer;
}
-#ifdef CONFIG_EROFS_FS_ZIP
-static int erofs_load_compr_cfgs(struct super_block *sb,
- struct erofs_super_block *dsb)
+#ifndef CONFIG_EROFS_FS_ZIP
+static int z_erofs_parse_cfgs(struct super_block *sb,
+ struct erofs_super_block *dsb)
{
- struct erofs_sb_info *sbi = EROFS_SB(sb);
- struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- unsigned int algs, alg;
- erofs_off_t offset;
- int size, ret = 0;
-
- sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs);
- if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) {
- erofs_err(sb, "try to load compressed fs with unsupported algorithms %x",
- sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS);
- return -EINVAL;
- }
-
- erofs_init_metabuf(&buf, sb);
- offset = EROFS_SUPER_OFFSET + sbi->sb_size;
- alg = 0;
- for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) {
- void *data;
-
- if (!(algs & 1))
- continue;
-
- data = erofs_read_metadata(sb, &buf, &offset, &size);
- if (IS_ERR(data)) {
- ret = PTR_ERR(data);
- break;
- }
+ if (!dsb->u1.available_compr_algs)
+ return 0;
- switch (alg) {
- case Z_EROFS_COMPRESSION_LZ4:
- ret = z_erofs_load_lz4_config(sb, dsb, data, size);
- break;
- case Z_EROFS_COMPRESSION_LZMA:
- ret = z_erofs_load_lzma_config(sb, dsb, data, size);
- break;
- case Z_EROFS_COMPRESSION_DEFLATE:
- ret = z_erofs_load_deflate_config(sb, dsb, data, size);
- break;
- default:
- DBG_BUGON(1);
- ret = -EFAULT;
- }
- kfree(data);
- if (ret)
- break;
- }
- erofs_put_metabuf(&buf);
- return ret;
-}
-#else
-static int erofs_load_compr_cfgs(struct super_block *sb,
- struct erofs_super_block *dsb)
-{
- if (dsb->u1.available_compr_algs) {
- erofs_err(sb, "try to load compressed fs when compression is disabled");
- return -EINVAL;
- }
- return 0;
+ erofs_err(sb, "compression disabled, unable to mount compressed EROFS");
+ return -EOPNOTSUPP;
}
#endif
@@ -227,7 +171,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_fscache *fscache;
struct erofs_deviceslot *dis;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
void *ptr;
ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP);
@@ -251,13 +195,13 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
return PTR_ERR(fscache);
dif->fscache = fscache;
} else if (!sbi->devs->flatdev) {
- bdev = blkdev_get_by_path(dif->path, BLK_OPEN_READ, sb->s_type,
- NULL);
- if (IS_ERR(bdev))
- return PTR_ERR(bdev);
- dif->bdev = bdev;
- dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off,
- NULL, NULL);
+ bdev_handle = bdev_open_by_path(dif->path, BLK_OPEN_READ,
+ sb->s_type, NULL);
+ if (IS_ERR(bdev_handle))
+ return PTR_ERR(bdev_handle);
+ dif->bdev_handle = bdev_handle;
+ dif->dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev,
+ &dif->dax_part_off, NULL, NULL);
}
dif->blocks = le32_to_cpu(dis->blocks);
@@ -406,10 +350,7 @@ static int erofs_read_superblock(struct super_block *sb)
}
/* parse on-disk compression configurations */
- if (erofs_sb_has_compr_cfgs(sbi))
- ret = erofs_load_compr_cfgs(sb, dsb);
- else
- ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0);
+ ret = z_erofs_parse_cfgs(sb, dsb);
if (ret < 0)
goto out;
@@ -626,6 +567,7 @@ static struct dentry *erofs_get_parent(struct dentry *child)
}
static const struct export_operations erofs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = erofs_fh_to_dentry,
.fh_to_parent = erofs_fh_to_parent,
.get_parent = erofs_get_parent,
@@ -724,13 +666,13 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
xa_init(&sbi->managed_pslots);
#endif
- inode = erofs_iget(sb, ROOT_NID(sbi));
+ inode = erofs_iget(sb, sbi->root_nid);
if (IS_ERR(inode))
return PTR_ERR(inode);
if (!S_ISDIR(inode->i_mode)) {
erofs_err(sb, "rootino(nid %llu) is not a directory(i_mode %o)",
- ROOT_NID(sbi), inode->i_mode);
+ sbi->root_nid, inode->i_mode);
iput(inode);
return -EINVAL;
}
@@ -760,7 +702,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
if (err)
return err;
- erofs_info(sb, "mounted with root inode @ nid %llu.", ROOT_NID(sbi));
+ erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid);
return 0;
}
@@ -806,8 +748,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
struct erofs_device_info *dif = ptr;
fs_put_dax(dif->dax_dev, NULL);
- if (dif->bdev)
- blkdev_put(dif->bdev, &erofs_fs_type);
+ if (dif->bdev_handle)
+ bdev_release(dif->bdev_handle);
erofs_fscache_unregister_cookie(dif->fscache);
dif->fscache = NULL;
kfree(dif->path);
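
The erofs/super.c hunks above move extra-device handling from blkdev_get_by_path()/blkdev_put() to the struct bdev_handle interface. A minimal sketch of that open/release pattern, assuming the transitional bdev_open_by_path()/bdev_release() API this series targets; the wrapper names are invented and only the two bdev_* calls come from the patch:

	#include <linux/fs.h>
	#include <linux/blkdev.h>

	/* illustrative wrappers, not part of the patch */
	static int example_open_extra_device(struct super_block *sb, const char *path,
					     struct bdev_handle **out)
	{
		struct bdev_handle *handle;

		/* read-only open; the filesystem type acts as the holder */
		handle = bdev_open_by_path(path, BLK_OPEN_READ, sb->s_type, NULL);
		if (IS_ERR(handle))
			return PTR_ERR(handle);

		*out = handle;		/* handle->bdev is the underlying block_device */
		return 0;
	}

	static void example_close_extra_device(struct bdev_handle *handle)
	{
		if (handle)
			bdev_release(handle);	/* replaces blkdev_put() */
	}
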
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index cc6fb9e98899..5dea308764b4 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -77,12 +77,7 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
struct erofs_sb_info *const sbi = EROFS_SB(sb);
struct erofs_workgroup *pre;
- /*
- * Bump up before making this visible to others for the XArray in order
- * to avoid potential UAF without serialized by xa_lock.
- */
- lockref_get(&grp->lockref);
-
+ DBG_BUGON(grp->lockref.count < 1);
repeat:
xa_lock(&sbi->managed_pslots);
pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
@@ -96,7 +91,6 @@ repeat:
cond_resched();
goto repeat;
}
- lockref_put_return(&grp->lockref);
grp = pre;
}
xa_unlock(&sbi->managed_pslots);
@@ -270,19 +264,24 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink,
return freed;
}
-static struct shrinker erofs_shrinker_info = {
- .scan_objects = erofs_shrink_scan,
- .count_objects = erofs_shrink_count,
- .seeks = DEFAULT_SEEKS,
-};
+static struct shrinker *erofs_shrinker_info;
int __init erofs_init_shrinker(void)
{
- return register_shrinker(&erofs_shrinker_info, "erofs-shrinker");
+ erofs_shrinker_info = shrinker_alloc(0, "erofs-shrinker");
+ if (!erofs_shrinker_info)
+ return -ENOMEM;
+
+ erofs_shrinker_info->count_objects = erofs_shrink_count;
+ erofs_shrinker_info->scan_objects = erofs_shrink_scan;
+
+ shrinker_register(erofs_shrinker_info);
+
+ return 0;
}
void erofs_exit_shrinker(void)
{
- unregister_shrinker(&erofs_shrinker_info);
+ shrinker_free(erofs_shrinker_info);
}
#endif /* !CONFIG_EROFS_FS_ZIP */
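
The erofs/utils.c change above drops the statically defined struct shrinker in favour of the dynamically allocated shrinker API. A minimal sketch of that registration pattern, assuming the shrinker_alloc()/shrinker_register()/shrinker_free() interface used by the hunk; the callbacks here are placeholders:

	#include <linux/init.h>
	#include <linux/shrinker.h>

	static struct shrinker *example_shrinker;

	static unsigned long example_count(struct shrinker *s, struct shrink_control *sc)
	{
		return 0;		/* placeholder: report number of freeable objects */
	}

	static unsigned long example_scan(struct shrinker *s, struct shrink_control *sc)
	{
		return SHRINK_STOP;	/* placeholder: free objects, return count freed */
	}

	static int __init example_shrinker_init(void)
	{
		example_shrinker = shrinker_alloc(0, "example-shrinker");
		if (!example_shrinker)
			return -ENOMEM;

		example_shrinker->count_objects = example_count;
		example_shrinker->scan_objects = example_scan;
		shrinker_register(example_shrinker);	/* now visible to memory reclaim */
		return 0;
	}

	static void example_shrinker_exit(void)
	{
		shrinker_free(example_shrinker);	/* unregisters and frees */
	}
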
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 09d341675e89..b58316b49a43 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -168,7 +168,7 @@ const struct xattr_handler __maybe_unused erofs_xattr_security_handler = {
};
#endif
-const struct xattr_handler *erofs_xattr_handlers[] = {
+const struct xattr_handler * const erofs_xattr_handlers[] = {
&erofs_xattr_user_handler,
&erofs_xattr_trusted_handler,
#ifdef CONFIG_EROFS_FS_SECURITY
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
index f16283cb8c93..b246cd0e135e 100644
--- a/fs/erofs/xattr.h
+++ b/fs/erofs/xattr.h
@@ -23,7 +23,7 @@ static inline const char *erofs_xattr_prefix(unsigned int idx,
{
const struct xattr_handler *handler = NULL;
- static const struct xattr_handler *xattr_handler_map[] = {
+ static const struct xattr_handler * const xattr_handler_map[] = {
[EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler,
#ifdef CONFIG_EROFS_FS_POSIX_ACL
[EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
@@ -44,7 +44,7 @@ static inline const char *erofs_xattr_prefix(unsigned int idx,
return xattr_prefix(handler);
}
-extern const struct xattr_handler *erofs_xattr_handlers[];
+extern const struct xattr_handler * const erofs_xattr_handlers[];
int erofs_xattr_prefixes_init(struct super_block *sb);
void erofs_xattr_prefixes_cleanup(struct super_block *sb);
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 036f610e044b..a7e6847f6f8f 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -796,6 +796,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
return PTR_ERR(pcl);
spin_lock_init(&pcl->obj.lockref.lock);
+ pcl->obj.lockref.count = 1; /* one ref for this request */
pcl->algorithmformat = map->m_algorithmformat;
pcl->length = 0;
pcl->partial = true;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1d9a71a0c4c1..2877cc01cff1 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -256,10 +256,10 @@ static u64 loop_check_gen = 0;
static struct eventpoll *inserting_into;
/* Slab cache used to allocate "struct epitem" */
-static struct kmem_cache *epi_cache __read_mostly;
+static struct kmem_cache *epi_cache __ro_after_init;
/* Slab cache used to allocate "struct eppoll_entry" */
-static struct kmem_cache *pwq_cache __read_mostly;
+static struct kmem_cache *pwq_cache __ro_after_init;
/*
* List of files with newly added links, where we may need to limit the number
@@ -271,7 +271,7 @@ struct epitems_head {
};
static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;
-static struct kmem_cache *ephead_cache __read_mostly;
+static struct kmem_cache *ephead_cache __ro_after_init;
static inline void free_ephead(struct epitems_head *head)
{
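
The eventpoll.c hunks only change an annotation: slab caches that are assigned once during boot and never written again are marked __ro_after_init instead of __read_mostly, so the pointers become read-only after init. A short hedged sketch of the pattern with an invented cache:

	#include <linux/init.h>
	#include <linux/cache.h>
	#include <linux/slab.h>

	struct example_item { int v; };

	/* written exactly once in the initcall below, then mapped read-only */
	static struct kmem_cache *example_cache __ro_after_init;

	static int __init example_cache_init(void)
	{
		example_cache = kmem_cache_create("example_item",
						  sizeof(struct example_item),
						  0, SLAB_PANIC, NULL);
		return 0;
	}
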
diff --git a/fs/exec.c b/fs/exec.c
index 6518e33ea813..4aa19b24f281 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -713,7 +713,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* process cleanup to remove whatever mess we made.
*/
if (length != move_page_tables(vma, old_start,
- vma, new_start, length, false))
+ vma, new_start, length, false, true))
return -ENOMEM;
lru_add_drain();
@@ -986,8 +986,6 @@ static int exec_mmap(struct mm_struct *mm)
tsk = current;
old_mm = current->mm;
exec_mm_release(tsk, old_mm);
- if (old_mm)
- sync_mm_rss(old_mm);
ret = down_write_killable(&tsk->signal->exec_update_lock);
if (ret)
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index e1586bba6d86..9f9295847a4e 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -287,7 +287,7 @@ get_new:
mutex_unlock(&EXFAT_SB(sb)->s_lock);
if (!dir_emit(ctx, nb->lfn, strlen(nb->lfn), inum,
- (de.attr & ATTR_SUBDIR) ? DT_DIR : DT_REG))
+ (de.attr & EXFAT_ATTR_SUBDIR) ? DT_DIR : DT_REG))
goto out;
ctx->pos = cpos;
goto get_new;
@@ -359,7 +359,7 @@ unsigned int exfat_get_entry_type(struct exfat_dentry *ep)
if (ep->type == EXFAT_VOLUME)
return TYPE_VOLUME;
if (ep->type == EXFAT_FILE) {
- if (le16_to_cpu(ep->dentry.file.attr) & ATTR_SUBDIR)
+ if (le16_to_cpu(ep->dentry.file.attr) & EXFAT_ATTR_SUBDIR)
return TYPE_DIR;
return TYPE_FILE;
}
@@ -410,19 +410,21 @@ static void exfat_set_entry_type(struct exfat_dentry *ep, unsigned int type)
ep->type = EXFAT_VOLUME;
} else if (type == TYPE_DIR) {
ep->type = EXFAT_FILE;
- ep->dentry.file.attr = cpu_to_le16(ATTR_SUBDIR);
+ ep->dentry.file.attr = cpu_to_le16(EXFAT_ATTR_SUBDIR);
} else if (type == TYPE_FILE) {
ep->type = EXFAT_FILE;
- ep->dentry.file.attr = cpu_to_le16(ATTR_ARCHIVE);
+ ep->dentry.file.attr = cpu_to_le16(EXFAT_ATTR_ARCHIVE);
}
}
static void exfat_init_stream_entry(struct exfat_dentry *ep,
- unsigned char flags, unsigned int start_clu,
- unsigned long long size)
+ unsigned int start_clu, unsigned long long size)
{
exfat_set_entry_type(ep, TYPE_STREAM);
- ep->dentry.stream.flags = flags;
+ if (size == 0)
+ ep->dentry.stream.flags = ALLOC_FAT_CHAIN;
+ else
+ ep->dentry.stream.flags = ALLOC_NO_FAT_CHAIN;
ep->dentry.stream.start_clu = cpu_to_le32(start_clu);
ep->dentry.stream.valid_size = cpu_to_le64(size);
ep->dentry.stream.size = cpu_to_le64(size);
@@ -488,9 +490,7 @@ int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir,
if (!ep)
return -EIO;
- exfat_init_stream_entry(ep,
- (type == TYPE_FILE) ? ALLOC_FAT_CHAIN : ALLOC_NO_FAT_CHAIN,
- start_clu, size);
+ exfat_init_stream_entry(ep, start_clu, size);
exfat_update_bh(bh, IS_DIRSYNC(inode));
brelse(bh);
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index f55498e5c23d..a7a2c35d74fb 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -234,6 +234,8 @@ struct exfat_mount_options {
discard:1, /* Issue discard requests on deletions */
keep_last_dots:1; /* Keep trailing periods in paths */
int time_offset; /* Offset of timestamps from UTC (in minutes) */
+ /* Support creating zero-size directory, default: false */
+ bool zero_size_dir;
};
/*
@@ -357,10 +359,10 @@ static inline int exfat_mode_can_hold_ro(struct inode *inode)
static inline mode_t exfat_make_mode(struct exfat_sb_info *sbi,
unsigned short attr, mode_t mode)
{
- if ((attr & ATTR_READONLY) && !(attr & ATTR_SUBDIR))
+ if ((attr & EXFAT_ATTR_READONLY) && !(attr & EXFAT_ATTR_SUBDIR))
mode &= ~0222;
- if (attr & ATTR_SUBDIR)
+ if (attr & EXFAT_ATTR_SUBDIR)
return (mode & ~sbi->options.fs_dmask) | S_IFDIR;
return (mode & ~sbi->options.fs_fmask) | S_IFREG;
@@ -372,18 +374,18 @@ static inline unsigned short exfat_make_attr(struct inode *inode)
unsigned short attr = EXFAT_I(inode)->attr;
if (S_ISDIR(inode->i_mode))
- attr |= ATTR_SUBDIR;
+ attr |= EXFAT_ATTR_SUBDIR;
if (exfat_mode_can_hold_ro(inode) && !(inode->i_mode & 0222))
- attr |= ATTR_READONLY;
+ attr |= EXFAT_ATTR_READONLY;
return attr;
}
static inline void exfat_save_attr(struct inode *inode, unsigned short attr)
{
if (exfat_mode_can_hold_ro(inode))
- EXFAT_I(inode)->attr = attr & (ATTR_RWMASK | ATTR_READONLY);
+ EXFAT_I(inode)->attr = attr & (EXFAT_ATTR_RWMASK | EXFAT_ATTR_READONLY);
else
- EXFAT_I(inode)->attr = attr & ATTR_RWMASK;
+ EXFAT_I(inode)->attr = attr & EXFAT_ATTR_RWMASK;
}
static inline bool exfat_is_last_sector_in_cluster(struct exfat_sb_info *sbi,
@@ -549,6 +551,7 @@ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
u8 tz, __le16 time, __le16 date, u8 time_cs);
void exfat_truncate_atime(struct timespec64 *ts);
+void exfat_truncate_inode_atime(struct inode *inode);
void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
u8 *tz, __le16 *time, __le16 *date, u8 *time_cs);
u16 exfat_calc_chksum16(void *data, int len, u16 chksum, int type);
diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h
index 0ece2e43cf49..971a1ccd0e89 100644
--- a/fs/exfat/exfat_raw.h
+++ b/fs/exfat/exfat_raw.h
@@ -64,15 +64,16 @@
#define CS_DEFAULT 2
/* file attributes */
-#define ATTR_READONLY 0x0001
-#define ATTR_HIDDEN 0x0002
-#define ATTR_SYSTEM 0x0004
-#define ATTR_VOLUME 0x0008
-#define ATTR_SUBDIR 0x0010
-#define ATTR_ARCHIVE 0x0020
-
-#define ATTR_RWMASK (ATTR_HIDDEN | ATTR_SYSTEM | ATTR_VOLUME | \
- ATTR_SUBDIR | ATTR_ARCHIVE)
+#define EXFAT_ATTR_READONLY 0x0001
+#define EXFAT_ATTR_HIDDEN 0x0002
+#define EXFAT_ATTR_SYSTEM 0x0004
+#define EXFAT_ATTR_VOLUME 0x0008
+#define EXFAT_ATTR_SUBDIR 0x0010
+#define EXFAT_ATTR_ARCHIVE 0x0020
+
+#define EXFAT_ATTR_RWMASK (EXFAT_ATTR_HIDDEN | EXFAT_ATTR_SYSTEM | \
+ EXFAT_ATTR_VOLUME | EXFAT_ATTR_SUBDIR | \
+ EXFAT_ATTR_ARCHIVE)
#define BOOTSEC_JUMP_BOOT_LEN 3
#define BOOTSEC_FS_NAME_LEN 8
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 32395ef686a2..bfdfafe00993 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -8,6 +8,9 @@
#include <linux/cred.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
+#include <linux/fsnotify.h>
+#include <linux/security.h>
+#include <linux/msdos_fs.h>
#include "exfat_raw.h"
#include "exfat_fs.h"
@@ -22,7 +25,7 @@ static int exfat_cont_expand(struct inode *inode, loff_t size)
if (err)
return err;
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
if (!IS_SYNC(inode))
@@ -144,7 +147,7 @@ int __exfat_truncate(struct inode *inode)
}
if (ei->type == TYPE_FILE)
- ei->attr |= ATTR_ARCHIVE;
+ ei->attr |= EXFAT_ATTR_ARCHIVE;
/*
* update the directory entry
@@ -290,10 +293,10 @@ int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
if (attr->ia_valid & ATTR_SIZE)
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
setattr_copy(&nop_mnt_idmap, inode, attr);
- exfat_truncate_atime(&inode->i_atime);
+ exfat_truncate_inode_atime(inode);
if (attr->ia_valid & ATTR_SIZE) {
error = exfat_block_truncate_page(inode, attr->ia_size);
@@ -316,6 +319,93 @@ out:
return error;
}
+/*
+ * modified ioctls from fat/file.c by Werner Almesberger
+ */
+static int exfat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
+{
+ u32 attr;
+
+ inode_lock_shared(inode);
+ attr = exfat_make_attr(inode);
+ inode_unlock_shared(inode);
+
+ return put_user(attr, user_attr);
+}
+
+static int exfat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
+{
+ struct inode *inode = file_inode(file);
+ struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb);
+ int is_dir = S_ISDIR(inode->i_mode);
+ u32 attr, oldattr;
+ struct iattr ia;
+ int err;
+
+ err = get_user(attr, user_attr);
+ if (err)
+ goto out;
+
+ err = mnt_want_write_file(file);
+ if (err)
+ goto out;
+ inode_lock(inode);
+
+ oldattr = exfat_make_attr(inode);
+
+ /*
+ * Mask attributes so we don't set reserved fields.
+ */
+ attr &= (EXFAT_ATTR_READONLY | EXFAT_ATTR_HIDDEN | EXFAT_ATTR_SYSTEM |
+ EXFAT_ATTR_ARCHIVE);
+ attr |= (is_dir ? EXFAT_ATTR_SUBDIR : 0);
+
+ /* Equivalent to a chmod() */
+ ia.ia_valid = ATTR_MODE | ATTR_CTIME;
+ ia.ia_ctime = current_time(inode);
+ if (is_dir)
+ ia.ia_mode = exfat_make_mode(sbi, attr, 0777);
+ else
+ ia.ia_mode = exfat_make_mode(sbi, attr, 0666 | (inode->i_mode & 0111));
+
+ /* The root directory has no attributes */
+ if (inode->i_ino == EXFAT_ROOT_INO && attr != EXFAT_ATTR_SUBDIR) {
+ err = -EINVAL;
+ goto out_unlock_inode;
+ }
+
+ if (((attr | oldattr) & EXFAT_ATTR_SYSTEM) &&
+ !capable(CAP_LINUX_IMMUTABLE)) {
+ err = -EPERM;
+ goto out_unlock_inode;
+ }
+
+ /*
+ * The security check is questionable... We single
+ * out the RO attribute for checking by the security
+ * module, just because it maps to a file mode.
+ */
+ err = security_inode_setattr(file_mnt_idmap(file),
+ file->f_path.dentry, &ia);
+ if (err)
+ goto out_unlock_inode;
+
+ /* This MUST be done before doing anything irreversible... */
+ err = exfat_setattr(file_mnt_idmap(file), file->f_path.dentry, &ia);
+ if (err)
+ goto out_unlock_inode;
+
+ fsnotify_change(file->f_path.dentry, ia.ia_valid);
+
+ exfat_save_attr(inode, attr);
+ mark_inode_dirty(inode);
+out_unlock_inode:
+ inode_unlock(inode);
+ mnt_drop_write_file(file);
+out:
+ return err;
+}
+
static int exfat_ioctl_fitrim(struct inode *inode, unsigned long arg)
{
struct fstrim_range range;
@@ -346,8 +436,13 @@ static int exfat_ioctl_fitrim(struct inode *inode, unsigned long arg)
long exfat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
+ u32 __user *user_attr = (u32 __user *)arg;
switch (cmd) {
+ case FAT_IOCTL_GET_ATTRIBUTES:
+ return exfat_ioctl_get_attributes(inode, user_attr);
+ case FAT_IOCTL_SET_ATTRIBUTES:
+ return exfat_ioctl_set_attributes(filp, user_attr);
case FITRIM:
return exfat_ioctl_fitrim(inode, arg);
default:
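
The ioctls added to exfat above reuse the FAT attribute ioctl numbers from <linux/msdos_fs.h>, so existing FAT tooling keeps working. A small userspace sketch of reading and setting the archive bit through them; the file path is a placeholder:

	/* userspace example */
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/msdos_fs.h>	/* FAT_IOCTL_{GET,SET}_ATTRIBUTES, ATTR_ARCH */

	int main(void)
	{
		__u32 attr;
		int fd = open("/mnt/exfat/file.txt", O_RDONLY);	/* placeholder path */

		if (fd < 0 || ioctl(fd, FAT_IOCTL_GET_ATTRIBUTES, &attr) < 0)
			return 1;
		printf("attributes: %#x\n", attr);

		attr |= ATTR_ARCH;	/* set the archive bit */
		if (ioctl(fd, FAT_IOCTL_SET_ATTRIBUTES, &attr) < 0)
			return 1;
		close(fd);
		return 0;
	}
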
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 13329baeafbc..e7ff58b8e68c 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -26,6 +26,7 @@ int __exfat_write_inode(struct inode *inode, int sync)
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
bool is_dir = (ei->type == TYPE_DIR) ? true : false;
+ struct timespec64 ts;
if (inode->i_ino == EXFAT_ROOT_INO)
return 0;
@@ -55,16 +56,18 @@ int __exfat_write_inode(struct inode *inode, int sync)
&ep->dentry.file.create_time,
&ep->dentry.file.create_date,
&ep->dentry.file.create_time_cs);
- exfat_set_entry_time(sbi, &inode->i_mtime,
- &ep->dentry.file.modify_tz,
- &ep->dentry.file.modify_time,
- &ep->dentry.file.modify_date,
- &ep->dentry.file.modify_time_cs);
- exfat_set_entry_time(sbi, &inode->i_atime,
- &ep->dentry.file.access_tz,
- &ep->dentry.file.access_time,
- &ep->dentry.file.access_date,
- NULL);
+ ts = inode_get_mtime(inode);
+ exfat_set_entry_time(sbi, &ts,
+ &ep->dentry.file.modify_tz,
+ &ep->dentry.file.modify_time,
+ &ep->dentry.file.modify_date,
+ &ep->dentry.file.modify_time_cs);
+ ts = inode_get_atime(inode);
+ exfat_set_entry_time(sbi, &ts,
+ &ep->dentry.file.access_tz,
+ &ep->dentry.file.access_time,
+ &ep->dentry.file.access_date,
+ NULL);
/* File size should be zero if there is no cluster allocated */
on_disk_size = i_size_read(inode);
@@ -355,7 +358,7 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to)
if (to > i_size_read(inode)) {
truncate_pagecache(inode, i_size_read(inode));
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
exfat_truncate(inode);
}
}
@@ -397,9 +400,9 @@ static int exfat_write_end(struct file *file, struct address_space *mapping,
if (err < len)
exfat_write_failed(mapping, pos+len);
- if (!(err < 0) && !(ei->attr & ATTR_ARCHIVE)) {
- inode->i_mtime = inode_set_ctime_current(inode);
- ei->attr |= ATTR_ARCHIVE;
+ if (!(err < 0) && !(ei->attr & EXFAT_ATTR_ARCHIVE)) {
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ ei->attr |= EXFAT_ATTR_ARCHIVE;
mark_inode_dirty(inode);
}
@@ -547,7 +550,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
inode_inc_iversion(inode);
inode->i_generation = get_random_u32();
- if (info->attr & ATTR_SUBDIR) { /* directory */
+ if (info->attr & EXFAT_ATTR_SUBDIR) { /* directory */
inode->i_generation &= ~1;
inode->i_mode = exfat_make_mode(sbi, info->attr, 0777);
inode->i_op = &exfat_dir_inode_operations;
@@ -576,10 +579,10 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
exfat_save_attr(inode, info->attr);
inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9;
- inode->i_mtime = info->mtime;
+ inode_set_mtime_to_ts(inode, info->mtime);
inode_set_ctime_to_ts(inode, info->mtime);
ei->i_crtime = info->crtime;
- inode->i_atime = info->atime;
+ inode_set_atime_to_ts(inode, info->atime);
return 0;
}
diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c
index 2e1a1a6b1021..fa8459828046 100644
--- a/fs/exfat/misc.c
+++ b/fs/exfat/misc.c
@@ -126,6 +126,14 @@ void exfat_truncate_atime(struct timespec64 *ts)
ts->tv_nsec = 0;
}
+void exfat_truncate_inode_atime(struct inode *inode)
+{
+ struct timespec64 atime = inode_get_atime(inode);
+
+ exfat_truncate_atime(&atime);
+ inode_set_atime_to_ts(inode, atime);
+}
+
u16 exfat_calc_chksum16(void *data, int len, u16 chksum, int type)
{
int i;
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 1b9f587f6cca..5d737e0b639a 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -351,14 +351,20 @@ static int exfat_find_empty_entry(struct inode *inode,
if (exfat_check_max_dentries(inode))
return -ENOSPC;
- /* we trust p_dir->size regardless of FAT type */
- if (exfat_find_last_cluster(sb, p_dir, &last_clu))
- return -EIO;
-
/*
* Allocate new cluster to this directory
*/
- exfat_chain_set(&clu, last_clu + 1, 0, p_dir->flags);
+ if (ei->start_clu != EXFAT_EOF_CLUSTER) {
+ /* we trust p_dir->size regardless of FAT type */
+ if (exfat_find_last_cluster(sb, p_dir, &last_clu))
+ return -EIO;
+
+ exfat_chain_set(&clu, last_clu + 1, 0, p_dir->flags);
+ } else {
+ /* This directory is empty */
+ exfat_chain_set(&clu, EXFAT_EOF_CLUSTER, 0,
+ ALLOC_NO_FAT_CHAIN);
+ }
/* allocate a cluster */
ret = exfat_alloc_cluster(inode, 1, &clu, IS_DIRSYNC(inode));
@@ -368,6 +374,11 @@ static int exfat_find_empty_entry(struct inode *inode,
if (exfat_zeroed_cluster(inode, clu.dir))
return -EIO;
+ if (ei->start_clu == EXFAT_EOF_CLUSTER) {
+ ei->start_clu = clu.dir;
+ p_dir->dir = clu.dir;
+ }
+
/* append to the FAT chain */
if (clu.flags != p_dir->flags) {
/* no-fat-chain bit is disabled,
@@ -507,7 +518,7 @@ static int exfat_add_entry(struct inode *inode, const char *path,
goto out;
}
- if (type == TYPE_DIR) {
+ if (type == TYPE_DIR && !sbi->options.zero_size_dir) {
ret = exfat_alloc_new_dir(inode, &clu);
if (ret)
goto out;
@@ -534,13 +545,16 @@ static int exfat_add_entry(struct inode *inode, const char *path,
info->type = type;
if (type == TYPE_FILE) {
- info->attr = ATTR_ARCHIVE;
+ info->attr = EXFAT_ATTR_ARCHIVE;
info->start_clu = EXFAT_EOF_CLUSTER;
info->size = 0;
info->num_subdirs = 0;
} else {
- info->attr = ATTR_SUBDIR;
- info->start_clu = start_clu;
+ info->attr = EXFAT_ATTR_SUBDIR;
+ if (sbi->options.zero_size_dir)
+ info->start_clu = EXFAT_EOF_CLUSTER;
+ else
+ info->start_clu = start_clu;
info->size = clu_size;
info->num_subdirs = EXFAT_MIN_SUBDIR;
}
@@ -569,7 +583,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir,
goto unlock;
inode_inc_iversion(dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
else
@@ -582,8 +596,9 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir,
goto unlock;
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_atime = EXFAT_I(inode)->i_crtime = inode_set_ctime_current(inode);
- exfat_truncate_atime(&inode->i_atime);
+ EXFAT_I(inode)->i_crtime = simple_inode_init_ts(inode);
+ exfat_truncate_inode_atime(inode);
+
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
d_instantiate(dentry, inode);
@@ -645,7 +660,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
info->type = exfat_get_entry_type(ep);
info->attr = le16_to_cpu(ep->dentry.file.attr);
info->size = le64_to_cpu(ep2->dentry.stream.valid_size);
- if ((info->type == TYPE_FILE) && (info->size == 0)) {
+ if (info->size == 0) {
info->flags = ALLOC_NO_FAT_CHAIN;
info->start_clu = EXFAT_EOF_CLUSTER;
} else {
@@ -816,16 +831,16 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
ei->dir.dir = DIR_DELETED;
inode_inc_iversion(dir);
- dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir);
- exfat_truncate_atime(&dir->i_atime);
+ simple_inode_init_ts(dir);
+ exfat_truncate_inode_atime(dir);
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
else
mark_inode_dirty(dir);
clear_nlink(inode);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
- exfat_truncate_atime(&inode->i_atime);
+ simple_inode_init_ts(inode);
+ exfat_truncate_inode_atime(inode);
exfat_unhash_inode(inode);
exfat_d_version_set(dentry, inode_query_iversion(dir));
unlock:
@@ -851,7 +866,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
goto unlock;
inode_inc_iversion(dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
else
@@ -865,8 +880,8 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
goto unlock;
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_atime = EXFAT_I(inode)->i_crtime = inode_set_ctime_current(inode);
- exfat_truncate_atime(&inode->i_atime);
+ EXFAT_I(inode)->i_crtime = simple_inode_init_ts(inode);
+ exfat_truncate_inode_atime(inode);
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
d_instantiate(dentry, inode);
@@ -888,6 +903,9 @@ static int exfat_check_dir_empty(struct super_block *sb,
dentries_per_clu = sbi->dentries_per_clu;
+ if (p_dir->dir == EXFAT_EOF_CLUSTER)
+ return 0;
+
exfat_chain_dup(&clu, p_dir);
while (clu.dir != EXFAT_EOF_CLUSTER) {
@@ -977,8 +995,8 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
ei->dir.dir = DIR_DELETED;
inode_inc_iversion(dir);
- dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir);
- exfat_truncate_atime(&dir->i_atime);
+ simple_inode_init_ts(dir);
+ exfat_truncate_inode_atime(dir);
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
else
@@ -986,8 +1004,8 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
drop_nlink(dir);
clear_nlink(inode);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
- exfat_truncate_atime(&inode->i_atime);
+ simple_inode_init_ts(inode);
+ exfat_truncate_inode_atime(inode);
exfat_unhash_inode(inode);
exfat_d_version_set(dentry, inode_query_iversion(dir));
unlock:
@@ -1032,8 +1050,8 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
*epnew = *epold;
if (exfat_get_entry_type(epnew) == TYPE_FILE) {
- epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
- ei->attr |= ATTR_ARCHIVE;
+ epnew->dentry.file.attr |= cpu_to_le16(EXFAT_ATTR_ARCHIVE);
+ ei->attr |= EXFAT_ATTR_ARCHIVE;
}
exfat_update_bh(new_bh, sync);
brelse(old_bh);
@@ -1064,8 +1082,8 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
ei->entry = newentry;
} else {
if (exfat_get_entry_type(epold) == TYPE_FILE) {
- epold->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
- ei->attr |= ATTR_ARCHIVE;
+ epold->dentry.file.attr |= cpu_to_le16(EXFAT_ATTR_ARCHIVE);
+ ei->attr |= EXFAT_ATTR_ARCHIVE;
}
exfat_update_bh(old_bh, sync);
brelse(old_bh);
@@ -1113,8 +1131,8 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
*epnew = *epmov;
if (exfat_get_entry_type(epnew) == TYPE_FILE) {
- epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE);
- ei->attr |= ATTR_ARCHIVE;
+ epnew->dentry.file.attr |= cpu_to_le16(EXFAT_ATTR_ARCHIVE);
+ ei->attr |= EXFAT_ATTR_ARCHIVE;
}
exfat_update_bh(new_bh, IS_DIRSYNC(inode));
brelse(mov_bh);
@@ -1255,7 +1273,8 @@ static int __exfat_rename(struct inode *old_parent_inode,
}
/* Free the clusters if new_inode is a dir(as if exfat_rmdir) */
- if (new_entry_type == TYPE_DIR) {
+ if (new_entry_type == TYPE_DIR &&
+ new_ei->start_clu != EXFAT_EOF_CLUSTER) {
/* new_ei, new_clu_to_free */
struct exfat_chain new_clu_to_free;
@@ -1312,7 +1331,7 @@ static int exfat_rename(struct mnt_idmap *idmap,
inode_inc_iversion(new_dir);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
EXFAT_I(new_dir)->i_crtime = current_time(new_dir);
- exfat_truncate_atime(&new_dir->i_atime);
+ exfat_truncate_inode_atime(new_dir);
if (IS_DIRSYNC(new_dir))
exfat_sync_inode(new_dir);
else
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 2778bd9b631e..d9d4fa91010b 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -165,6 +165,8 @@ static int exfat_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",sys_tz");
else if (opts->time_offset)
seq_printf(m, ",time_offset=%d", opts->time_offset);
+ if (opts->zero_size_dir)
+ seq_puts(m, ",zero_size_dir");
return 0;
}
@@ -209,6 +211,7 @@ enum {
Opt_keep_last_dots,
Opt_sys_tz,
Opt_time_offset,
+ Opt_zero_size_dir,
/* Deprecated options */
Opt_utf8,
@@ -237,6 +240,7 @@ static const struct fs_parameter_spec exfat_parameters[] = {
fsparam_flag("keep_last_dots", Opt_keep_last_dots),
fsparam_flag("sys_tz", Opt_sys_tz),
fsparam_s32("time_offset", Opt_time_offset),
+ fsparam_flag("zero_size_dir", Opt_zero_size_dir),
__fsparam(NULL, "utf8", Opt_utf8, fs_param_deprecated,
NULL),
__fsparam(NULL, "debug", Opt_debug, fs_param_deprecated,
@@ -305,6 +309,9 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param)
return -EINVAL;
opts->time_offset = result.int_32;
break;
+ case Opt_zero_size_dir:
+ opts->zero_size_dir = true;
+ break;
case Opt_utf8:
case Opt_debug:
case Opt_namecase:
@@ -360,7 +367,7 @@ static int exfat_read_root(struct inode *inode)
inode->i_gid = sbi->options.fs_gid;
inode_inc_iversion(inode);
inode->i_generation = 0;
- inode->i_mode = exfat_make_mode(sbi, ATTR_SUBDIR, 0777);
+ inode->i_mode = exfat_make_mode(sbi, EXFAT_ATTR_SUBDIR, 0777);
inode->i_op = &exfat_dir_inode_operations;
inode->i_fop = &exfat_dir_operations;
@@ -369,9 +376,9 @@ static int exfat_read_root(struct inode *inode)
ei->i_size_aligned = i_size_read(inode);
ei->i_size_ondisk = i_size_read(inode);
- exfat_save_attr(inode, ATTR_SUBDIR);
- inode->i_mtime = inode->i_atime = ei->i_crtime = inode_set_ctime_current(inode);
- exfat_truncate_atime(&inode->i_atime);
+ exfat_save_attr(inode, EXFAT_ATTR_SUBDIR);
+ ei->i_crtime = simple_inode_init_ts(inode);
+ exfat_truncate_inode_atime(inode);
return 0;
}
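
With the new flag wired into exfat_parameters above, zero-size directories can be enabled per mount. A hedged userspace sketch using mount(2); the device and mount point are placeholders:

	/* userspace example */
	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* "zero_size_dir" in the data string enables the new behaviour */
		if (mount("/dev/sdb1", "/mnt/exfat", "exfat", 0, "zero_size_dir") < 0) {
			perror("mount");
			return 1;
		}
		return 0;
	}
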
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index c20704aa21b3..3ae0154c5680 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -342,43 +342,30 @@ out:
return error;
}
+#define FILEID_INO64_GEN_LEN 3
+
/**
- * export_encode_fh - default export_operations->encode_fh function
+ * exportfs_encode_ino64_fid - encode non-decodeable 64bit ino file id
* @inode: the object to encode
* @fid: where to store the file handle fragment
- * @max_len: maximum length to store there
- * @parent: parent directory inode, if wanted
+ * @max_len: maximum length to store there (in 4 byte units)
*
- * This default encode_fh function assumes that the 32 inode number
- * is suitable for locating an inode, and that the generation number
- * can be used to check that it is still valid. It places them in the
- * filehandle fragment where export_decode_fh expects to find them.
+ * This generic function is used to encode a non-decodeable file id for
+ * fanotify for filesystems that do not support NFS export.
*/
-static int export_encode_fh(struct inode *inode, struct fid *fid,
- int *max_len, struct inode *parent)
+static int exportfs_encode_ino64_fid(struct inode *inode, struct fid *fid,
+ int *max_len)
{
- int len = *max_len;
- int type = FILEID_INO32_GEN;
-
- if (parent && (len < 4)) {
- *max_len = 4;
- return FILEID_INVALID;
- } else if (len < 2) {
- *max_len = 2;
+ if (*max_len < FILEID_INO64_GEN_LEN) {
+ *max_len = FILEID_INO64_GEN_LEN;
return FILEID_INVALID;
}
- len = 2;
- fid->i32.ino = inode->i_ino;
- fid->i32.gen = inode->i_generation;
- if (parent) {
- fid->i32.parent_ino = parent->i_ino;
- fid->i32.parent_gen = parent->i_generation;
- len = 4;
- type = FILEID_INO32_GEN_PARENT;
- }
- *max_len = len;
- return type;
+ fid->i64.ino = inode->i_ino;
+ fid->i64.gen = inode->i_generation;
+ *max_len = FILEID_INO64_GEN_LEN;
+
+ return FILEID_INO64_GEN;
}
/**
@@ -396,17 +383,13 @@ int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
{
const struct export_operations *nop = inode->i_sb->s_export_op;
- /*
- * If a decodeable file handle was requested, we need to make sure that
- * filesystem can decode file handles.
- */
- if (nop && !(flags & EXPORT_FH_FID) && !nop->fh_to_dentry)
+ if (!exportfs_can_encode_fh(nop, flags))
return -EOPNOTSUPP;
- if (nop && nop->encode_fh)
- return nop->encode_fh(inode, fid->raw, max_len, parent);
+ if (!nop && (flags & EXPORT_FH_FID))
+ return exportfs_encode_ino64_fid(inode, fid, max_len);
- return export_encode_fh(inode, fid, max_len, parent);
+ return nop->encode_fh(inode, fid->raw, max_len, parent);
}
EXPORT_SYMBOL_GPL(exportfs_encode_inode_fh);
@@ -456,7 +439,7 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
/*
* Try to get any dentry for the given file handle from the filesystem.
*/
- if (!nop || !nop->fh_to_dentry)
+ if (!exportfs_can_decode_fh(nop))
return ERR_PTR(-ESTALE);
result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
if (IS_ERR_OR_NULL(result))
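
The expfs.c rewrite above replaces the open-coded NULL checks with exportfs_can_encode_fh()/exportfs_can_decode_fh() helpers. A sketch of the semantics those helpers are expected to capture, reconstructed from the surrounding logic rather than copied from the header:

	#include <linux/exportfs.h>

	/* illustrative only: approximates the checks implied by the hunks above */
	static bool example_can_decode_fh(const struct export_operations *nop)
	{
		/* decoding needs a filesystem-provided fh_to_dentry() */
		return nop && nop->fh_to_dentry;
	}

	static bool example_can_encode_fh(const struct export_operations *nop, int flags)
	{
		/*
		 * Non-decodeable fids (EXPORT_FH_FID) may fall back to the generic
		 * 64-bit ino encoder when the fs has no export_operations at all;
		 * otherwise the fs must supply encode_fh itself.
		 */
		if (flags & EXPORT_FH_FID)
			return !nop || nop->encode_fh;

		/* decodeable handles additionally require decode support */
		return example_can_decode_fh(nop);
	}
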
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index b335f17f682f..4fb155b5a958 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -81,34 +81,34 @@ ext2_last_byte(struct inode *inode, unsigned long page_nr)
return last_byte;
}
-static void ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
+static void ext2_commit_chunk(struct folio *folio, loff_t pos, unsigned len)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
struct inode *dir = mapping->host;
inode_inc_iversion(dir);
- block_write_end(NULL, mapping, pos, len, len, page, NULL);
+ block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL);
if (pos+len > dir->i_size) {
i_size_write(dir, pos+len);
mark_inode_dirty(dir);
}
- unlock_page(page);
+ folio_unlock(folio);
}
-static bool ext2_check_page(struct page *page, int quiet, char *kaddr)
+static bool ext2_check_folio(struct folio *folio, int quiet, char *kaddr)
{
- struct inode *dir = page->mapping->host;
+ struct inode *dir = folio->mapping->host;
struct super_block *sb = dir->i_sb;
unsigned chunk_size = ext2_chunk_size(dir);
u32 max_inumber = le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count);
unsigned offs, rec_len;
- unsigned limit = PAGE_SIZE;
+ unsigned limit = folio_size(folio);
ext2_dirent *p;
char *error;
- if ((dir->i_size >> PAGE_SHIFT) == page->index) {
- limit = dir->i_size & ~PAGE_MASK;
+ if (dir->i_size < folio_pos(folio) + limit) {
+ limit = offset_in_folio(folio, dir->i_size);
if (limit & (chunk_size - 1))
goto Ebadsize;
if (!limit)
@@ -132,7 +132,7 @@ static bool ext2_check_page(struct page *page, int quiet, char *kaddr)
if (offs != limit)
goto Eend;
out:
- SetPageChecked(page);
+ folio_set_checked(folio);
return true;
/* Too bad, we had an error */
@@ -160,51 +160,52 @@ Einumber:
bad_entry:
if (!quiet)
ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - "
- "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
- dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
+ "offset=%llu, inode=%lu, rec_len=%d, name_len=%d",
+ dir->i_ino, error, folio_pos(folio) + offs,
(unsigned long) le32_to_cpu(p->inode),
rec_len, p->name_len);
goto fail;
Eend:
if (!quiet) {
p = (ext2_dirent *)(kaddr + offs);
- ext2_error(sb, "ext2_check_page",
+ ext2_error(sb, "ext2_check_folio",
"entry in directory #%lu spans the page boundary"
- "offset=%lu, inode=%lu",
- dir->i_ino, (page->index<<PAGE_SHIFT)+offs,
+ "offset=%llu, inode=%lu",
+ dir->i_ino, folio_pos(folio) + offs,
(unsigned long) le32_to_cpu(p->inode));
}
fail:
- SetPageError(page);
+ folio_set_error(folio);
return false;
}
/*
- * Calls to ext2_get_page()/ext2_put_page() must be nested according to the
- * rules documented in kmap_local_page()/kunmap_local().
+ * Calls to ext2_get_folio()/folio_release_kmap() must be nested according
+ * to the rules documented in kmap_local_folio()/kunmap_local().
*
- * NOTE: ext2_find_entry() and ext2_dotdot() act as a call to ext2_get_page()
- * and should be treated as a call to ext2_get_page() for nesting purposes.
+ * NOTE: ext2_find_entry() and ext2_dotdot() act as a call
+ * to folio_release_kmap() and should be treated as a call to
+ * folio_release_kmap() for nesting purposes.
*/
-static void *ext2_get_page(struct inode *dir, unsigned long n,
- int quiet, struct page **page)
+static void *ext2_get_folio(struct inode *dir, unsigned long n,
+ int quiet, struct folio **foliop)
{
struct address_space *mapping = dir->i_mapping;
struct folio *folio = read_mapping_folio(mapping, n, NULL);
- void *page_addr;
+ void *kaddr;
if (IS_ERR(folio))
return ERR_CAST(folio);
- page_addr = kmap_local_folio(folio, n & (folio_nr_pages(folio) - 1));
+ kaddr = kmap_local_folio(folio, 0);
if (unlikely(!folio_test_checked(folio))) {
- if (!ext2_check_page(&folio->page, quiet, page_addr))
+ if (!ext2_check_folio(folio, quiet, kaddr))
goto fail;
}
- *page = &folio->page;
- return page_addr;
+ *foliop = folio;
+ return kaddr;
fail:
- ext2_put_page(&folio->page, page_addr);
+ folio_release_kmap(folio, kaddr);
return ERR_PTR(-EIO);
}
@@ -274,8 +275,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
for ( ; n < npages; n++, offset = 0) {
ext2_dirent *de;
- struct page *page;
- char *kaddr = ext2_get_page(inode, n, 0, &page);
+ struct folio *folio;
+ char *kaddr = ext2_get_folio(inode, n, 0, &folio);
char *limit;
if (IS_ERR(kaddr)) {
@@ -299,7 +300,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
if (de->rec_len == 0) {
ext2_error(sb, __func__,
"zero-length directory entry");
- ext2_put_page(page, de);
+ folio_release_kmap(folio, de);
return -EIO;
}
if (de->inode) {
@@ -311,13 +312,13 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit(ctx, de->name, de->name_len,
le32_to_cpu(de->inode),
d_type)) {
- ext2_put_page(page, de);
+ folio_release_kmap(folio, de);
return 0;
}
}
ctx->pos += ext2_rec_len_from_disk(de->rec_len);
}
- ext2_put_page(page, kaddr);
+ folio_release_kmap(folio, kaddr);
}
return 0;
}
@@ -330,38 +331,35 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
* and the entry itself. Page is returned mapped and unlocked.
* Entry is guaranteed to be valid.
*
- * On Success ext2_put_page() should be called on *res_page.
+ * On Success folio_release_kmap() should be called on *foliop.
*
- * NOTE: Calls to ext2_get_page()/ext2_put_page() must be nested according to
- * the rules documented in kmap_local_page()/kunmap_local().
+ * NOTE: Calls to ext2_get_folio()/folio_release_kmap() must be nested
+ * according to the rules documented in kmap_local_folio()/kunmap_local().
*
- * ext2_find_entry() and ext2_dotdot() act as a call to ext2_get_page() and
- * should be treated as a call to ext2_get_page() for nesting purposes.
+ * ext2_find_entry() and ext2_dotdot() act as a call to ext2_get_folio()
+ * and should be treated as a call to ext2_get_folio() for nesting
+ * purposes.
*/
struct ext2_dir_entry_2 *ext2_find_entry (struct inode *dir,
- const struct qstr *child, struct page **res_page)
+ const struct qstr *child, struct folio **foliop)
{
const char *name = child->name;
int namelen = child->len;
unsigned reclen = EXT2_DIR_REC_LEN(namelen);
unsigned long start, n;
unsigned long npages = dir_pages(dir);
- struct page *page = NULL;
struct ext2_inode_info *ei = EXT2_I(dir);
ext2_dirent * de;
if (npages == 0)
goto out;
- /* OFFSET_CACHE */
- *res_page = NULL;
-
start = ei->i_dir_start_lookup;
if (start >= npages)
start = 0;
n = start;
do {
- char *kaddr = ext2_get_page(dir, n, 0, &page);
+ char *kaddr = ext2_get_folio(dir, n, 0, foliop);
if (IS_ERR(kaddr))
return ERR_CAST(kaddr);
@@ -371,18 +369,18 @@ struct ext2_dir_entry_2 *ext2_find_entry (struct inode *dir,
if (de->rec_len == 0) {
ext2_error(dir->i_sb, __func__,
"zero-length directory entry");
- ext2_put_page(page, de);
+ folio_release_kmap(*foliop, de);
goto out;
}
if (ext2_match(namelen, name, de))
goto found;
de = ext2_next_entry(de);
}
- ext2_put_page(page, kaddr);
+ folio_release_kmap(*foliop, kaddr);
if (++n >= npages)
n = 0;
- /* next page is past the blocks we've got */
+ /* next folio is past the blocks we've got */
if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) {
ext2_error(dir->i_sb, __func__,
"dir %lu size %lld exceeds block count %llu",
@@ -395,7 +393,6 @@ out:
return ERR_PTR(-ENOENT);
found:
- *res_page = page;
ei->i_dir_start_lookup = n;
return de;
}
@@ -404,17 +401,18 @@ found:
* Return the '..' directory entry and the page in which the entry was found
* (as a parameter - p).
*
- * On Success ext2_put_page() should be called on *p.
+ * On Success folio_release_kmap() should be called on *foliop.
*
- * NOTE: Calls to ext2_get_page()/ext2_put_page() must be nested according to
- * the rules documented in kmap_local_page()/kunmap_local().
+ * NOTE: Calls to ext2_get_folio()/folio_release_kmap() must be nested
+ * according to the rules documented in kmap_local_folio()/kunmap_local().
*
- * ext2_find_entry() and ext2_dotdot() act as a call to ext2_get_page() and
- * should be treated as a call to ext2_get_page() for nesting purposes.
+ * ext2_find_entry() and ext2_dotdot() act as a call to ext2_get_folio()
+ * and should be treated as a call to ext2_get_folio() for nesting
+ * purposes.
*/
-struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p)
+struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct folio **foliop)
{
- ext2_dirent *de = ext2_get_page(dir, 0, 0, p);
+ ext2_dirent *de = ext2_get_folio(dir, 0, 0, foliop);
if (!IS_ERR(de))
return ext2_next_entry(de);
@@ -424,23 +422,22 @@ struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p)
int ext2_inode_by_name(struct inode *dir, const struct qstr *child, ino_t *ino)
{
struct ext2_dir_entry_2 *de;
- struct page *page;
-
- de = ext2_find_entry(dir, child, &page);
+ struct folio *folio;
+
+ de = ext2_find_entry(dir, child, &folio);
if (IS_ERR(de))
return PTR_ERR(de);
*ino = le32_to_cpu(de->inode);
- ext2_put_page(page, de);
+ folio_release_kmap(folio, de);
return 0;
}
-static int ext2_prepare_chunk(struct page *page, loff_t pos, unsigned len)
+static int ext2_prepare_chunk(struct folio *folio, loff_t pos, unsigned len)
{
- return __block_write_begin(page, pos, len, ext2_get_block);
+ return __block_write_begin(&folio->page, pos, len, ext2_get_block);
}
-
static int ext2_handle_dirsync(struct inode *dir)
{
int err;
@@ -452,23 +449,23 @@ static int ext2_handle_dirsync(struct inode *dir)
}
int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
- struct page *page, struct inode *inode, bool update_times)
+ struct folio *folio, struct inode *inode, bool update_times)
{
- loff_t pos = page_offset(page) + offset_in_page(de);
+ loff_t pos = folio_pos(folio) + offset_in_folio(folio, de);
unsigned len = ext2_rec_len_from_disk(de->rec_len);
int err;
- lock_page(page);
- err = ext2_prepare_chunk(page, pos, len);
+ folio_lock(folio);
+ err = ext2_prepare_chunk(folio, pos, len);
if (err) {
- unlock_page(page);
+ folio_unlock(folio);
return err;
}
de->inode = cpu_to_le32(inode->i_ino);
ext2_set_de_type(de, inode);
- ext2_commit_chunk(page, pos, len);
+ ext2_commit_chunk(folio, pos, len);
if (update_times)
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(dir);
return ext2_handle_dirsync(dir);
@@ -485,7 +482,7 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
unsigned chunk_size = ext2_chunk_size(dir);
unsigned reclen = EXT2_DIR_REC_LEN(namelen);
unsigned short rec_len, name_len;
- struct page *page = NULL;
+ struct folio *folio = NULL;
ext2_dirent * de;
unsigned long npages = dir_pages(dir);
unsigned long n;
@@ -494,19 +491,19 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
/*
* We take care of directory expansion in the same loop.
- * This code plays outside i_size, so it locks the page
+ * This code plays outside i_size, so it locks the folio
* to protect that region.
*/
for (n = 0; n <= npages; n++) {
- char *kaddr = ext2_get_page(dir, n, 0, &page);
+ char *kaddr = ext2_get_folio(dir, n, 0, &folio);
char *dir_end;
if (IS_ERR(kaddr))
return PTR_ERR(kaddr);
- lock_page(page);
+ folio_lock(folio);
dir_end = kaddr + ext2_last_byte(dir, n);
de = (ext2_dirent *)kaddr;
- kaddr += PAGE_SIZE - reclen;
+ kaddr += folio_size(folio) - reclen;
while ((char *)de <= kaddr) {
if ((char *)de == dir_end) {
/* We hit i_size */
@@ -533,15 +530,15 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
goto got_it;
de = (ext2_dirent *) ((char *) de + rec_len);
}
- unlock_page(page);
- ext2_put_page(page, kaddr);
+ folio_unlock(folio);
+ folio_release_kmap(folio, kaddr);
}
BUG();
return -EINVAL;
got_it:
- pos = page_offset(page) + offset_in_page(de);
- err = ext2_prepare_chunk(page, pos, rec_len);
+ pos = folio_pos(folio) + offset_in_folio(folio, de);
+ err = ext2_prepare_chunk(folio, pos, rec_len);
if (err)
goto out_unlock;
if (de->inode) {
@@ -554,17 +551,17 @@ got_it:
memcpy(de->name, name, namelen);
de->inode = cpu_to_le32(inode->i_ino);
ext2_set_de_type (de, inode);
- ext2_commit_chunk(page, pos, rec_len);
- dir->i_mtime = inode_set_ctime_current(dir);
+ ext2_commit_chunk(folio, pos, rec_len);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(dir);
err = ext2_handle_dirsync(dir);
/* OFFSET_CACHE */
out_put:
- ext2_put_page(page, de);
+ folio_release_kmap(folio, de);
return err;
out_unlock:
- unlock_page(page);
+ folio_unlock(folio);
goto out_put;
}
@@ -572,18 +569,21 @@ out_unlock:
* ext2_delete_entry deletes a directory entry by merging it with the
* previous entry. Page is up-to-date.
*/
-int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page)
+int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct folio *folio)
{
- struct inode *inode = page->mapping->host;
- char *kaddr = (char *)((unsigned long)dir & PAGE_MASK);
- unsigned from = offset_in_page(dir) & ~(ext2_chunk_size(inode)-1);
- unsigned to = offset_in_page(dir) +
- ext2_rec_len_from_disk(dir->rec_len);
+ struct inode *inode = folio->mapping->host;
+ size_t from, to;
+ char *kaddr;
loff_t pos;
- ext2_dirent *pde = NULL;
- ext2_dirent *de = (ext2_dirent *)(kaddr + from);
+ ext2_dirent *de, *pde = NULL;
int err;
+ from = offset_in_folio(folio, dir);
+ to = from + ext2_rec_len_from_disk(dir->rec_len);
+ kaddr = (char *)dir - from;
+ from &= ~(ext2_chunk_size(inode)-1);
+ de = (ext2_dirent *)(kaddr + from);
+
while ((char*)de < (char*)dir) {
if (de->rec_len == 0) {
ext2_error(inode->i_sb, __func__,
@@ -594,19 +594,19 @@ int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page)
de = ext2_next_entry(de);
}
if (pde)
- from = offset_in_page(pde);
- pos = page_offset(page) + from;
- lock_page(page);
- err = ext2_prepare_chunk(page, pos, to - from);
+ from = offset_in_folio(folio, pde);
+ pos = folio_pos(folio) + from;
+ folio_lock(folio);
+ err = ext2_prepare_chunk(folio, pos, to - from);
if (err) {
- unlock_page(page);
+ folio_unlock(folio);
return err;
}
if (pde)
pde->rec_len = ext2_rec_len_to_disk(to - from);
dir->inode = 0;
- ext2_commit_chunk(page, pos, to - from);
- inode->i_mtime = inode_set_ctime_current(inode);
+ ext2_commit_chunk(folio, pos, to - from);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(inode);
return ext2_handle_dirsync(inode);
@@ -617,21 +617,21 @@ int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page)
*/
int ext2_make_empty(struct inode *inode, struct inode *parent)
{
- struct page *page = grab_cache_page(inode->i_mapping, 0);
+ struct folio *folio = filemap_grab_folio(inode->i_mapping, 0);
unsigned chunk_size = ext2_chunk_size(inode);
struct ext2_dir_entry_2 * de;
int err;
void *kaddr;
- if (!page)
- return -ENOMEM;
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- err = ext2_prepare_chunk(page, 0, chunk_size);
+ err = ext2_prepare_chunk(folio, 0, chunk_size);
if (err) {
- unlock_page(page);
+ folio_unlock(folio);
goto fail;
}
- kaddr = kmap_local_page(page);
+ kaddr = kmap_local_folio(folio, 0);
memset(kaddr, 0, chunk_size);
de = (struct ext2_dir_entry_2 *)kaddr;
de->name_len = 1;
@@ -647,26 +647,26 @@ int ext2_make_empty(struct inode *inode, struct inode *parent)
memcpy (de->name, "..\0", 4);
ext2_set_de_type (de, inode);
kunmap_local(kaddr);
- ext2_commit_chunk(page, 0, chunk_size);
+ ext2_commit_chunk(folio, 0, chunk_size);
err = ext2_handle_dirsync(inode);
fail:
- put_page(page);
+ folio_put(folio);
return err;
}
/*
* routine to check that the specified directory is empty (for rmdir)
*/
-int ext2_empty_dir (struct inode * inode)
+int ext2_empty_dir(struct inode *inode)
{
- struct page *page;
+ struct folio *folio;
char *kaddr;
unsigned long i, npages = dir_pages(inode);
for (i = 0; i < npages; i++) {
ext2_dirent *de;
- kaddr = ext2_get_page(inode, i, 0, &page);
+ kaddr = ext2_get_folio(inode, i, 0, &folio);
if (IS_ERR(kaddr))
return 0;
@@ -695,12 +695,12 @@ int ext2_empty_dir (struct inode * inode)
}
de = ext2_next_entry(de);
}
- ext2_put_page(page, kaddr);
+ folio_release_kmap(folio, kaddr);
}
return 1;
not_empty:
- ext2_put_page(page, kaddr);
+ folio_release_kmap(folio, kaddr);
return 0;
}
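
ext2_put_page() disappears above in favour of the generic folio_release_kmap(). A sketch of the equivalent cleanup it performs, assuming the helper simply pairs the local kmap with the folio reference taken by read_mapping_folio():

	#include <linux/mm.h>
	#include <linux/highmem.h>

	/* illustrative equivalent of the cleanup done by folio_release_kmap() */
	static inline void example_release_kmap(struct folio *folio, void *kaddr)
	{
		kunmap_local(kaddr);	/* undo kmap_local_folio() */
		folio_put(folio);	/* drop the folio reference */
	}
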
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 7fdd685c384d..677a9ad45dcb 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -717,22 +717,17 @@ extern void ext2_init_block_alloc_info(struct inode *);
extern void ext2_rsv_window_add(struct super_block *sb, struct ext2_reserve_window_node *rsv);
/* dir.c */
-extern int ext2_add_link (struct dentry *, struct inode *);
-extern int ext2_inode_by_name(struct inode *dir,
+int ext2_add_link(struct dentry *, struct inode *);
+int ext2_inode_by_name(struct inode *dir,
const struct qstr *child, ino_t *ino);
-extern int ext2_make_empty(struct inode *, struct inode *);
-extern struct ext2_dir_entry_2 *ext2_find_entry(struct inode *, const struct qstr *,
- struct page **);
-extern int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page);
-extern int ext2_empty_dir (struct inode *);
-extern struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p);
+int ext2_make_empty(struct inode *, struct inode *);
+struct ext2_dir_entry_2 *ext2_find_entry(struct inode *, const struct qstr *,
+ struct folio **foliop);
+int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct folio *folio);
+int ext2_empty_dir(struct inode *);
+struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct folio **foliop);
int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
- struct page *page, struct inode *inode, bool update_times);
-static inline void ext2_put_page(struct page *page, void *page_addr)
-{
- kunmap_local(page_addr);
- put_page(page);
-}
+ struct folio *folio, struct inode *inode, bool update_times);
/* ialloc.c */
extern struct inode * ext2_new_inode (struct inode *, umode_t, const struct qstr *);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 1039e5bf90af..4ddc36f4dbd4 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -258,7 +258,6 @@ static ssize_t ext2_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto out_unlock;
}
- iocb->ki_pos += status;
ret += status;
endbyte = pos + status - 1;
ret2 = filemap_write_and_wait_range(inode->i_mapping, pos,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index c24d0de95a83..fdf63e9c6e7c 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -546,7 +546,7 @@ got:
inode->i_ino = ino;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
memset(ei->i_data, 0, sizeof(ei->i_data));
ei->i_flags =
ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 314b415ee518..464faf6c217e 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1291,7 +1291,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
__ext2_truncate_blocks(inode, newsize);
filemap_invalidate_unlock(inode->i_mapping);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
if (inode_needs_sync(inode)) {
sync_mapping_buffers(inode->i_mapping);
sync_inode_metadata(inode, 1);
@@ -1412,10 +1412,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
i_gid_write(inode, i_gid);
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
inode->i_size = le32_to_cpu(raw_inode->i_size);
- inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
+ inode_set_atime(inode, (signed)le32_to_cpu(raw_inode->i_atime), 0);
inode_set_ctime(inode, (signed)le32_to_cpu(raw_inode->i_ctime), 0);
- inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
- inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
+ inode_set_mtime(inode, (signed)le32_to_cpu(raw_inode->i_mtime), 0);
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
* This is needed because nfsd might try to access dead inodes
@@ -1544,9 +1543,9 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
}
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
raw_inode->i_size = cpu_to_le32(inode->i_size);
- raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
- raw_inode->i_ctime = cpu_to_le32(inode_get_ctime(inode).tv_sec);
- raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+ raw_inode->i_atime = cpu_to_le32(inode_get_atime_sec(inode));
+ raw_inode->i_ctime = cpu_to_le32(inode_get_ctime_sec(inode));
+ raw_inode->i_mtime = cpu_to_le32(inode_get_mtime_sec(inode));
raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
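
The inode.c hunks above stop touching i_atime/i_mtime members directly and go through the accessor helpers instead. A short sketch of the accessor-based pattern, with an invented on-disk structure standing in for the ext2 raw inode:

	#include <linux/fs.h>

	struct example_raw_inode {		/* hypothetical on-disk layout */
		__le32 atime, ctime, mtime;
	};

	static void example_load_times(struct inode *inode,
				       const struct example_raw_inode *raw)
	{
		/* setters take seconds + nanoseconds instead of open-coded timespecs */
		inode_set_atime(inode, (signed)le32_to_cpu(raw->atime), 0);
		inode_set_ctime(inode, (signed)le32_to_cpu(raw->ctime), 0);
		inode_set_mtime(inode, (signed)le32_to_cpu(raw->mtime), 0);
	}

	static void example_store_times(struct inode *inode,
					struct example_raw_inode *raw)
	{
		/* the *_sec getters return just the seconds field */
		raw->atime = cpu_to_le32(inode_get_atime_sec(inode));
		raw->ctime = cpu_to_le32(inode_get_ctime_sec(inode));
		raw->mtime = cpu_to_le32(inode_get_mtime_sec(inode));
	}
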
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 059517068adc..65f702b1da5b 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -273,21 +273,21 @@ static int ext2_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
struct ext2_dir_entry_2 *de;
- struct page *page;
+ struct folio *folio;
int err;
err = dquot_initialize(dir);
if (err)
goto out;
- de = ext2_find_entry(dir, &dentry->d_name, &page);
+ de = ext2_find_entry(dir, &dentry->d_name, &folio);
if (IS_ERR(de)) {
err = PTR_ERR(de);
goto out;
}
- err = ext2_delete_entry(de, page);
- ext2_put_page(page, de);
+ err = ext2_delete_entry(de, folio);
+ folio_release_kmap(folio, de);
if (err)
goto out;
@@ -321,9 +321,9 @@ static int ext2_rename (struct mnt_idmap * idmap,
{
struct inode * old_inode = d_inode(old_dentry);
struct inode * new_inode = d_inode(new_dentry);
- struct page * dir_page = NULL;
+ struct folio *dir_folio = NULL;
struct ext2_dir_entry_2 * dir_de = NULL;
- struct page * old_page;
+ struct folio * old_folio;
struct ext2_dir_entry_2 * old_de;
int err;
@@ -338,19 +338,19 @@ static int ext2_rename (struct mnt_idmap * idmap,
if (err)
return err;
- old_de = ext2_find_entry(old_dir, &old_dentry->d_name, &old_page);
+ old_de = ext2_find_entry(old_dir, &old_dentry->d_name, &old_folio);
if (IS_ERR(old_de))
return PTR_ERR(old_de);
if (S_ISDIR(old_inode->i_mode)) {
err = -EIO;
- dir_de = ext2_dotdot(old_inode, &dir_page);
+ dir_de = ext2_dotdot(old_inode, &dir_folio);
if (!dir_de)
goto out_old;
}
if (new_inode) {
- struct page *new_page;
+ struct folio *new_folio;
struct ext2_dir_entry_2 *new_de;
err = -ENOTEMPTY;
@@ -358,13 +358,13 @@ static int ext2_rename (struct mnt_idmap * idmap,
goto out_dir;
new_de = ext2_find_entry(new_dir, &new_dentry->d_name,
- &new_page);
+ &new_folio);
if (IS_ERR(new_de)) {
err = PTR_ERR(new_de);
goto out_dir;
}
- err = ext2_set_link(new_dir, new_de, new_page, old_inode, true);
- ext2_put_page(new_page, new_de);
+ err = ext2_set_link(new_dir, new_de, new_folio, old_inode, true);
+ folio_release_kmap(new_folio, new_de);
if (err)
goto out_dir;
inode_set_ctime_current(new_inode);
@@ -386,19 +386,19 @@ static int ext2_rename (struct mnt_idmap * idmap,
inode_set_ctime_current(old_inode);
mark_inode_dirty(old_inode);
- err = ext2_delete_entry(old_de, old_page);
+ err = ext2_delete_entry(old_de, old_folio);
if (!err && dir_de) {
if (old_dir != new_dir)
- err = ext2_set_link(old_inode, dir_de, dir_page,
+ err = ext2_set_link(old_inode, dir_de, dir_folio,
new_dir, false);
inode_dec_link_count(old_dir);
}
out_dir:
if (dir_de)
- ext2_put_page(dir_page, dir_de);
+ folio_release_kmap(dir_folio, dir_de);
out_old:
- ext2_put_page(old_page, old_de);
+ folio_release_kmap(old_folio, old_de);
return err;
}
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index aaf3e3e88cb2..01f9addc8b1f 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -397,6 +397,7 @@ static struct dentry *ext2_fh_to_parent(struct super_block *sb, struct fid *fid,
}
static const struct export_operations ext2_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = ext2_fh_to_dentry,
.fh_to_parent = ext2_fh_to_parent,
.get_parent = ext2_get_parent,
@@ -1572,7 +1573,7 @@ out:
if (inode->i_size < off+len-towrite)
i_size_write(inode, off+len-towrite);
inode_inc_iversion(inode);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
return len - towrite;
}
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 20f741184673..e849241ebb8f 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -98,7 +98,7 @@ static struct buffer_head *ext2_xattr_cache_find(struct inode *,
static void ext2_xattr_rehash(struct ext2_xattr_header *,
struct ext2_xattr_entry *);
-static const struct xattr_handler *ext2_xattr_handler_map[] = {
+static const struct xattr_handler * const ext2_xattr_handler_map[] = {
[EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler,
#ifdef CONFIG_EXT2_FS_POSIX_ACL
[EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
@@ -110,7 +110,7 @@ static const struct xattr_handler *ext2_xattr_handler_map[] = {
#endif
};
-const struct xattr_handler *ext2_xattr_handlers[] = {
+const struct xattr_handler * const ext2_xattr_handlers[] = {
&ext2_xattr_user_handler,
&ext2_xattr_trusted_handler,
#ifdef CONFIG_EXT2_FS_SECURITY
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index 7925f596e8e2..6a4966949047 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -72,7 +72,7 @@ extern void ext2_xattr_delete_inode(struct inode *);
extern struct mb_cache *ext2_xattr_create_cache(void);
extern void ext2_xattr_destroy_cache(struct mb_cache *cache);
-extern const struct xattr_handler *ext2_xattr_handlers[];
+extern const struct xattr_handler * const ext2_xattr_handlers[];
# else /* CONFIG_EXT2_FS_XATTR */
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 0c5a79c3b5d4..ef4c19e5f570 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -68,6 +68,11 @@ extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
static inline int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
{
+ /* usually, the umask is applied by posix_acl_create(), but if
+ ext4 ACL support is disabled at compile time, we need to do
+ it here, because posix_acl_create() will never be called */
+ inode->i_mode &= ~current_umask();
+
return 0;
}
#endif /* CONFIG_EXT4_FS_POSIX_ACL */
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 79b20d6ae39e..591fb3f710be 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -22,6 +22,7 @@
#include "mballoc.h"
#include <trace/events/ext4.h>
+#include <kunit/static_stub.h>
static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
ext4_group_t block_group);
@@ -111,10 +112,8 @@ static unsigned ext4_num_overhead_clusters(struct super_block *sb,
itbl_blk_start = ext4_inode_table(sb, gdp);
itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1;
if (itbl_blk_start <= end && itbl_blk_end >= start) {
- itbl_blk_start = itbl_blk_start >= start ?
- itbl_blk_start : start;
- itbl_blk_end = itbl_blk_end <= end ?
- itbl_blk_end : end;
+ itbl_blk_start = max(itbl_blk_start, start);
+ itbl_blk_end = min(itbl_blk_end, end);
itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start);
itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start);
@@ -274,6 +273,9 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct buffer_head *bh_p;
+ KUNIT_STATIC_STUB_REDIRECT(ext4_get_group_desc,
+ sb, block_group, bh);
+
if (block_group >= ngroups) {
ext4_error(sb, "block_group >= groups_count - block_group = %u,"
" groups_count = %u", block_group, ngroups);
@@ -468,6 +470,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
ext4_fsblk_t bitmap_blk;
int err;
+ KUNIT_STATIC_STUB_REDIRECT(ext4_read_block_bitmap_nowait,
+ sb, block_group, ignore_locked);
+
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
return ERR_PTR(-EFSCORRUPTED);
@@ -563,6 +568,9 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
{
struct ext4_group_desc *desc;
+ KUNIT_STATIC_STUB_REDIRECT(ext4_wait_block_bitmap,
+ sb, block_group, bh);
+
if (!buffer_new(bh))
return 0;
desc = ext4_get_group_desc(sb, block_group, NULL);
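The KUNIT_STATIC_STUB_REDIRECT() hooks added above pair with kunit_activate_static_stub() in the new mballoc KUnit test further down. A minimal sketch of the mechanism, with hypothetical my_real_func/my_stub names (the hooked function needs <kunit/static_stub.h>, the test <kunit/test.h>):

    int my_real_func(struct super_block *sb, int arg)
    {
            /* returns the stub's result instead when a test has activated one */
            KUNIT_STATIC_STUB_REDIRECT(my_real_func, sb, arg);
            return 0;                       /* normal implementation otherwise */
    }

    static int my_stub(struct super_block *sb, int arg)
    {
            return 42;                      /* canned result for the test */
    }

    static void my_test(struct kunit *test)
    {
            kunit_activate_static_stub(test, my_real_func, my_stub);
            KUNIT_EXPECT_EQ(test, my_real_func(NULL, 0), 42);
    }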
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 453d4da5de52..7ae0b61258a7 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -232,19 +232,14 @@ static bool ext4_has_stable_inodes(struct super_block *sb)
return ext4_has_feature_stable_inodes(sb);
}
-static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
- int *ino_bits_ret, int *lblk_bits_ret)
-{
- *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
- *lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
-}
-
const struct fscrypt_operations ext4_cryptops = {
- .key_prefix = "ext4:",
+ .needs_bounce_pages = 1,
+ .has_32bit_inodes = 1,
+ .supports_subblock_data_units = 1,
+ .legacy_key_prefix = "ext4:",
.get_context = ext4_get_context,
.set_context = ext4_set_context,
.get_dummy_policy = ext4_get_dummy_policy,
.empty_dir = ext4_empty_dir,
.has_stable_inodes = ext4_has_stable_inodes,
- .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits,
};
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9418359b1d9d..a5d784872303 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -891,10 +891,13 @@ do { \
(raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \
} while (0)
-#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
- EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, (inode)->xtime)
+#define EXT4_INODE_SET_ATIME(inode, raw_inode) \
+ EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode))
-#define EXT4_INODE_SET_CTIME(inode, raw_inode) \
+#define EXT4_INODE_SET_MTIME(inode, raw_inode) \
+ EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode))
+
+#define EXT4_INODE_SET_CTIME(inode, raw_inode) \
EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode))
#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \
@@ -910,9 +913,16 @@ do { \
.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \
})
-#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \
+#define EXT4_INODE_GET_ATIME(inode, raw_inode) \
+do { \
+ inode_set_atime_to_ts(inode, \
+ EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode)); \
+} while (0)
+
+#define EXT4_INODE_GET_MTIME(inode, raw_inode) \
do { \
- (inode)->xtime = EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode); \
+ inode_set_mtime_to_ts(inode, \
+ EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode)); \
} while (0)
#define EXT4_INODE_GET_CTIME(inode, raw_inode) \
@@ -1494,6 +1504,7 @@ struct ext4_sb_info {
loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
struct buffer_head * s_sbh; /* Buffer containing the super block */
struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
+ /* Array of bh's for the block group descriptors */
struct buffer_head * __rcu *s_group_desc;
unsigned int s_mount_opt;
unsigned int s_mount_opt2;
@@ -1537,7 +1548,7 @@ struct ext4_sb_info {
unsigned long s_commit_interval;
u32 s_max_batch_time;
u32 s_min_batch_time;
- struct block_device *s_journal_bdev;
+ struct bdev_handle *s_journal_bdev_handle;
#ifdef CONFIG_QUOTA
/* Names of quota files with journalled quota */
char __rcu *s_qf_names[EXT4_MAXQUOTAS];
@@ -1564,7 +1575,7 @@ struct ext4_sb_info {
unsigned int *s_mb_maxs;
unsigned int s_group_info_size;
unsigned int s_mb_free_pending;
- struct list_head s_freed_data_list; /* List of blocks to be freed
+ struct list_head s_freed_data_list[2]; /* List of blocks to be freed
after commit completed */
struct list_head s_discard_list;
struct work_struct s_discard_work;
@@ -1653,7 +1664,7 @@ struct ext4_sb_info {
__u32 s_csum_seed;
/* Reclaim extents from extent status tree */
- struct shrinker s_es_shrinker;
+ struct shrinker *s_es_shrinker;
struct list_head s_es_list; /* List of inodes with reclaimable extents */
long s_es_nr_inode;
struct ext4_es_stats s_es_stats;
@@ -1676,7 +1687,8 @@ struct ext4_sb_info {
/*
* Barrier between writepages ops and changing any inode's JOURNAL_DATA
- * or EXTENTS flag.
+ * or EXTENTS flag or between writepages ops and changing DELALLOC or
+ * DIOREAD_NOLOCK mount options on remount.
*/
struct percpu_rw_semaphore s_writepages_rwsem;
struct dax_device *s_daxdev;
@@ -2924,7 +2936,7 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
- int len, int state);
+ int len, bool state);
static inline bool ext4_mb_cr_expensive(enum criteria cr)
{
return cr >= CR_GOAL_LEN_SLOW;
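The split of EXT4_INODE_{GET,SET}_XTIME into per-field ATIME/MTIME macros matches the accessor conversion above; the CTIME variants already existed. A minimal sketch of their use (example_sync_times is hypothetical; the real call sites are in the fs/ext4/inode.c hunks below):

    static void example_sync_times(struct inode *inode, struct ext4_inode *raw)
    {
            /* in-core times -> raw on-disk inode */
            EXT4_INODE_SET_CTIME(inode, raw);
            EXT4_INODE_SET_MTIME(inode, raw);
            EXT4_INODE_SET_ATIME(inode, raw);

            /* raw on-disk inode -> in-core times (when reading an inode) */
            EXT4_INODE_GET_CTIME(inode, raw);
            EXT4_INODE_GET_ATIME(inode, raw);
            EXT4_INODE_GET_MTIME(inode, raw);
    }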
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 202c76996b62..d5efe076d3d3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1010,6 +1010,11 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
ix = curp->p_idx;
}
+ if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
+ EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
+ return -EFSCORRUPTED;
+ }
+
len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
BUG_ON(len < 0);
if (len > 0) {
@@ -1019,11 +1024,6 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
}
- if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
- EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
- return -EFSCORRUPTED;
- }
-
ix->ei_block = cpu_to_le32(logical);
ext4_idx_store_pblock(ix, ptr);
le16_add_cpu(&curp->p_hdr->eh_entries, 1);
@@ -4481,7 +4481,8 @@ retry:
if (epos > new_size)
epos = new_size;
if (ext4_update_inode_size(inode, epos) & 0x1)
- inode->i_mtime = inode_get_ctime(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_get_ctime(inode));
}
ret2 = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -4617,7 +4618,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
/* Now release the pages and zero block aligned part of pages */
truncate_pagecache_range(inode, start, end - 1);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags);
@@ -4642,7 +4643,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
goto out_mutex;
}
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
if (new_size)
ext4_update_inode_size(inode, new_size);
ret = ext4_mark_inode_dirty(handle, inode);
@@ -5378,7 +5379,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
up_write(&EXT4_I(inode)->i_data_sem);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -5488,7 +5489,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
/* Expand file to avoid data loss if there is error while shifting */
inode->i_size += len;
EXT4_I(inode)->i_disksize += len;
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_mark_inode_dirty(handle, inode);
if (ret)
goto out_stop;
@@ -6080,13 +6081,13 @@ int ext4_ext_clear_bb(struct inode *inode)
for (j = 0; j < path->p_depth; j++) {
ext4_mb_mark_bb(inode->i_sb,
- path[j].p_block, 1, 0);
+ path[j].p_block, 1, false);
ext4_fc_record_regions(inode->i_sb, inode->i_ino,
0, path[j].p_block, 1, 1);
}
ext4_free_ext_path(path);
}
- ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
ext4_fc_record_regions(inode->i_sb, inode->i_ino,
map.m_lblk, map.m_pblk, map.m_len, 1);
}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 6f7de14c0fa8..4a00e2f019d9 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -152,8 +152,9 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
struct ext4_inode_info *locked_ei);
-static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len);
+static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len,
+ struct pending_reservation **prealloc);
int __init ext4_init_es(void)
{
@@ -448,6 +449,19 @@ static void ext4_es_list_del(struct inode *inode)
spin_unlock(&sbi->s_es_lock);
}
+static inline struct pending_reservation *__alloc_pending(bool nofail)
+{
+ if (!nofail)
+ return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
+
+ return kmem_cache_zalloc(ext4_pending_cachep, GFP_KERNEL | __GFP_NOFAIL);
+}
+
+static inline void __free_pending(struct pending_reservation *pr)
+{
+ kmem_cache_free(ext4_pending_cachep, pr);
+}
+
/*
* Returns true if we cannot fail to allocate memory for this extent_status
* entry and cannot reclaim it until its status changes.
@@ -836,11 +850,12 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
{
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
- int err1 = 0;
- int err2 = 0;
+ int err1 = 0, err2 = 0, err3 = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct extent_status *es1 = NULL;
struct extent_status *es2 = NULL;
+ struct pending_reservation *pr = NULL;
+ bool revise_pending = false;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
@@ -868,11 +883,17 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_es_insert_extent_check(inode, &newes);
+ revise_pending = sbi->s_cluster_ratio > 1 &&
+ test_opt(inode->i_sb, DELALLOC) &&
+ (status & (EXTENT_STATUS_WRITTEN |
+ EXTENT_STATUS_UNWRITTEN));
retry:
if (err1 && !es1)
es1 = __es_alloc_extent(true);
if ((err1 || err2) && !es2)
es2 = __es_alloc_extent(true);
+ if ((err1 || err2 || err3) && revise_pending && !pr)
+ pr = __alloc_pending(true);
write_lock(&EXT4_I(inode)->i_es_lock);
err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
@@ -897,13 +918,18 @@ retry:
es2 = NULL;
}
- if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
- (status & EXTENT_STATUS_WRITTEN ||
- status & EXTENT_STATUS_UNWRITTEN))
- __revise_pending(inode, lblk, len);
+ if (revise_pending) {
+ err3 = __revise_pending(inode, lblk, len, &pr);
+ if (err3 != 0)
+ goto error;
+ if (pr) {
+ __free_pending(pr);
+ pr = NULL;
+ }
+ }
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
- if (err1 || err2)
+ if (err1 || err2 || err3)
goto retry;
ext4_es_print_tree(inode);
@@ -1311,7 +1337,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
rc->ndelonly--;
node = rb_next(&pr->rb_node);
rb_erase(&pr->rb_node, &tree->root);
- kmem_cache_free(ext4_pending_cachep, pr);
+ __free_pending(pr);
if (!node)
break;
pr = rb_entry(node, struct pending_reservation,
@@ -1405,8 +1431,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
}
}
if (count_reserved)
- count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
- &orig_es, &rc);
+ count_rsvd(inode, orig_es.es_lblk + len1,
+ orig_es.es_len - len1 - len2, &orig_es, &rc);
goto out_get_reserved;
}
@@ -1606,7 +1632,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
unsigned long nr;
struct ext4_sb_info *sbi;
- sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
+ sbi = shrink->private_data;
nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
return nr;
@@ -1615,8 +1641,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
static unsigned long ext4_es_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct ext4_sb_info *sbi = container_of(shrink,
- struct ext4_sb_info, s_es_shrinker);
+ struct ext4_sb_info *sbi = shrink->private_data;
int nr_to_scan = sc->nr_to_scan;
int ret, nr_shrunk;
@@ -1700,13 +1725,17 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
if (err)
goto err3;
- sbi->s_es_shrinker.scan_objects = ext4_es_scan;
- sbi->s_es_shrinker.count_objects = ext4_es_count;
- sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
- err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s",
- sbi->s_sb->s_id);
- if (err)
+ sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id);
+ if (!sbi->s_es_shrinker) {
+ err = -ENOMEM;
goto err4;
+ }
+
+ sbi->s_es_shrinker->scan_objects = ext4_es_scan;
+ sbi->s_es_shrinker->count_objects = ext4_es_count;
+ sbi->s_es_shrinker->private_data = sbi;
+
+ shrinker_register(sbi->s_es_shrinker);
return 0;
err4:
@@ -1726,7 +1755,7 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
- unregister_shrinker(&sbi->s_es_shrinker);
+ shrinker_free(sbi->s_es_shrinker);
}
/*
@@ -1907,11 +1936,13 @@ static struct pending_reservation *__get_pending(struct inode *inode,
*
* @inode - file containing the cluster
* @lblk - logical block in the cluster to be added
+ * @prealloc - preallocated pending entry
*
* Returns 0 on successful insertion and -ENOMEM on failure. If the
* pending reservation is already in the set, returns successfully.
*/
-static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
+static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
+ struct pending_reservation **prealloc)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
@@ -1937,10 +1968,15 @@ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
}
}
- pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
- if (pr == NULL) {
- ret = -ENOMEM;
- goto out;
+ if (likely(*prealloc == NULL)) {
+ pr = __alloc_pending(false);
+ if (!pr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else {
+ pr = *prealloc;
+ *prealloc = NULL;
}
pr->lclu = lclu;
@@ -1970,7 +2006,7 @@ static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
if (pr != NULL) {
tree = &EXT4_I(inode)->i_pending_tree;
rb_erase(&pr->rb_node, &tree->root);
- kmem_cache_free(ext4_pending_cachep, pr);
+ __free_pending(pr);
}
}
@@ -2029,10 +2065,10 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
bool allocated)
{
struct extent_status newes;
- int err1 = 0;
- int err2 = 0;
+ int err1 = 0, err2 = 0, err3 = 0;
struct extent_status *es1 = NULL;
struct extent_status *es2 = NULL;
+ struct pending_reservation *pr = NULL;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
@@ -2052,6 +2088,8 @@ retry:
es1 = __es_alloc_extent(true);
if ((err1 || err2) && !es2)
es2 = __es_alloc_extent(true);
+ if ((err1 || err2 || err3) && allocated && !pr)
+ pr = __alloc_pending(true);
write_lock(&EXT4_I(inode)->i_es_lock);
err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
@@ -2074,11 +2112,18 @@ retry:
es2 = NULL;
}
- if (allocated)
- __insert_pending(inode, lblk);
+ if (allocated) {
+ err3 = __insert_pending(inode, lblk, &pr);
+ if (err3 != 0)
+ goto error;
+ if (pr) {
+ __free_pending(pr);
+ pr = NULL;
+ }
+ }
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
- if (err1 || err2)
+ if (err1 || err2 || err3)
goto retry;
ext4_es_print_tree(inode);
@@ -2184,21 +2229,24 @@ unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
* @inode - file containing the range
* @lblk - logical block defining the start of range
* @len - length of range in blocks
+ * @prealloc - preallocated pending entry
*
* Used after a newly allocated extent is added to the extents status tree.
* Requires that the extents in the range have either written or unwritten
* status. Must be called while holding i_es_lock.
*/
-static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len)
+static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len,
+ struct pending_reservation **prealloc)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_lblk_t end = lblk + len - 1;
ext4_lblk_t first, last;
bool f_del = false, l_del = false;
+ int ret = 0;
if (len == 0)
- return;
+ return 0;
/*
* Two cases - block range within single cluster and block range
@@ -2219,7 +2267,9 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
f_del = __es_scan_range(inode, &ext4_es_is_delonly,
first, lblk - 1);
if (f_del) {
- __insert_pending(inode, first);
+ ret = __insert_pending(inode, first, prealloc);
+ if (ret < 0)
+ goto out;
} else {
last = EXT4_LBLK_CMASK(sbi, end) +
sbi->s_cluster_ratio - 1;
@@ -2227,9 +2277,11 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
l_del = __es_scan_range(inode,
&ext4_es_is_delonly,
end + 1, last);
- if (l_del)
- __insert_pending(inode, last);
- else
+ if (l_del) {
+ ret = __insert_pending(inode, last, prealloc);
+ if (ret < 0)
+ goto out;
+ } else
__remove_pending(inode, last);
}
} else {
@@ -2237,18 +2289,24 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
if (first != lblk)
f_del = __es_scan_range(inode, &ext4_es_is_delonly,
first, lblk - 1);
- if (f_del)
- __insert_pending(inode, first);
- else
+ if (f_del) {
+ ret = __insert_pending(inode, first, prealloc);
+ if (ret < 0)
+ goto out;
+ } else
__remove_pending(inode, first);
last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
if (last != end)
l_del = __es_scan_range(inode, &ext4_es_is_delonly,
end + 1, last);
- if (l_del)
- __insert_pending(inode, last);
- else
+ if (l_del) {
+ ret = __insert_pending(inode, last, prealloc);
+ if (ret < 0)
+ goto out;
+ } else
__remove_pending(inode, last);
}
+out:
+ return ret;
}
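The shrinker hunks above convert the extent-status shrinker from an embedded struct registered with register_shrinker() to a heap-allocated one. A minimal sketch of the new API shape, mirroring the registration code above (the example_* wrappers are hypothetical):

    static int example_register_es_shrinker(struct ext4_sb_info *sbi)
    {
            sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id);
            if (!sbi->s_es_shrinker)
                    return -ENOMEM;

            sbi->s_es_shrinker->count_objects = ext4_es_count;
            sbi->s_es_shrinker->scan_objects  = ext4_es_scan;
            sbi->s_es_shrinker->private_data  = sbi;  /* read back via shrink->private_data */

            shrinker_register(sbi->s_es_shrinker);
            return 0;
    }

    static void example_unregister_es_shrinker(struct ext4_sb_info *sbi)
    {
            shrinker_free(sbi->s_es_shrinker);        /* replaces unregister_shrinker() */
    }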
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index b06de728b3b6..87c009e0c59a 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -1806,7 +1806,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
* at the end of the FC replay using our array of
* modified inodes.
*/
- ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
goto next;
}
@@ -1875,7 +1875,7 @@ ext4_fc_replay_del_range(struct super_block *sb,
if (ret > 0) {
remaining -= ret;
cur += ret;
- ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
} else {
remaining -= map.m_len;
cur += map.m_len;
@@ -1934,12 +1934,12 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
if (!IS_ERR(path)) {
for (j = 0; j < path->p_depth; j++)
ext4_mb_mark_bb(inode->i_sb,
- path[j].p_block, 1, 1);
+ path[j].p_block, 1, true);
ext4_free_ext_path(path);
}
cur += ret;
ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
- map.m_len, 1);
+ map.m_len, true);
} else {
cur = cur + (map.m_len ? map.m_len : 1);
}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6830ea3a6c59..6aa15dafc677 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -306,80 +306,38 @@ out:
}
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
- ssize_t written, size_t count)
+ ssize_t count)
{
handle_t *handle;
- bool truncate = false;
- u8 blkbits = inode->i_blkbits;
- ext4_lblk_t written_blk, end_blk;
- int ret;
-
- /*
- * Note that EXT4_I(inode)->i_disksize can get extended up to
- * inode->i_size while the I/O was running due to writeback of delalloc
- * blocks. But, the code in ext4_iomap_alloc() is careful to use
- * zeroed/unwritten extents if this is possible; thus we won't leave
- * uninitialized blocks in a file even if we didn't succeed in writing
- * as much as we intended.
- */
- WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
- if (offset + count <= EXT4_I(inode)->i_disksize) {
- /*
- * We need to ensure that the inode is removed from the orphan
- * list if it has been added prematurely, due to writeback of
- * delalloc blocks.
- */
- if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-
- if (IS_ERR(handle)) {
- ext4_orphan_del(NULL, inode);
- return PTR_ERR(handle);
- }
-
- ext4_orphan_del(handle, inode);
- ext4_journal_stop(handle);
- }
-
- return written;
- }
-
- if (written < 0)
- goto truncate;
+ lockdep_assert_held_write(&inode->i_rwsem);
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- written = PTR_ERR(handle);
- goto truncate;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
- if (ext4_update_inode_size(inode, offset + written)) {
- ret = ext4_mark_inode_dirty(handle, inode);
+ if (ext4_update_inode_size(inode, offset + count)) {
+ int ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret)) {
- written = ret;
ext4_journal_stop(handle);
- goto truncate;
+ return ret;
}
}
- /*
- * We may need to truncate allocated but not written blocks beyond EOF.
- */
- written_blk = ALIGN(offset + written, 1 << blkbits);
- end_blk = ALIGN(offset + count, 1 << blkbits);
- if (written_blk < end_blk && ext4_can_truncate(inode))
- truncate = true;
-
- /*
- * Remove the inode from the orphan list if it has been extended and
- * everything went OK.
- */
- if (!truncate && inode->i_nlink)
+ if (inode->i_nlink)
ext4_orphan_del(handle, inode);
ext4_journal_stop(handle);
- if (truncate) {
-truncate:
+ return count;
+}
+
+/*
+ * Clean up the inode after DIO or DAX extending write has completed and the
+ * inode size has been updated using ext4_handle_inode_extension().
+ */
+static void ext4_inode_extension_cleanup(struct inode *inode, ssize_t count)
+{
+ lockdep_assert_held_write(&inode->i_rwsem);
+ if (count < 0) {
ext4_truncate_failed_write(inode);
/*
* If the truncate operation failed early, then the inode may
@@ -388,9 +346,29 @@ truncate:
*/
if (inode->i_nlink)
ext4_orphan_del(NULL, inode);
+ return;
}
+ /*
+ * If i_disksize got extended either due to writeback of delalloc
+ * blocks or an extending truncate while the DIO was running, we could fail
+ * to clean up the orphan list in ext4_handle_inode_extension(). Do it
+ * now.
+ */
+ if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
+ handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- return written;
+ if (IS_ERR(handle)) {
+ /*
+ * The write has successfully completed. Not much to
+ * do with the error here so just cleanup the orphan
+ * list and hope for the best.
+ */
+ ext4_orphan_del(NULL, inode);
+ return;
+ }
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+ }
}
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
@@ -399,31 +377,23 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
loff_t pos = iocb->ki_pos;
struct inode *inode = file_inode(iocb->ki_filp);
+ if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
+ error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
if (error)
return error;
-
- if (size && flags & IOMAP_DIO_UNWRITTEN) {
- error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
- if (error < 0)
- return error;
- }
/*
- * If we are extending the file, we have to update i_size here before
- * page cache gets invalidated in iomap_dio_rw(). Otherwise racing
- * buffered reads could zero out too much from page cache pages. Update
- * of on-disk size will happen later in ext4_dio_write_iter() where
- * we have enough information to also perform orphan list handling etc.
- * Note that we perform all extending writes synchronously under
- * i_rwsem held exclusively so i_size update is safe here in that case.
- * If the write was not extending, we cannot see pos > i_size here
- * because operations reducing i_size like truncate wait for all
- * outstanding DIO before updating i_size.
+ * Note that EXT4_I(inode)->i_disksize can get extended up to
+ * inode->i_size while the I/O was running due to writeback of delalloc
+ * blocks. But the code in ext4_iomap_alloc() is careful to use
+ * zeroed/unwritten extents if this is possible; thus we won't leave
+ * uninitialized blocks in a file even if we didn't succeed in writing
+ * as much as we intended. Also we can race with truncate or write
+ * expanding the file so we have to be a bit careful here.
*/
- pos += size;
- if (pos > i_size_read(inode))
- i_size_write(inode, pos);
-
- return 0;
+ if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) &&
+ pos + size <= i_size_read(inode))
+ return size;
+ return ext4_handle_inode_extension(inode, pos, size);
}
static const struct iomap_dio_ops ext4_dio_write_ops = {
@@ -569,18 +539,20 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
return ext4_buffered_write_iter(iocb, from);
}
+ /*
+ * Prevent inline data from being created since we are going to allocate
+ * blocks for DIO. We know the inode does not currently have inline data
+ * because ext4_should_use_dio() checked for it, but we have to clear
+ * the state flag before the write checks because a lock cycle could
+ * introduce races with other writers.
+ */
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+
ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend,
&unwritten, &dio_flags);
if (ret <= 0)
return ret;
- /*
- * Make sure inline data cannot be created anymore since we are going
- * to allocate blocks for DIO. We know the inode does not have any
- * inline data now because ext4_dio_supported() checked for that.
- */
- ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
-
offset = iocb->ki_pos;
count = ret;
@@ -606,9 +578,16 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
dio_flags, NULL, 0);
if (ret == -ENOTBLK)
ret = 0;
-
- if (extend)
- ret = ext4_handle_inode_extension(inode, offset, ret, count);
+ if (extend) {
+ /*
+ * We always perform extending DIO writes synchronously, so by
+ * now the I/O has completed and ext4_handle_inode_extension()
+ * was called. Clean up the inode in case of error or a race with
+ * writeback of delalloc blocks.
+ */
+ WARN_ON_ONCE(ret == -EIOCBQUEUED);
+ ext4_inode_extension_cleanup(inode, ret);
+ }
out:
if (ilock_shared)
@@ -689,8 +668,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
- if (extend)
- ret = ext4_handle_inode_extension(inode, offset, ret, count);
+ if (extend) {
+ ret = ext4_handle_inode_extension(inode, offset, ret);
+ ext4_inode_extension_cleanup(inode, ret);
+ }
out:
inode_unlock(inode);
if (ret > 0)
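After this refactor the extending-write paths share one shape: the on-disk size/orphan update runs in ext4_handle_inode_extension() (from the DIO end_io handler, or directly for DAX), and the issuer then only runs ext4_inode_extension_cleanup(). A rough sketch of that caller shape (do_write() is a hypothetical stand-in for iomap_dio_rw()/dax_iomap_rw(); locking and error handling are trimmed):

    static ssize_t example_extending_write(struct kiocb *iocb, struct iov_iter *from,
                                           bool extend)
    {
            struct inode *inode = file_inode(iocb->ki_filp);
            ssize_t ret;

            ret = do_write(iocb, from);     /* hypothetical placeholder */
            if (extend) {
                    /* i_size/i_disksize were already handled when the I/O finished */
                    ext4_inode_extension_cleanup(inode, ret);
            }
            return ret;
    }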
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index cdf9bfe10137..11e6f33677a2 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -576,8 +576,9 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb,
if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev))
return true;
- if (EXT4_SB(sb)->s_journal_bdev &&
- fm->fmr_device == new_encode_dev(EXT4_SB(sb)->s_journal_bdev->bd_dev))
+ if (EXT4_SB(sb)->s_journal_bdev_handle &&
+ fm->fmr_device ==
+ new_encode_dev(EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev))
return true;
return false;
}
@@ -647,9 +648,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
memset(handlers, 0, sizeof(handlers));
handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev);
handlers[0].gfd_fn = ext4_getfsmap_datadev;
- if (EXT4_SB(sb)->s_journal_bdev) {
+ if (EXT4_SB(sb)->s_journal_bdev_handle) {
handlers[1].gfd_dev = new_encode_dev(
- EXT4_SB(sb)->s_journal_bdev->bd_dev);
+ EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev);
handlers[1].gfd_fn = ext4_getfsmap_logdev;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b65058d972f9..e9bbb1da2d0a 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1250,8 +1250,8 @@ got:
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
/* This is the optimal IO size (for stat), not the fs block size */
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
- ei->i_crtime = inode->i_mtime;
+ simple_inode_init_ts(inode);
+ ei->i_crtime = inode_get_mtime(inode);
memset(ei->i_data, 0, sizeof(ei->i_data));
ei->i_dir_start_lookup = 0;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 012d9259ff53..9a84a5f9fef4 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1037,7 +1037,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
ext4_update_dx_flag(dir);
inode_inc_iversion(dir);
return 1;
@@ -1991,7 +1991,7 @@ out:
ext4_orphan_del(handle, inode);
if (err == 0) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
err = ext4_mark_inode_dirty(handle, inode);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4ce35f1c8b0a..61277f7f8722 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -789,10 +789,22 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
+ int ret = 0;
+
ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
inode->i_ino, create);
- return _ext4_get_block(inode, iblock, bh_result,
+ ret = _ext4_get_block(inode, iblock, bh_result,
EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
+
+ /*
+ * If the buffer is marked unwritten, mark it as new to make sure it is
+ * zeroed out correctly in case of partial writes. Otherwise, there is
+ * a chance of stale data getting exposed.
+ */
+ if (ret == 0 && buffer_unwritten(bh_result))
+ set_buffer_new(bh_result);
+
+ return ret;
}
/* Maximum number of blocks we map for direct IO at once. */
@@ -1020,10 +1032,8 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
BUG_ON(from > to);
head = folio_buffers(folio);
- if (!head) {
- create_empty_buffers(&folio->page, blocksize, 0);
- head = folio_buffers(folio);
- }
+ if (!head)
+ head = create_empty_buffers(folio, blocksize, 0);
bbits = ilog2(blocksize);
block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
@@ -1153,7 +1163,7 @@ retry_grab:
* starting the handle.
*/
if (!folio_buffers(folio))
- create_empty_buffers(&folio->page, inode->i_sb->s_blocksize, 0);
+ create_empty_buffers(folio, inode->i_sb->s_blocksize, 0);
folio_unlock(folio);
@@ -3643,10 +3653,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
bh = folio_buffers(folio);
- if (!bh) {
- create_empty_buffers(&folio->page, blocksize, 0);
- bh = folio_buffers(folio);
- }
+ if (!bh)
+ bh = create_empty_buffers(folio, blocksize, 0);
/* Find the buffer that contains "offset" */
pos = blocksize;
@@ -4020,7 +4028,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret2 = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret2))
ret = ret2;
@@ -4180,7 +4188,7 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
err2 = ext4_mark_inode_dirty(handle, inode);
if (unlikely(err2 && !err))
err = err2;
@@ -4284,8 +4292,8 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
EXT4_INODE_SET_CTIME(inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+ EXT4_INODE_SET_MTIME(inode, raw_inode);
+ EXT4_INODE_SET_ATIME(inode, raw_inode);
EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
@@ -4893,8 +4901,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
}
EXT4_INODE_GET_CTIME(inode, raw_inode);
- EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
- EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
+ EXT4_INODE_GET_ATIME(inode, raw_inode);
+ EXT4_INODE_GET_MTIME(inode, raw_inode);
EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
@@ -5019,8 +5027,8 @@ static void __ext4_update_other_inode_time(struct super_block *sb,
spin_lock(&ei->i_raw_lock);
EXT4_INODE_SET_CTIME(inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+ EXT4_INODE_SET_MTIME(inode, raw_inode);
+ EXT4_INODE_SET_ATIME(inode, raw_inode);
ext4_inode_csum_set(inode, raw_inode, ei);
spin_unlock(&ei->i_raw_lock);
trace_ext4_other_inode_update_time(inode, orig_ino);
@@ -5413,7 +5421,8 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
* update c/mtime in shrink case below
*/
if (!shrink)
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_current(inode));
if (shrink)
ext4_fc_track_range(handle, inode,
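Several hunks above also adopt the folio-based create_empty_buffers(), which now returns the head buffer so callers no longer re-fetch it with folio_buffers(). A minimal sketch of the resulting caller pattern (example_get_buffers is hypothetical):

    static struct buffer_head *example_get_buffers(struct folio *folio,
                                                   unsigned long blocksize)
    {
            struct buffer_head *head = folio_buffers(folio);

            if (!head)
                    head = create_empty_buffers(folio, blocksize, 0);
            return head;
    }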
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0bfe2ce589e2..4f931f80cb34 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -312,13 +312,22 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
struct ext4_inode_info *ei1;
struct ext4_inode_info *ei2;
unsigned long tmp;
+ struct timespec64 ts1, ts2;
ei1 = EXT4_I(inode1);
ei2 = EXT4_I(inode2);
swap(inode1->i_version, inode2->i_version);
- swap(inode1->i_atime, inode2->i_atime);
- swap(inode1->i_mtime, inode2->i_mtime);
+
+ ts1 = inode_get_atime(inode1);
+ ts2 = inode_get_atime(inode2);
+ inode_set_atime_to_ts(inode1, ts2);
+ inode_set_atime_to_ts(inode2, ts1);
+
+ ts1 = inode_get_mtime(inode1);
+ ts2 = inode_get_mtime(inode2);
+ inode_set_mtime_to_ts(inode1, ts2);
+ inode_set_mtime_to_ts(inode2, ts1);
memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP;
diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
new file mode 100644
index 000000000000..f94901fd3835
--- /dev/null
+++ b/fs/ext4/mballoc-test.c
@@ -0,0 +1,349 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit test of ext4 multiblocks allocation.
+ */
+
+#include <kunit/test.h>
+#include <kunit/static_stub.h>
+
+#include "ext4.h"
+
+struct mbt_grp_ctx {
+ struct buffer_head bitmap_bh;
+ /* desc and gd_bh are just placeholders for now */
+ struct ext4_group_desc desc;
+ struct buffer_head gd_bh;
+};
+
+struct mbt_ctx {
+ struct mbt_grp_ctx *grp_ctx;
+};
+
+struct mbt_ext4_super_block {
+ struct super_block sb;
+ struct mbt_ctx mbt_ctx;
+};
+
+#define MBT_CTX(_sb) (&(container_of((_sb), struct mbt_ext4_super_block, sb)->mbt_ctx))
+#define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group])
+
+static struct super_block *mbt_ext4_alloc_super_block(void)
+{
+ struct ext4_super_block *es = kzalloc(sizeof(*es), GFP_KERNEL);
+ struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+ struct mbt_ext4_super_block *fsb = kzalloc(sizeof(*fsb), GFP_KERNEL);
+
+ if (fsb == NULL || sbi == NULL || es == NULL)
+ goto out;
+
+ sbi->s_es = es;
+ fsb->sb.s_fs_info = sbi;
+ return &fsb->sb;
+
+out:
+ kfree(fsb);
+ kfree(sbi);
+ kfree(es);
+ return NULL;
+}
+
+static void mbt_ext4_free_super_block(struct super_block *sb)
+{
+ struct mbt_ext4_super_block *fsb =
+ container_of(sb, struct mbt_ext4_super_block, sb);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ kfree(sbi->s_es);
+ kfree(sbi);
+ kfree(fsb);
+}
+
+struct mbt_ext4_block_layout {
+ unsigned char blocksize_bits;
+ unsigned int cluster_bits;
+ uint32_t blocks_per_group;
+ ext4_group_t group_count;
+ uint16_t desc_size;
+};
+
+static void mbt_init_sb_layout(struct super_block *sb,
+ struct mbt_ext4_block_layout *layout)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+
+ sb->s_blocksize = 1UL << layout->blocksize_bits;
+ sb->s_blocksize_bits = layout->blocksize_bits;
+
+ sbi->s_groups_count = layout->group_count;
+ sbi->s_blocks_per_group = layout->blocks_per_group;
+ sbi->s_cluster_bits = layout->cluster_bits;
+ sbi->s_cluster_ratio = 1U << layout->cluster_bits;
+ sbi->s_clusters_per_group = layout->blocks_per_group >>
+ layout->cluster_bits;
+ sbi->s_desc_size = layout->desc_size;
+
+ es->s_first_data_block = cpu_to_le32(0);
+ es->s_blocks_count_lo = cpu_to_le32(layout->blocks_per_group *
+ layout->group_count);
+}
+
+static int mbt_grp_ctx_init(struct super_block *sb,
+ struct mbt_grp_ctx *grp_ctx)
+{
+ grp_ctx->bitmap_bh.b_data = kzalloc(EXT4_BLOCK_SIZE(sb), GFP_KERNEL);
+ if (grp_ctx->bitmap_bh.b_data == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void mbt_grp_ctx_release(struct mbt_grp_ctx *grp_ctx)
+{
+ kfree(grp_ctx->bitmap_bh.b_data);
+ grp_ctx->bitmap_bh.b_data = NULL;
+}
+
+static void mbt_ctx_mark_used(struct super_block *sb, ext4_group_t group,
+ unsigned int start, unsigned int len)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group);
+
+ mb_set_bits(grp_ctx->bitmap_bh.b_data, start, len);
+}
+
+/* called after mbt_init_sb_layout */
+static int mbt_ctx_init(struct super_block *sb)
+{
+ struct mbt_ctx *ctx = MBT_CTX(sb);
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+
+ ctx->grp_ctx = kcalloc(ngroups, sizeof(struct mbt_grp_ctx),
+ GFP_KERNEL);
+ if (ctx->grp_ctx == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < ngroups; i++)
+ if (mbt_grp_ctx_init(sb, &ctx->grp_ctx[i]))
+ goto out;
+
+ /*
+ * The first data block (the first cluster in the first group) is used by
+ * metadata; mark it used so we don't allocate a data block there, which
+ * would fail the ext4_sb_block_valid() check.
+ */
+ mb_set_bits(ctx->grp_ctx[0].bitmap_bh.b_data, 0, 1);
+
+ return 0;
+out:
+ while (i-- > 0)
+ mbt_grp_ctx_release(&ctx->grp_ctx[i]);
+ kfree(ctx->grp_ctx);
+ return -ENOMEM;
+}
+
+static void mbt_ctx_release(struct super_block *sb)
+{
+ struct mbt_ctx *ctx = MBT_CTX(sb);
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+
+ for (i = 0; i < ngroups; i++)
+ mbt_grp_ctx_release(&ctx->grp_ctx[i]);
+ kfree(ctx->grp_ctx);
+}
+
+static struct buffer_head *
+ext4_read_block_bitmap_nowait_stub(struct super_block *sb, ext4_group_t block_group,
+ bool ignore_locked)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group);
+
+ /* paired with brelse from caller of ext4_read_block_bitmap_nowait */
+ get_bh(&grp_ctx->bitmap_bh);
+ return &grp_ctx->bitmap_bh;
+}
+
+static int ext4_wait_block_bitmap_stub(struct super_block *sb,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
+{
+ return 0;
+}
+
+static struct ext4_group_desc *
+ext4_get_group_desc_stub(struct super_block *sb, ext4_group_t block_group,
+ struct buffer_head **bh)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group);
+
+ if (bh != NULL)
+ *bh = &grp_ctx->gd_bh;
+
+ return &grp_ctx->desc;
+}
+
+static int
+ext4_mb_mark_context_stub(handle_t *handle, struct super_block *sb, bool state,
+ ext4_group_t group, ext4_grpblk_t blkoff,
+ ext4_grpblk_t len, int flags,
+ ext4_grpblk_t *ret_changed)
+{
+ struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group);
+ struct buffer_head *bitmap_bh = &grp_ctx->bitmap_bh;
+
+ if (state)
+ mb_set_bits(bitmap_bh->b_data, blkoff, len);
+ else
+ mb_clear_bits(bitmap_bh->b_data, blkoff, len);
+
+ return 0;
+}
+
+#define TEST_GOAL_GROUP 1
+static int mbt_kunit_init(struct kunit *test)
+{
+ struct mbt_ext4_block_layout *layout =
+ (struct mbt_ext4_block_layout *)(test->param_value);
+ struct super_block *sb;
+ int ret;
+
+ sb = mbt_ext4_alloc_super_block();
+ if (sb == NULL)
+ return -ENOMEM;
+
+ mbt_init_sb_layout(sb, layout);
+
+ ret = mbt_ctx_init(sb);
+ if (ret != 0) {
+ mbt_ext4_free_super_block(sb);
+ return ret;
+ }
+
+ test->priv = sb;
+ kunit_activate_static_stub(test,
+ ext4_read_block_bitmap_nowait,
+ ext4_read_block_bitmap_nowait_stub);
+ kunit_activate_static_stub(test,
+ ext4_wait_block_bitmap,
+ ext4_wait_block_bitmap_stub);
+ kunit_activate_static_stub(test,
+ ext4_get_group_desc,
+ ext4_get_group_desc_stub);
+ kunit_activate_static_stub(test,
+ ext4_mb_mark_context,
+ ext4_mb_mark_context_stub);
+ return 0;
+}
+
+static void mbt_kunit_exit(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+
+ mbt_ctx_release(sb);
+ mbt_ext4_free_super_block(sb);
+}
+
+static void test_new_blocks_simple(struct kunit *test)
+{
+ struct super_block *sb = (struct super_block *)test->priv;
+ struct inode inode = { .i_sb = sb, };
+ struct ext4_allocation_request ar;
+ ext4_group_t i, goal_group = TEST_GOAL_GROUP;
+ int err = 0;
+ ext4_fsblk_t found;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ ar.inode = &inode;
+
+ /* get block at goal */
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_EQ_MSG(test, ar.goal, found,
+ "failed to alloc block at goal, expected %llu found %llu",
+ ar.goal, found);
+
+ /* get block after goal in goal group */
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_EQ_MSG(test, ar.goal + EXT4_C2B(sbi, 1), found,
+ "failed to alloc block after goal in goal group, expected %llu found %llu",
+ ar.goal + EXT4_C2B(sbi, 1), found);
+
+ /* get block after goal group */
+ mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_EQ_MSG(test,
+ ext4_group_first_block_no(sb, goal_group + 1), found,
+ "failed to alloc block after goal group, expected %llu found %llu",
+ ext4_group_first_block_no(sb, goal_group + 1), found);
+
+ /* get block before goal group */
+ for (i = goal_group; i < ext4_get_groups_count(sb); i++)
+ mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_EQ_MSG(test,
+ ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found,
+ "failed to alloc block before goal group, expected %llu found %llu",
+ ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found);
+
+ /* no block available, fail to allocate block */
+ for (i = 0; i < ext4_get_groups_count(sb); i++)
+ mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+ ar.goal = ext4_group_first_block_no(sb, goal_group);
+ found = ext4_mb_new_blocks_simple(&ar, &err);
+ KUNIT_ASSERT_NE_MSG(test, err, 0,
+ "unexpectedly get block when no block is available");
+}
+
+static const struct mbt_ext4_block_layout mbt_test_layouts[] = {
+ {
+ .blocksize_bits = 10,
+ .cluster_bits = 3,
+ .blocks_per_group = 8192,
+ .group_count = 4,
+ .desc_size = 64,
+ },
+ {
+ .blocksize_bits = 12,
+ .cluster_bits = 3,
+ .blocks_per_group = 8192,
+ .group_count = 4,
+ .desc_size = 64,
+ },
+ {
+ .blocksize_bits = 16,
+ .cluster_bits = 3,
+ .blocks_per_group = 8192,
+ .group_count = 4,
+ .desc_size = 64,
+ },
+};
+
+static void mbt_show_layout(const struct mbt_ext4_block_layout *layout,
+ char *desc)
+{
+ snprintf(desc, KUNIT_PARAM_DESC_SIZE, "block_bits=%d cluster_bits=%d "
+ "blocks_per_group=%d group_count=%d desc_size=%d\n",
+ layout->blocksize_bits, layout->cluster_bits,
+ layout->blocks_per_group, layout->group_count,
+ layout->desc_size);
+}
+KUNIT_ARRAY_PARAM(mbt_layouts, mbt_test_layouts, mbt_show_layout);
+
+static struct kunit_case mbt_test_cases[] = {
+ KUNIT_CASE_PARAM(test_new_blocks_simple, mbt_layouts_gen_params),
+ {}
+};
+
+static struct kunit_suite mbt_test_suite = {
+ .name = "ext4_mballoc_test",
+ .init = mbt_kunit_init,
+ .exit = mbt_kunit_exit,
+ .test_cases = mbt_test_cases,
+};
+
+kunit_test_suites(&mbt_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 1e599305d85f..d72b5e3c92ec 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -18,6 +18,7 @@
#include <linux/backing-dev.h>
#include <linux/freezer.h>
#include <trace/events/ext4.h>
+#include <kunit/static_stub.h>
/*
* MUSTDO:
@@ -417,8 +418,6 @@ static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
-static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
- ext4_group_t group);
static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
@@ -1361,17 +1360,17 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
* We place the buddy block and bitmap block
* close together
*/
+ grinfo = ext4_get_group_info(sb, group);
+ if (!grinfo) {
+ err = -EFSCORRUPTED;
+ goto out;
+ }
if ((first_block + i) & 1) {
/* this is block of buddy */
BUG_ON(incore == NULL);
mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
trace_ext4_mb_buddy_bitmap_load(sb, group);
- grinfo = ext4_get_group_info(sb, group);
- if (!grinfo) {
- err = -EFSCORRUPTED;
- goto out;
- }
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0,
sizeof(*grinfo->bb_counters) *
@@ -1398,7 +1397,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
/* mark all preallocated blks used in in-core bitmap */
ext4_mb_generate_from_pa(sb, data, group);
- ext4_mb_generate_from_freelist(sb, data, group);
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root));
ext4_unlock_group(sb, group);
/* set incore so that the buddy information can be
@@ -3631,7 +3630,8 @@ int ext4_mb_init(struct super_block *sb)
spin_lock_init(&sbi->s_md_lock);
sbi->s_mb_free_pending = 0;
- INIT_LIST_HEAD(&sbi->s_freed_data_list);
+ INIT_LIST_HEAD(&sbi->s_freed_data_list[0]);
+ INIT_LIST_HEAD(&sbi->s_freed_data_list[1]);
INIT_LIST_HEAD(&sbi->s_discard_list);
INIT_WORK(&sbi->s_discard_work, ext4_discard_work);
atomic_set(&sbi->s_retry_alloc_pending, 0);
@@ -3883,19 +3883,10 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_free_data *entry, *tmp;
LIST_HEAD(freed_data_list);
- struct list_head *cut_pos = NULL;
+ struct list_head *s_freed_head = &sbi->s_freed_data_list[commit_tid & 1];
bool wake;
- spin_lock(&sbi->s_md_lock);
- list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) {
- if (entry->efd_tid != commit_tid)
- break;
- cut_pos = &entry->efd_list;
- }
- if (cut_pos)
- list_cut_position(&freed_data_list, &sbi->s_freed_data_list,
- cut_pos);
- spin_unlock(&sbi->s_md_lock);
+ list_replace_init(s_freed_head, &freed_data_list);
list_for_each_entry(entry, &freed_data_list, efd_list)
ext4_free_data_in_buddy(sb, entry);
@@ -3953,6 +3944,111 @@ void ext4_exit_mballoc(void)
ext4_groupinfo_destroy_slabs();
}
+#define EXT4_MB_BITMAP_MARKED_CHECK 0x0001
+#define EXT4_MB_SYNC_UPDATE 0x0002
+static int
+ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state,
+ ext4_group_t group, ext4_grpblk_t blkoff,
+ ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct buffer_head *bitmap_bh = NULL;
+ struct ext4_group_desc *gdp;
+ struct buffer_head *gdp_bh;
+ int err;
+ unsigned int i, already, changed = len;
+
+ KUNIT_STATIC_STUB_REDIRECT(ext4_mb_mark_context,
+ handle, sb, state, group, blkoff, len,
+ flags, ret_changed);
+
+ if (ret_changed)
+ *ret_changed = 0;
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh))
+ return PTR_ERR(bitmap_bh);
+
+ if (handle) {
+ BUFFER_TRACE(bitmap_bh, "getting write access");
+ err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out_err;
+ }
+
+ err = -EIO;
+ gdp = ext4_get_group_desc(sb, group, &gdp_bh);
+ if (!gdp)
+ goto out_err;
+
+ if (handle) {
+ BUFFER_TRACE(gdp_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sb, gdp_bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out_err;
+ }
+
+ ext4_lock_group(sb, group);
+ if (ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb, group, gdp));
+ }
+
+ if (flags & EXT4_MB_BITMAP_MARKED_CHECK) {
+ already = 0;
+ for (i = 0; i < len; i++)
+ if (mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
+ state)
+ already++;
+ changed = len - already;
+ }
+
+ if (state) {
+ mb_set_bits(bitmap_bh->b_data, blkoff, len);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_group_clusters(sb, gdp) - changed);
+ } else {
+ mb_clear_bits(bitmap_bh->b_data, blkoff, len);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_group_clusters(sb, gdp) + changed);
+ }
+
+ ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
+ ext4_unlock_group(sb, group);
+ if (ret_changed)
+ *ret_changed = changed;
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, group);
+ struct flex_groups *fg = sbi_array_rcu_deref(sbi,
+ s_flex_groups, flex_group);
+
+ if (state)
+ atomic64_sub(changed, &fg->free_clusters);
+ else
+ atomic64_add(changed, &fg->free_clusters);
+ }
+
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+ if (err)
+ goto out_err;
+ err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
+ if (err)
+ goto out_err;
+
+ if (flags & EXT4_MB_SYNC_UPDATE) {
+ sync_dirty_buffer(bitmap_bh);
+ sync_dirty_buffer(gdp_bh);
+ }
+
+out_err:
+ brelse(bitmap_bh);
+ return err;
+}
/*
* Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
@@ -3962,13 +4058,13 @@ static noinline_for_stack int
ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
handle_t *handle, unsigned int reserv_clstrs)
{
- struct buffer_head *bitmap_bh = NULL;
struct ext4_group_desc *gdp;
- struct buffer_head *gdp_bh;
struct ext4_sb_info *sbi;
struct super_block *sb;
ext4_fsblk_t block;
int err, len;
+ int flags = 0;
+ ext4_grpblk_t changed;
BUG_ON(ac->ac_status != AC_STATUS_FOUND);
BUG_ON(ac->ac_b_ex.fe_len <= 0);
@@ -3976,32 +4072,13 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
- bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
- if (IS_ERR(bitmap_bh)) {
- return PTR_ERR(bitmap_bh);
- }
-
- BUFFER_TRACE(bitmap_bh, "getting write access");
- err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
- EXT4_JTR_NONE);
- if (err)
- goto out_err;
-
- err = -EIO;
- gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
+ gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, NULL);
if (!gdp)
- goto out_err;
-
+ return -EIO;
ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
ext4_free_group_clusters(sb, gdp));
- BUFFER_TRACE(gdp_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE);
- if (err)
- goto out_err;
-
block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
-
len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
@@ -4010,41 +4087,29 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
* Fix the bitmap and return EFSCORRUPTED
* We leak some of the blocks here.
*/
- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
- mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
- ac->ac_b_ex.fe_len);
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+ err = ext4_mb_mark_context(handle, sb, true,
+ ac->ac_b_ex.fe_group,
+ ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len,
+ 0, NULL);
if (!err)
err = -EFSCORRUPTED;
- goto out_err;
+ return err;
}
- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
#ifdef AGGRESSIVE_CHECK
- {
- int i;
- for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
- BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
- bitmap_bh->b_data));
- }
- }
+ flags |= EXT4_MB_BITMAP_MARKED_CHECK;
#endif
- mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
- ac->ac_b_ex.fe_len);
- if (ext4_has_group_desc_csum(sb) &&
- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- ext4_free_group_clusters_set(sb, gdp,
- ext4_free_clusters_after_init(sb,
- ac->ac_b_ex.fe_group, gdp));
- }
- len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
- ext4_free_group_clusters_set(sb, gdp, len);
- ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
- ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
+ err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group,
+ ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len,
+ flags, &changed);
+
+ if (err && changed == 0)
+ return err;
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+#ifdef AGGRESSIVE_CHECK
+ BUG_ON(changed != ac->ac_b_ex.fe_len);
+#endif
percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
/*
* Now reduce the dirty block count also. Should not go negative
@@ -4054,21 +4119,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
percpu_counter_sub(&sbi->s_dirtyclusters_counter,
reserv_clstrs);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi,
- ac->ac_b_ex.fe_group);
- atomic64_sub(ac->ac_b_ex.fe_len,
- &sbi_array_rcu_deref(sbi, s_flex_groups,
- flex_group)->free_clusters);
- }
-
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
- if (err)
- goto out_err;
- err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
-
-out_err:
- brelse(bitmap_bh);
return err;
}
@@ -4077,17 +4127,13 @@ out_err:
* blocks in bitmaps and update counters.
*/
void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
- int len, int state)
+ int len, bool state)
{
- struct buffer_head *bitmap_bh = NULL;
- struct ext4_group_desc *gdp;
- struct buffer_head *gdp_bh;
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_group_t group;
ext4_grpblk_t blkoff;
- int i, err = 0;
- int already;
- unsigned int clen, clen_changed, thisgrp_len;
+ int err = 0;
+ unsigned int clen, thisgrp_len;
while (len > 0) {
ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
@@ -4108,80 +4154,21 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
ext4_error(sb, "Marking blocks in system zone - "
"Block = %llu, len = %u",
block, thisgrp_len);
- bitmap_bh = NULL;
- break;
- }
-
- bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
- bitmap_bh = NULL;
- break;
- }
-
- err = -EIO;
- gdp = ext4_get_group_desc(sb, group, &gdp_bh);
- if (!gdp)
break;
-
- ext4_lock_group(sb, group);
- already = 0;
- for (i = 0; i < clen; i++)
- if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
- !state)
- already++;
-
- clen_changed = clen - already;
- if (state)
- mb_set_bits(bitmap_bh->b_data, blkoff, clen);
- else
- mb_clear_bits(bitmap_bh->b_data, blkoff, clen);
- if (ext4_has_group_desc_csum(sb) &&
- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- ext4_free_group_clusters_set(sb, gdp,
- ext4_free_clusters_after_init(sb, group, gdp));
}
- if (state)
- clen = ext4_free_group_clusters(sb, gdp) - clen_changed;
- else
- clen = ext4_free_group_clusters(sb, gdp) + clen_changed;
-
- ext4_free_group_clusters_set(sb, gdp, clen);
- ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
- ext4_group_desc_csum_set(sb, group, gdp);
-
- ext4_unlock_group(sb, group);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, group);
- struct flex_groups *fg = sbi_array_rcu_deref(sbi,
- s_flex_groups, flex_group);
-
- if (state)
- atomic64_sub(clen_changed, &fg->free_clusters);
- else
- atomic64_add(clen_changed, &fg->free_clusters);
-
- }
-
- err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
- if (err)
- break;
- sync_dirty_buffer(bitmap_bh);
- err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
- sync_dirty_buffer(gdp_bh);
+ err = ext4_mb_mark_context(NULL, sb, state,
+ group, blkoff, clen,
+ EXT4_MB_BITMAP_MARKED_CHECK |
+ EXT4_MB_SYNC_UPDATE,
+ NULL);
if (err)
break;
block += thisgrp_len;
len -= thisgrp_len;
- brelse(bitmap_bh);
BUG_ON(len < 0);
}
-
- if (err)
- brelse(bitmap_bh);
}
/*
@@ -4491,6 +4478,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
start = max(start, rounddown(ac->ac_o_ex.fe_logical,
(ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));
+ /* avoid unnecessary preallocation that may trigger assertions */
+ if (start + size > EXT_MAX_BLOCKS)
+ size = EXT_MAX_BLOCKS - start;
+
/* don't cover already allocated blocks in selected range */
if (ar->pleft && start <= ar->lleft) {
size -= ar->lleft + 1 - start;
@@ -4959,31 +4950,6 @@ try_group_pa:
}
/*
- * the function goes through all block freed in the group
- * but not yet committed and marks them used in in-core bitmap.
- * buddy must be generated from this bitmap
- * Need to be called with the ext4 group lock held
- */
-static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
- ext4_group_t group)
-{
- struct rb_node *n;
- struct ext4_group_info *grp;
- struct ext4_free_data *entry;
-
- grp = ext4_get_group_info(sb, group);
- if (!grp)
- return;
- n = rb_first(&(grp->bb_free_root));
-
- while (n) {
- entry = rb_entry(n, struct ext4_free_data, efd_node);
- mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
- n = rb_next(n);
- }
-}
-
-/*
* the function goes through all preallocation in this group and marks them
* used in in-core bitmap. buddy must be generated from this bitmap
* Need to be called with ext4 group lock held
@@ -6130,7 +6096,7 @@ ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp)
}
block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i);
- ext4_mb_mark_bb(sb, block, 1, 1);
+ ext4_mb_mark_bb(sb, block, 1, true);
ar->len = 1;
return block;
@@ -6378,7 +6344,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
}
spin_lock(&sbi->s_md_lock);
- list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
+ list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]);
sbi->s_mb_free_pending += clusters;
spin_unlock(&sbi->s_md_lock);
}
@@ -6386,43 +6352,15 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
unsigned long count)
{
- struct buffer_head *bitmap_bh;
struct super_block *sb = inode->i_sb;
- struct ext4_group_desc *gdp;
- struct buffer_head *gdp_bh;
ext4_group_t group;
ext4_grpblk_t blkoff;
- int already_freed = 0, err, i;
ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
- bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (IS_ERR(bitmap_bh)) {
- pr_warn("Failed to read block bitmap\n");
- return;
- }
- gdp = ext4_get_group_desc(sb, group, &gdp_bh);
- if (!gdp)
- goto err_out;
-
- for (i = 0; i < count; i++) {
- if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
- already_freed++;
- }
- mb_clear_bits(bitmap_bh->b_data, blkoff, count);
- err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
- if (err)
- goto err_out;
- ext4_free_group_clusters_set(
- sb, gdp, ext4_free_group_clusters(sb, gdp) +
- count - already_freed);
- ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
- ext4_group_desc_csum_set(sb, group, gdp);
- ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
- sync_dirty_buffer(bitmap_bh);
- sync_dirty_buffer(gdp_bh);
-
-err_out:
- brelse(bitmap_bh);
+ ext4_mb_mark_context(NULL, sb, false, group, blkoff, count,
+ EXT4_MB_BITMAP_MARKED_CHECK |
+ EXT4_MB_SYNC_UPDATE,
+ NULL);
}
/**
@@ -6438,19 +6376,17 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count,
int flags)
{
- struct buffer_head *bitmap_bh = NULL;
struct super_block *sb = inode->i_sb;
- struct ext4_group_desc *gdp;
struct ext4_group_info *grp;
unsigned int overflow;
ext4_grpblk_t bit;
- struct buffer_head *gd_bh;
ext4_group_t block_group;
struct ext4_sb_info *sbi;
struct ext4_buddy e4b;
unsigned int count_clusters;
int err = 0;
- int ret;
+ int mark_flags = 0;
+ ext4_grpblk_t changed;
sbi = EXT4_SB(sb);
@@ -6459,7 +6395,7 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
ext4_error(sb, "Freeing blocks in system zone - "
"Block = %llu, count = %lu", block, count);
/* err = 0. ext4_std_error should be a no op */
- goto error_return;
+ goto error_out;
}
flags |= EXT4_FREE_BLOCKS_VALIDATED;
@@ -6483,55 +6419,35 @@ do_more:
flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
}
count_clusters = EXT4_NUM_B2C(sbi, count);
- bitmap_bh = ext4_read_block_bitmap(sb, block_group);
- if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
- bitmap_bh = NULL;
- goto error_return;
- }
- gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
- if (!gdp) {
- err = -EIO;
- goto error_return;
- }
+ trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
+
+ /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
+ err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
+ GFP_NOFS|__GFP_NOFAIL);
+ if (err)
+ goto error_out;
if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
!ext4_inode_block_valid(inode, block, count)) {
ext4_error(sb, "Freeing blocks in system zone - "
"Block = %llu, count = %lu", block, count);
/* err = 0. ext4_std_error should be a no op */
- goto error_return;
+ goto error_clean;
}
- BUFFER_TRACE(bitmap_bh, "getting write access");
- err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
- EXT4_JTR_NONE);
- if (err)
- goto error_return;
-
- /*
- * We are about to modify some metadata. Call the journal APIs
- * to unshare ->b_data if a currently-committing transaction is
- * using it
- */
- BUFFER_TRACE(gd_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
- if (err)
- goto error_return;
#ifdef AGGRESSIVE_CHECK
- {
- int i;
- for (i = 0; i < count_clusters; i++)
- BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
- }
+ mark_flags |= EXT4_MB_BITMAP_MARKED_CHECK;
#endif
- trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
+ err = ext4_mb_mark_context(handle, sb, false, block_group, bit,
+ count_clusters, mark_flags, &changed);
- /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
- err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
- GFP_NOFS|__GFP_NOFAIL);
- if (err)
- goto error_return;
+
+ if (err && changed == 0)
+ goto error_clean;
+
+#ifdef AGGRESSIVE_CHECK
+ BUG_ON(changed != count_clusters);
+#endif
/*
* We need to make sure we don't reuse the freed block until after the
@@ -6555,13 +6471,8 @@ do_more:
new_entry->efd_tid = handle->h_transaction->t_tid;
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
ext4_mb_free_metadata(handle, &e4b, new_entry);
} else {
- /* need to update group_info->bb_free and bitmap
- * with group lock held. generate_buddy look at
- * them with group lock_held
- */
if (test_opt(sb, DISCARD)) {
err = ext4_issue_discard(sb, block_group, bit,
count_clusters, NULL);
@@ -6574,23 +6485,11 @@ do_more:
EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
mb_free_blocks(inode, &e4b, bit, count_clusters);
}
- ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
- ext4_free_group_clusters_set(sb, gdp, ret);
- ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
- ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic64_add(count_clusters,
- &sbi_array_rcu_deref(sbi, s_flex_groups,
- flex_group)->free_clusters);
- }
-
/*
* on a bigalloc file system, defer the s_freeclusters_counter
* update to the caller (ext4_remove_space and friends) so they
@@ -6603,28 +6502,18 @@ do_more:
count_clusters);
}
- ext4_mb_unload_buddy(&e4b);
-
- /* We dirtied the bitmap block */
- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
- /* And the group descriptor block */
- BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
- ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
- if (!err)
- err = ret;
-
if (overflow && !err) {
block += count;
count = overflow;
- put_bh(bitmap_bh);
+ ext4_mb_unload_buddy(&e4b);
/* The range changed so it's no longer validated */
flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
goto do_more;
}
-error_return:
- brelse(bitmap_bh);
+
+error_clean:
+ ext4_mb_unload_buddy(&e4b);
+error_out:
ext4_std_error(sb, err);
}
@@ -6742,23 +6631,19 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_fsblk_t block, unsigned long count)
{
- struct buffer_head *bitmap_bh = NULL;
- struct buffer_head *gd_bh;
ext4_group_t block_group;
ext4_grpblk_t bit;
- unsigned int i;
- struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_buddy e4b;
- int err = 0, ret, free_clusters_count;
- ext4_grpblk_t clusters_freed;
+ int err = 0;
ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block);
ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1);
unsigned long cluster_count = last_cluster - first_cluster + 1;
+ ext4_grpblk_t changed;
ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
- if (count == 0)
+ if (cluster_count == 0)
return 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -6770,99 +6655,39 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
ext4_warning(sb, "too many blocks added to group %u",
block_group);
err = -EINVAL;
- goto error_return;
- }
-
- bitmap_bh = ext4_read_block_bitmap(sb, block_group);
- if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
- bitmap_bh = NULL;
- goto error_return;
+ goto error_out;
}
- desc = ext4_get_group_desc(sb, block_group, &gd_bh);
- if (!desc) {
- err = -EIO;
- goto error_return;
- }
+ err = ext4_mb_load_buddy(sb, block_group, &e4b);
+ if (err)
+ goto error_out;
if (!ext4_sb_block_valid(sb, NULL, block, count)) {
ext4_error(sb, "Adding blocks in system zones - "
"Block = %llu, count = %lu",
block, count);
err = -EINVAL;
- goto error_return;
+ goto error_clean;
}
- BUFFER_TRACE(bitmap_bh, "getting write access");
- err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
- EXT4_JTR_NONE);
- if (err)
- goto error_return;
-
- /*
- * We are about to modify some metadata. Call the journal APIs
- * to unshare ->b_data if a currently-committing transaction is
- * using it
- */
- BUFFER_TRACE(gd_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
- if (err)
- goto error_return;
-
- for (i = 0, clusters_freed = 0; i < cluster_count; i++) {
- BUFFER_TRACE(bitmap_bh, "clear bit");
- if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
- ext4_error(sb, "bit already cleared for block %llu",
- (ext4_fsblk_t)(block + i));
- BUFFER_TRACE(bitmap_bh, "bit already cleared");
- } else {
- clusters_freed++;
- }
- }
+ err = ext4_mb_mark_context(handle, sb, false, block_group, bit,
+ cluster_count, EXT4_MB_BITMAP_MARKED_CHECK,
+ &changed);
+ if (err && changed == 0)
+ goto error_clean;
- err = ext4_mb_load_buddy(sb, block_group, &e4b);
- if (err)
- goto error_return;
+ if (changed != cluster_count)
+ ext4_error(sb, "bit already cleared in group %u", block_group);
- /*
- * need to update group_info->bb_free and bitmap
- * with group lock held. generate_buddy look at
- * them with group lock_held
- */
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, cluster_count);
mb_free_blocks(NULL, &e4b, bit, cluster_count);
- free_clusters_count = clusters_freed +
- ext4_free_group_clusters(sb, desc);
- ext4_free_group_clusters_set(sb, desc, free_clusters_count);
- ext4_block_bitmap_csum_set(sb, desc, bitmap_bh);
- ext4_group_desc_csum_set(sb, block_group, desc);
ext4_unlock_group(sb, block_group);
percpu_counter_add(&sbi->s_freeclusters_counter,
- clusters_freed);
-
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic64_add(clusters_freed,
- &sbi_array_rcu_deref(sbi, s_flex_groups,
- flex_group)->free_clusters);
- }
+ changed);
+error_clean:
ext4_mb_unload_buddy(&e4b);
-
- /* We dirtied the bitmap block */
- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
- /* And the group descriptor block */
- BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
- ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
- if (!err)
- err = ret;
-
-error_return:
- brelse(bitmap_bh);
+error_out:
ext4_std_error(sb, err);
return err;
}
@@ -7170,3 +6995,7 @@ out_unload:
return error;
}
+
+#ifdef CONFIG_EXT4_KUNIT_TESTS
+#include "mballoc-test.c"
+#endif
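The mballoc hunks above replace several open-coded sequences (read the block
bitmap, get journal write access, set or clear bits under the group lock,
update the group descriptor and its checksums, dirty and optionally sync the
buffers) with calls to a single helper. A condensed sketch of the call shape,
taken from the call sites in this excerpt (the helper's definition is outside
this excerpt, so the prototype and the exact flag semantics are inferred):

	/* state: true = mark clusters in use, false = mark them free */
	err = ext4_mb_mark_context(handle, sb, state, group, blkoff, len,
				   EXT4_MB_BITMAP_MARKED_CHECK |	/* count bits actually flipped */
				   EXT4_MB_SYNC_UPDATE,			/* sync the dirtied buffers */
				   &changed);				/* may be NULL */
	if (err && changed == 0)
		return err;

Callers that only need the bitmap and descriptor updated pass NULL for the
changed count (ext4_mb_mark_bb(), ext4_free_blocks_simple()); callers that must
tell a partial update from a clean failure, such as ext4_mb_mark_diskspace_used()
and ext4_mb_clear_bb(), check the changed count before bailing out.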
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 18a9e7c47975..3aa57376d9c2 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -183,10 +183,8 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to)
blocksize = i_blocksize(inode);
head = folio_buffers(folio);
- if (!head) {
- create_empty_buffers(&folio->page, blocksize, 0);
- head = folio_buffers(folio);
- }
+ if (!head)
+ head = create_empty_buffers(folio, blocksize, 0);
block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits);
for (bh = head, block_start = 0; bh != head || !block_start;
@@ -380,9 +378,10 @@ data_copy:
}
/* Perform all necessary steps similar write_begin()/write_end()
* but keeping in mind that i_size will not change */
- if (!folio_buffers(folio[0]))
- create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0);
bh = folio_buffers(folio[0]);
+ if (!bh)
+ bh = create_empty_buffers(folio[0],
+ 1 << orig_inode->i_blkbits, 0);
for (i = 0; i < data_offset_in_page; i++)
bh = bh->b_this_page;
for (i = 0; i < block_len_in_page; i++) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index bbda587f76b8..d252935f9c8a 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2207,7 +2207,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
ext4_update_dx_flag(dir);
inode_inc_iversion(dir);
err2 = ext4_mark_inode_dirty(handle, dir);
@@ -2280,8 +2280,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
top = data2 + len;
while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) {
if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len,
- (data2 + (blocksize - csum_size) -
- (char *) de))) {
+ (char *)de - data2)) {
brelse(bh2);
brelse(bh);
return -EFSCORRUPTED;
@@ -3202,7 +3201,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
* recovery. */
inode->i_size = 0;
ext4_orphan_add(handle, inode);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_set_ctime_current(inode);
retval = ext4_mark_inode_dirty(handle, inode);
if (retval)
@@ -3277,7 +3276,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto out_handle;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
ext4_update_dx_flag(dir);
retval = ext4_mark_inode_dirty(handle, dir);
if (retval)
@@ -3648,7 +3647,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
if (ext4_has_feature_filetype(ent->dir->i_sb))
ent->de->file_type = file_type;
inode_inc_iversion(ent->dir);
- ent->dir->i_mtime = inode_set_ctime_current(ent->dir);
+ inode_set_mtime_to_ts(ent->dir, inode_set_ctime_current(ent->dir));
retval = ext4_mark_inode_dirty(handle, ent->dir);
BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
if (!ent->inlined) {
@@ -3963,7 +3962,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
ext4_dec_count(new.inode);
inode_set_ctime_current(new.inode);
}
- old.dir->i_mtime = inode_set_ctime_current(old.dir);
+ inode_set_mtime_to_ts(old.dir, inode_set_ctime_current(old.dir));
ext4_update_dx_flag(old.dir);
if (old.dir_bh) {
retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 3e7d160f543f..21e8f0aebb3c 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -70,15 +70,8 @@ static void __read_end_io(struct bio *bio)
{
struct folio_iter fi;
- bio_for_each_folio_all(fi, bio) {
- struct folio *folio = fi.folio;
-
- if (bio->bi_status)
- folio_clear_uptodate(folio);
- else
- folio_mark_uptodate(folio);
- folio_unlock(folio);
- }
+ bio_for_each_folio_all(fi, bio)
+ folio_end_read(fi.folio, bio->bi_status == 0);
if (bio->bi_private)
mempool_free(bio->bi_private, bio_post_read_ctx_pool);
bio_put(bio);
@@ -336,8 +329,7 @@ int ext4_mpage_readpages(struct inode *inode,
if (ext4_need_verity(inode, folio->index) &&
!fsverity_verify_folio(folio))
goto set_error_page;
- folio_mark_uptodate(folio);
- folio_unlock(folio);
+ folio_end_read(folio, true);
continue;
}
} else if (fully_mapped) {
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 0361c20910de..4fe061edefdd 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -10,8 +10,6 @@
*/
-#define EXT4FS_DEBUG
-
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/jiffies.h>
@@ -57,7 +55,7 @@ int ext4_resize_begin(struct super_block *sb)
* If the reserved GDT blocks is non-zero, the resize_inode feature
* should always be set.
*/
- if (EXT4_SB(sb)->s_es->s_reserved_gdt_blocks &&
+ if (sbi->s_es->s_reserved_gdt_blocks &&
!ext4_has_feature_resize_inode(sb)) {
ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero");
return -EFSCORRUPTED;
@@ -69,9 +67,9 @@ int ext4_resize_begin(struct super_block *sb)
* bad time to do it anyways.
*/
if (EXT4_B2C(sbi, sbi->s_sbh->b_blocknr) !=
- le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
+ le32_to_cpu(sbi->s_es->s_first_data_block)) {
ext4_warning(sb, "won't resize using backup superblock at %llu",
- (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
+ (unsigned long long)sbi->s_sbh->b_blocknr);
return -EPERM;
}
@@ -79,7 +77,7 @@ int ext4_resize_begin(struct super_block *sb)
* We are not allowed to do online-resizing on a filesystem mounted
* with error, because it can destroy the filesystem easily.
*/
- if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+ if (sbi->s_mount_state & EXT4_ERROR_FS) {
ext4_warning(sb, "There are errors in the filesystem, "
"so online resizing is not allowed");
return -EPERM;
@@ -91,7 +89,7 @@ int ext4_resize_begin(struct super_block *sb)
}
if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING,
- &EXT4_SB(sb)->s_ext4_flags))
+ &sbi->s_ext4_flags))
ret = -EBUSY;
return ret;
@@ -106,18 +104,6 @@ int ext4_resize_end(struct super_block *sb, bool update_backups)
return 0;
}
-static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb,
- ext4_group_t group) {
- return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) <<
- EXT4_DESC_PER_BLOCK_BITS(sb);
-}
-
-static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb,
- ext4_group_t group) {
- group = ext4_meta_bg_first_group(sb, group);
- return ext4_group_first_block_no(sb, group);
-}
-
static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb,
ext4_group_t group) {
ext4_grpblk_t overhead;
@@ -154,8 +140,9 @@ static int verify_group_input(struct super_block *sb,
overhead = ext4_group_overhead_blocks(sb, group);
metaend = start + overhead;
- input->free_clusters_count = free_blocks_count =
- input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
+ free_blocks_count = input->blocks_count - 2 - overhead -
+ sbi->s_itb_per_group;
+ input->free_clusters_count = EXT4_B2C(sbi, free_blocks_count);
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks "
@@ -460,8 +447,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
ext4_debug("mark clusters [%llu-%llu] used\n", first_cluster,
last_cluster);
- for (count2 = count; count > 0;
- count -= count2, first_cluster += count2) {
+ for (; count > 0; count -= count2, first_cluster += count2) {
ext4_fsblk_t start;
struct buffer_head *bh;
ext4_group_t group;
@@ -560,13 +546,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
if (meta_bg == 0 && !ext4_bg_has_super(sb, group))
goto handle_itb;
- if (meta_bg == 1) {
- ext4_group_t first_group;
- first_group = ext4_meta_bg_first_group(sb, group);
- if (first_group != group + 1 &&
- first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1)
- goto handle_itb;
- }
+ if (meta_bg == 1)
+ goto handle_itb;
block = start + ext4_bg_has_super(sb, group);
/* Copy all of the GDT blocks into the backup in this group */
@@ -614,7 +595,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
}
handle_itb:
- /* Initialize group tables of the grop @group */
+ /* Initialize group tables of the group @group */
if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
goto handle_bb;
@@ -704,16 +685,14 @@ handle_ib:
block = start;
}
- if (count) {
- err = set_flexbg_block_bitmap(sb, handle,
- flex_gd,
- EXT4_B2C(sbi, start),
- EXT4_B2C(sbi,
- start + count
- - 1));
- if (err)
- goto out;
- }
+ err = set_flexbg_block_bitmap(sb, handle,
+ flex_gd,
+ EXT4_B2C(sbi, start),
+ EXT4_B2C(sbi,
+ start + count
+ - 1));
+ if (err)
+ goto out;
}
out:
@@ -952,7 +931,13 @@ errout:
}
/*
- * add_new_gdb_meta_bg is the sister of add_new_gdb.
+ * If there is no available space in the existing block group descriptors for
+ * the new block group and there are no reserved block group descriptors, then
+ * the meta_bg feature will get enabled, and es->s_first_meta_bg will get set
+ * to the first block group that is managed using meta_bg; s_first_meta_bg
+ * must be a multiple of EXT4_DESC_PER_BLOCK(sb).
+ * This function is called when the first group of a meta_bg is added, to set
+ * up the new group descriptor block of the newly added meta_bg.
*/
static int add_new_gdb_meta_bg(struct super_block *sb,
handle_t *handle, ext4_group_t group) {
@@ -962,8 +947,8 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
int err;
- gdblock = ext4_meta_bg_first_block_no(sb, group) +
- ext4_bg_has_super(sb, group);
+ gdblock = ext4_group_first_block_no(sb, group) +
+ ext4_bg_has_super(sb, group);
gdb_bh = ext4_sb_bread(sb, gdblock, 0);
if (IS_ERR(gdb_bh))
return PTR_ERR(gdb_bh);
@@ -1087,9 +1072,6 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
for (i = 0; i < reserved_gdb; i++) {
int err2;
data = (__le32 *)primary[i]->b_data;
- /* printk("reserving backup %lu[%u] = %lu\n",
- primary[i]->b_blocknr, gdbackups,
- blk + primary[i]->b_blocknr); */
data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]);
if (!err)
@@ -1191,8 +1173,10 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
ext4_group_first_block_no(sb, group));
BUFFER_TRACE(bh, "get_write_access");
if ((err = ext4_journal_get_write_access(handle, sb, bh,
- EXT4_JTR_NONE)))
+ EXT4_JTR_NONE))) {
+ brelse(bh);
break;
+ }
lock_buffer(bh);
memcpy(bh->b_data, data, size);
if (rest)
@@ -1601,7 +1585,8 @@ exit_journal:
int gdb_num_end = ((group + flex_gd->count - 1) /
EXT4_DESC_PER_BLOCK(sb));
int meta_bg = ext4_has_feature_meta_bg(sb);
- sector_t old_gdb = 0;
+ sector_t padding_blocks = meta_bg ? 0 : sbi->s_sbh->b_blocknr -
+ ext4_group_first_block_no(sb, 0);
update_backups(sb, ext4_group_first_block_no(sb, 0),
(char *)es, sizeof(struct ext4_super_block), 0);
@@ -1610,11 +1595,8 @@ exit_journal:
gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
gdb_num);
- if (old_gdb == gdb_bh->b_blocknr)
- continue;
- update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
- gdb_bh->b_size, meta_bg);
- old_gdb = gdb_bh->b_blocknr;
+ update_backups(sb, gdb_bh->b_blocknr - padding_blocks,
+ gdb_bh->b_data, gdb_bh->b_size, meta_bg);
}
}
exit:
@@ -1980,9 +1962,7 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
errout:
ret = ext4_journal_stop(handle);
- if (!err)
- err = ret;
- return ret;
+ return err ? err : ret;
invalid_resize_inode:
ext4_error(sb, "corrupted/inconsistent resize inode");
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dbebd8b3127e..c5fcf377ab1f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -244,18 +244,25 @@ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
blk_opf_t op_flags)
{
- return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE);
+ gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping,
+ ~__GFP_FS) | __GFP_MOVABLE;
+
+ return __ext4_sb_bread_gfp(sb, block, op_flags, gfp);
}
struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
sector_t block)
{
- return __ext4_sb_bread_gfp(sb, block, 0, 0);
+ gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping,
+ ~__GFP_FS);
+
+ return __ext4_sb_bread_gfp(sb, block, 0, gfp);
}
void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
{
- struct buffer_head *bh = sb_getblk_gfp(sb, block, 0);
+ struct buffer_head *bh = bdev_getblk(sb->s_bdev, block,
+ sb->s_blocksize, GFP_NOWAIT | __GFP_NOWARN);
if (likely(bh)) {
if (trylock_buffer(bh))
@@ -768,7 +775,8 @@ static void update_super_work(struct work_struct *work)
*/
if (!sb_rdonly(sbi->s_sb) && journal) {
struct buffer_head *sbh = sbi->s_sbh;
- bool call_notify_err;
+ bool call_notify_err = false;
+
handle = jbd2_journal_start(journal, 1);
if (IS_ERR(handle))
goto write_directly;
@@ -1351,14 +1359,14 @@ static void ext4_put_super(struct super_block *sb)
sync_blockdev(sb->s_bdev);
invalidate_bdev(sb->s_bdev);
- if (sbi->s_journal_bdev) {
+ if (sbi->s_journal_bdev_handle) {
/*
* Invalidate the journal device's buffers. We don't want them
* floating about in memory - the physical journal device may
* hotswapped, and it breaks the `ro-after' testing code.
*/
- sync_blockdev(sbi->s_journal_bdev);
- invalidate_bdev(sbi->s_journal_bdev);
+ sync_blockdev(sbi->s_journal_bdev_handle->bdev);
+ invalidate_bdev(sbi->s_journal_bdev_handle->bdev);
}
ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
@@ -1646,6 +1654,7 @@ static const struct super_operations ext4_sops = {
};
static const struct export_operations ext4_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = ext4_fh_to_dentry,
.fh_to_parent = ext4_fh_to_parent,
.get_parent = ext4_get_parent,
@@ -4233,7 +4242,7 @@ int ext4_calculate_overhead(struct super_block *sb)
* Add the internal journal blocks whether the journal has been
* loaded or not
*/
- if (sbi->s_journal && !sbi->s_journal_bdev)
+ if (sbi->s_journal && !sbi->s_journal_bdev_handle)
overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
/* j_inum for internal journal is non-zero */
@@ -5670,9 +5679,9 @@ failed_mount:
#endif
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
brelse(sbi->s_sbh);
- if (sbi->s_journal_bdev) {
- invalidate_bdev(sbi->s_journal_bdev);
- blkdev_put(sbi->s_journal_bdev, sb);
+ if (sbi->s_journal_bdev_handle) {
+ invalidate_bdev(sbi->s_journal_bdev_handle->bdev);
+ bdev_release(sbi->s_journal_bdev_handle);
}
out_fail:
invalidate_bdev(sb->s_bdev);
@@ -5842,12 +5851,13 @@ static journal_t *ext4_open_inode_journal(struct super_block *sb,
return journal;
}
-static struct block_device *ext4_get_journal_blkdev(struct super_block *sb,
+static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb,
dev_t j_dev, ext4_fsblk_t *j_start,
ext4_fsblk_t *j_len)
{
struct buffer_head *bh;
struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
int hblock, blocksize;
ext4_fsblk_t sb_block;
unsigned long offset;
@@ -5856,16 +5866,17 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb,
/* see get_tree_bdev why this is needed and safe */
up_write(&sb->s_umount);
- bdev = blkdev_get_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb,
- &fs_holder_ops);
+ bdev_handle = bdev_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ sb, &fs_holder_ops);
down_write(&sb->s_umount);
- if (IS_ERR(bdev)) {
+ if (IS_ERR(bdev_handle)) {
ext4_msg(sb, KERN_ERR,
"failed to open journal device unknown-block(%u,%u) %ld",
- MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev));
- return ERR_CAST(bdev);
+ MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_handle));
+ return bdev_handle;
}
+ bdev = bdev_handle->bdev;
blocksize = sb->s_blocksize;
hblock = bdev_logical_block_size(bdev);
if (blocksize < hblock) {
@@ -5912,12 +5923,12 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb,
*j_start = sb_block + 1;
*j_len = ext4_blocks_count(es);
brelse(bh);
- return bdev;
+ return bdev_handle;
out_bh:
brelse(bh);
out_bdev:
- blkdev_put(bdev, sb);
+ bdev_release(bdev_handle);
return ERR_PTR(errno);
}
@@ -5927,14 +5938,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
journal_t *journal;
ext4_fsblk_t j_start;
ext4_fsblk_t j_len;
- struct block_device *journal_bdev;
+ struct bdev_handle *bdev_handle;
int errno = 0;
- journal_bdev = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
- if (IS_ERR(journal_bdev))
- return ERR_CAST(journal_bdev);
+ bdev_handle = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
+ if (IS_ERR(bdev_handle))
+ return ERR_CAST(bdev_handle);
- journal = jbd2_journal_init_dev(journal_bdev, sb->s_bdev, j_start,
+ journal = jbd2_journal_init_dev(bdev_handle->bdev, sb->s_bdev, j_start,
j_len, sb->s_blocksize);
if (IS_ERR(journal)) {
ext4_msg(sb, KERN_ERR, "failed to create device journal");
@@ -5949,14 +5960,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
goto out_journal;
}
journal->j_private = sb;
- EXT4_SB(sb)->s_journal_bdev = journal_bdev;
+ EXT4_SB(sb)->s_journal_bdev_handle = bdev_handle;
ext4_init_journal_params(sb, journal);
return journal;
out_journal:
jbd2_journal_destroy(journal);
out_bdev:
- blkdev_put(journal_bdev, sb);
+ bdev_release(bdev_handle);
return ERR_PTR(errno);
}
@@ -6442,6 +6453,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
struct ext4_mount_options old_opts;
ext4_group_t g;
int err = 0;
+ int alloc_ctx;
#ifdef CONFIG_QUOTA
int enable_quota = 0;
int i, j;
@@ -6482,7 +6494,16 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
}
+ /*
+ * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause
+ * two calls to ext4_should_dioread_nolock() to return inconsistent
+ * values, triggering WARN_ON in ext4_add_complete_io(). We grab
+ * s_writepages_rwsem here to avoid a race between writepages ops and
+ * remount.
+ */
+ alloc_ctx = ext4_writepages_down_write(sb);
ext4_apply_options(fc, sb);
+ ext4_writepages_up_write(sb, alloc_ctx);
if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
test_opt(sb, JOURNAL_CHECKSUM)) {
@@ -6700,6 +6721,8 @@ restore_opts:
if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) &&
sb_any_quota_suspended(sb))
dquot_resume(sb, -1);
+
+ alloc_ctx = ext4_writepages_down_write(sb);
sb->s_flags = old_sb_flags;
sbi->s_mount_opt = old_opts.s_mount_opt;
sbi->s_mount_opt2 = old_opts.s_mount_opt2;
@@ -6708,6 +6731,8 @@ restore_opts:
sbi->s_commit_interval = old_opts.s_commit_interval;
sbi->s_min_batch_time = old_opts.s_min_batch_time;
sbi->s_max_batch_time = old_opts.s_max_batch_time;
+ ext4_writepages_up_write(sb, alloc_ctx);
+
if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
ext4_release_system_zone(sb);
#ifdef CONFIG_QUOTA
@@ -7127,7 +7152,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
}
EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
err = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
out_unlock:
@@ -7300,12 +7325,12 @@ static inline int ext3_feature_set_ok(struct super_block *sb)
static void ext4_kill_sb(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct block_device *journal_bdev = sbi ? sbi->s_journal_bdev : NULL;
+ struct bdev_handle *handle = sbi ? sbi->s_journal_bdev_handle : NULL;
kill_block_super(sb);
- if (journal_bdev)
- blkdev_put(journal_bdev, sb);
+ if (handle)
+ bdev_release(handle);
}
static struct file_system_type ext4_fs_type = {
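The super.c hunks switch the external journal device from bare block_device
pointers managed with blkdev_get_by_dev()/blkdev_put() to the bdev_handle API,
with the sbi field renamed from s_journal_bdev to s_journal_bdev_handle. The
shape of the conversion, condensed from the hunks above (kernel-internal code,
shown only to summarize the pattern):

	struct bdev_handle *bdev_handle;

	bdev_handle = bdev_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
				       sb, &fs_holder_ops);
	if (IS_ERR(bdev_handle))
		return bdev_handle;
	bdev = bdev_handle->bdev;	/* underlying device, e.g. for jbd2 */
	...
	bdev_release(bdev_handle);	/* replaces blkdev_put(bdev, sb) */

Every consumer that used to dereference the raw block_device (put_super, the
overhead calculation, kill_sb) now reaches it through ->bdev on the handle.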
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 92ba28cebac6..82dc5e673d5c 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -98,7 +98,7 @@ static const struct xattr_handler * const ext4_xattr_handler_map[] = {
[EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler,
};
-const struct xattr_handler *ext4_xattr_handlers[] = {
+const struct xattr_handler * const ext4_xattr_handlers[] = {
&ext4_xattr_user_handler,
&ext4_xattr_trusted_handler,
#ifdef CONFIG_EXT4_FS_SECURITY
@@ -356,7 +356,7 @@ ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
{
- return ((u64) inode_get_ctime(ea_inode).tv_sec << 32) |
+ return ((u64) inode_get_ctime_sec(ea_inode) << 32) |
(u32) inode_peek_iversion_raw(ea_inode);
}
@@ -368,12 +368,12 @@ static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode)
{
- return (u32)ea_inode->i_atime.tv_sec;
+ return (u32) inode_get_atime_sec(ea_inode);
}
static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash)
{
- ea_inode->i_atime.tv_sec = hash;
+ inode_set_atime(ea_inode, hash, 0);
}
/*
@@ -418,7 +418,7 @@ free_bhs:
return ret;
}
-#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec)
+#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode_get_mtime_sec(inode)))
static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
u32 ea_inode_hash, struct inode **ea_inode)
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 824faf0b15a8..bd97c4aa8177 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -193,7 +193,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle);
extern void ext4_evict_ea_inode(struct inode *inode);
-extern const struct xattr_handler *ext4_xattr_handlers[];
+extern const struct xattr_handler * const ext4_xattr_handlers[];
extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is);
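A second pattern that recurs through this series, visible in the namei.c,
super.c and xattr.c hunks above and continued in the f2fs hunks below, is the
move from direct i_atime/i_mtime field access to the inode timestamp accessor
helpers. The before/after shape, condensed from the hunks (illustrative only):

	/* old */
	dir->i_mtime = inode_set_ctime_current(dir);
	inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime);

	/* new */
	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
	inode_set_atime(inode, le64_to_cpu(raw->i_atime),
			le32_to_cpu(raw->i_atime_nsec));

Reads go through inode_get_atime()/inode_get_mtime() or the _sec/_nsec
variants, which is what f2fs_is_time_consistent() and the ext4 xattr hash and
refcount helpers are converted to use.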
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index d820801f473e..36e5dab6baae 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -893,14 +893,15 @@ static bool cluster_has_invalid_data(struct compress_ctx *cc)
bool f2fs_sanity_check_cluster(struct dnode_of_data *dn)
{
+#ifdef CONFIG_F2FS_CHECK_FS
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
unsigned int cluster_size = F2FS_I(dn->inode)->i_cluster_size;
- bool compressed = dn->data_blkaddr == COMPRESS_ADDR;
int cluster_end = 0;
+ unsigned int count;
int i;
char *reason = "";
- if (!compressed)
+ if (dn->data_blkaddr != COMPRESS_ADDR)
return false;
/* [..., COMPR_ADDR, ...] */
@@ -909,7 +910,7 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn)
goto out;
}
- for (i = 1; i < cluster_size; i++) {
+ for (i = 1, count = 1; i < cluster_size; i++, count++) {
block_t blkaddr = data_blkaddr(dn->inode, dn->node_page,
dn->ofs_in_node + i);
@@ -929,19 +930,42 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn)
goto out;
}
}
+
+ f2fs_bug_on(F2FS_I_SB(dn->inode), count != cluster_size &&
+ !is_inode_flag_set(dn->inode, FI_COMPRESS_RELEASED));
+
return false;
out:
f2fs_warn(sbi, "access invalid cluster, ino:%lu, nid:%u, ofs_in_node:%u, reason:%s",
dn->inode->i_ino, dn->nid, dn->ofs_in_node, reason);
set_sbi_flag(sbi, SBI_NEED_FSCK);
return true;
+#else
+ return false;
+#endif
+}
+
+static int __f2fs_get_cluster_blocks(struct inode *inode,
+ struct dnode_of_data *dn)
+{
+ unsigned int cluster_size = F2FS_I(inode)->i_cluster_size;
+ int count, i;
+
+ for (i = 1, count = 1; i < cluster_size; i++) {
+ block_t blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ dn->ofs_in_node + i);
+
+ if (__is_valid_data_blkaddr(blkaddr))
+ count++;
+ }
+
+ return count;
}
static int __f2fs_cluster_blocks(struct inode *inode,
- unsigned int cluster_idx, bool compr)
+ unsigned int cluster_idx, bool compr_blks)
{
struct dnode_of_data dn;
- unsigned int cluster_size = F2FS_I(inode)->i_cluster_size;
unsigned int start_idx = cluster_idx <<
F2FS_I(inode)->i_log_cluster_size;
int ret;
@@ -956,31 +980,14 @@ static int __f2fs_cluster_blocks(struct inode *inode,
if (f2fs_sanity_check_cluster(&dn)) {
ret = -EFSCORRUPTED;
- f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_CLUSTER);
goto fail;
}
if (dn.data_blkaddr == COMPRESS_ADDR) {
- int i;
-
- ret = 1;
- for (i = 1; i < cluster_size; i++) {
- block_t blkaddr;
-
- blkaddr = data_blkaddr(dn.inode,
- dn.node_page, dn.ofs_in_node + i);
- if (compr) {
- if (__is_valid_data_blkaddr(blkaddr))
- ret++;
- } else {
- if (blkaddr != NULL_ADDR)
- ret++;
- }
- }
-
- f2fs_bug_on(F2FS_I_SB(inode),
- !compr && ret != cluster_size &&
- !is_inode_flag_set(inode, FI_COMPRESS_RELEASED));
+ if (compr_blks)
+ ret = __f2fs_get_cluster_blocks(inode, &dn);
+ else
+ ret = 1;
}
fail:
f2fs_put_dnode(&dn);
@@ -993,7 +1000,7 @@ static int f2fs_compressed_blocks(struct compress_ctx *cc)
return __f2fs_cluster_blocks(cc->inode, cc->cluster_idx, true);
}
-/* return # of valid blocks in compressed cluster */
+/* return whether the cluster is a compressed one or not */
int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index)
{
return __f2fs_cluster_blocks(inode,
@@ -1976,7 +1983,7 @@ void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi)
int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi)
{
dev_t dev = sbi->sb->s_bdev->bd_dev;
- char slab_name[32];
+ char slab_name[35];
if (!f2fs_sb_has_compression(sbi))
return 0;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 916e317ac925..4e42b5f24deb 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1690,9 +1690,7 @@ next_block:
map->m_flags |= F2FS_MAP_NEW;
} else if (is_hole) {
if (f2fs_compressed_file(inode) &&
- f2fs_sanity_check_cluster(&dn) &&
- (flag != F2FS_GET_BLOCK_FIEMAP ||
- IS_ENABLED(CONFIG_F2FS_CHECK_FS))) {
+ f2fs_sanity_check_cluster(&dn)) {
err = -EFSCORRUPTED;
f2fs_handle_error(sbi,
ERROR_CORRUPTED_CLUSTER);
@@ -2344,8 +2342,10 @@ skip_reading_dnode:
f2fs_wait_on_block_writeback(inode, blkaddr);
if (f2fs_load_compressed_page(sbi, page, blkaddr)) {
- if (atomic_dec_and_test(&dic->remaining_pages))
+ if (atomic_dec_and_test(&dic->remaining_pages)) {
f2fs_decompress_cluster(dic, true);
+ break;
+ }
continue;
}
@@ -2665,6 +2665,11 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
return true;
if (f2fs_is_atomic_file(inode))
return true;
+ /* rewrite low-ratio compressed data with OPU mode to avoid fragmentation */
+ if (f2fs_compressed_file(inode) &&
+ F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER &&
+ is_inode_flag_set(inode, FI_ENABLE_COMPRESS))
+ return true;
/* swap file is migrating in aligned write mode */
if (is_inode_flag_set(inode, FI_ALIGNED_WRITE))
@@ -3023,7 +3028,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
{
int ret = 0;
int done = 0, retry = 0;
- struct page *pages[F2FS_ONSTACK_PAGES];
+ struct page *pages_local[F2FS_ONSTACK_PAGES];
+ struct page **pages = pages_local;
struct folio_batch fbatch;
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
struct bio *bio = NULL;
@@ -3047,6 +3053,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
#endif
int nr_folios, p, idx;
int nr_pages;
+ unsigned int max_pages = F2FS_ONSTACK_PAGES;
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
@@ -3056,6 +3063,15 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
int submitted = 0;
int i;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (f2fs_compressed_file(inode) &&
+ 1 << cc.log_cluster_size > F2FS_ONSTACK_PAGES) {
+ pages = f2fs_kzalloc(sbi, sizeof(struct page *) <<
+ cc.log_cluster_size, GFP_NOFS | __GFP_NOFAIL);
+ max_pages = 1 << cc.log_cluster_size;
+ }
+#endif
+
folio_batch_init(&fbatch);
if (get_dirty_pages(mapping->host) <=
@@ -3101,7 +3117,7 @@ again:
add_more:
pages[nr_pages] = folio_page(folio, idx);
folio_get(folio);
- if (++nr_pages == F2FS_ONSTACK_PAGES) {
+ if (++nr_pages == max_pages) {
index = folio->index + idx + 1;
folio_batch_release(&fbatch);
goto write;
@@ -3283,6 +3299,11 @@ next:
if (bio)
f2fs_submit_merged_ipu_write(sbi, &bio, NULL);
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (pages != pages_local)
+ kfree(pages);
+#endif
+
return ret;
}
@@ -4055,7 +4076,7 @@ next:
sis->highest_bit = cur_lblock - 1;
out:
if (not_aligned)
- f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%u * N)",
+ f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%lu * N)",
not_aligned, blks_per_sec * F2FS_BLKSIZE);
return ret;
}
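The data.c hunks above also make the writeback page array track the compress
cluster size: when a compressed file's cluster (1 << log_cluster_size pages)
exceeds F2FS_ONSTACK_PAGES, the on-stack array is swapped for a heap
allocation so a full cluster can be collected before submission. Condensed
shape of the change, pulled together from the scattered hunks:

	struct page *pages_local[F2FS_ONSTACK_PAGES];
	struct page **pages = pages_local;
	unsigned int max_pages = F2FS_ONSTACK_PAGES;

	if (f2fs_compressed_file(inode) &&
	    1 << cc.log_cluster_size > F2FS_ONSTACK_PAGES) {
		pages = f2fs_kzalloc(sbi, sizeof(struct page *) <<
				cc.log_cluster_size, GFP_NOFS | __GFP_NOFAIL);
		max_pages = 1 << cc.log_cluster_size;
	}
	...
	if (++nr_pages == max_pages)		/* was F2FS_ONSTACK_PAGES */
		goto write;
	...
	if (pages != pages_local)
		kfree(pages);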
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 8aa29fe2e87b..042593aed1ec 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -455,7 +455,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
de->file_type = fs_umode_to_ftype(inode->i_mode);
set_page_dirty(page);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
f2fs_mark_inode_dirty_sync(dir, false);
f2fs_put_page(page, 1);
}
@@ -609,7 +609,7 @@ void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode,
f2fs_i_links_write(dir, true);
clear_inode_flag(inode, FI_NEW_INODE);
}
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
f2fs_mark_inode_dirty_sync(dir, false);
if (F2FS_I(dir)->i_current_depth != current_depth)
@@ -919,7 +919,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
}
f2fs_put_page(page, 1);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
f2fs_mark_inode_dirty_sync(dir, false);
if (inode)
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 0e2d49140c07..ad8dfac73bd4 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -74,40 +74,14 @@ static void __set_extent_info(struct extent_info *ei,
}
}
-static bool __may_read_extent_tree(struct inode *inode)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-
- if (!test_opt(sbi, READ_EXTENT_CACHE))
- return false;
- if (is_inode_flag_set(inode, FI_NO_EXTENT))
- return false;
- if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) &&
- !f2fs_sb_has_readonly(sbi))
- return false;
- return S_ISREG(inode->i_mode);
-}
-
-static bool __may_age_extent_tree(struct inode *inode)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-
- if (!test_opt(sbi, AGE_EXTENT_CACHE))
- return false;
- if (is_inode_flag_set(inode, FI_COMPRESSED_FILE))
- return false;
- if (file_is_cold(inode))
- return false;
-
- return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode);
-}
-
static bool __init_may_extent_tree(struct inode *inode, enum extent_type type)
{
if (type == EX_READ)
- return __may_read_extent_tree(inode);
- else if (type == EX_BLOCK_AGE)
- return __may_age_extent_tree(inode);
+ return test_opt(F2FS_I_SB(inode), READ_EXTENT_CACHE) &&
+ S_ISREG(inode->i_mode);
+ if (type == EX_BLOCK_AGE)
+ return test_opt(F2FS_I_SB(inode), AGE_EXTENT_CACHE) &&
+ (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode));
return false;
}
@@ -120,7 +94,22 @@ static bool __may_extent_tree(struct inode *inode, enum extent_type type)
if (list_empty(&F2FS_I_SB(inode)->s_list))
return false;
- return __init_may_extent_tree(inode, type);
+ if (!__init_may_extent_tree(inode, type))
+ return false;
+
+ if (type == EX_READ) {
+ if (is_inode_flag_set(inode, FI_NO_EXTENT))
+ return false;
+ if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) &&
+ !f2fs_sb_has_readonly(F2FS_I_SB(inode)))
+ return false;
+ } else if (type == EX_BLOCK_AGE) {
+ if (is_inode_flag_set(inode, FI_COMPRESSED_FILE))
+ return false;
+ if (file_is_cold(inode))
+ return false;
+ }
+ return true;
}
static void __try_update_largest_extent(struct extent_tree *et,
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 6d688e42d89c..9043cedfa12b 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1234,6 +1234,7 @@ struct f2fs_bio_info {
#define FDEV(i) (sbi->devs[i])
#define RDEV(i) (raw_super->devs[i])
struct f2fs_dev_info {
+ struct bdev_handle *bdev_handle;
struct block_device *bdev;
char path[MAX_PATH_LEN];
unsigned int total_segments;
@@ -3317,13 +3318,15 @@ static inline void clear_file(struct inode *inode, int type)
static inline bool f2fs_is_time_consistent(struct inode *inode)
{
- struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 ts = inode_get_atime(inode);
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime))
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &ts))
return false;
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &ctime))
+ ts = inode_get_ctime(inode);
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &ts))
return false;
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime))
+ ts = inode_get_mtime(inode);
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &ts))
return false;
return true;
}
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index ca5904129b16..e50363583f01 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -798,7 +798,7 @@ int f2fs_truncate(struct inode *inode)
if (err)
return err;
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
f2fs_mark_inode_dirty_sync(inode, false);
return 0;
}
@@ -905,9 +905,9 @@ static void __setattr_copy(struct mnt_idmap *idmap,
i_uid_update(idmap, attr, inode);
i_gid_update(idmap, attr, inode);
if (ia_valid & ATTR_ATIME)
- inode->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(inode, attr->ia_atime);
if (ia_valid & ATTR_MTIME)
- inode->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
if (ia_valid & ATTR_CTIME)
inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (ia_valid & ATTR_MODE) {
@@ -1012,7 +1012,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
return err;
spin_lock(&F2FS_I(inode)->i_size_lock);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
F2FS_I(inode)->last_disk_size = i_size_read(inode);
spin_unlock(&F2FS_I(inode)->i_size_lock);
}
@@ -1840,7 +1840,7 @@ static long f2fs_fallocate(struct file *file, int mode,
}
if (!ret) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
f2fs_mark_inode_dirty_sync(inode, false);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
}
@@ -2888,10 +2888,10 @@ out_src:
if (ret)
goto out_unlock;
- src->i_mtime = inode_set_ctime_current(src);
+ inode_set_mtime_to_ts(src, inode_set_ctime_current(src));
f2fs_mark_inode_dirty_sync(src, false);
if (src != dst) {
- dst->i_mtime = inode_set_ctime_current(dst);
+ inode_set_mtime_to_ts(dst, inode_set_ctime_current(dst));
f2fs_mark_inode_dirty_sync(dst, false);
}
f2fs_update_time(sbi, REQ_TIME);
@@ -3258,11 +3258,12 @@ int f2fs_precache_extents(struct inode *inode)
return -EOPNOTSUPP;
map.m_lblk = 0;
+ map.m_pblk = 0;
map.m_next_pgofs = NULL;
map.m_next_extent = &m_next_extent;
map.m_seg_type = NO_CHECK_TYPE;
map.m_may_create = false;
- end = max_file_blocks(inode);
+ end = F2FS_BLK_ALIGN(i_size_read(inode));
while (map.m_lblk < end) {
map.m_len = end - map.m_lblk;
@@ -3270,7 +3271,7 @@ int f2fs_precache_extents(struct inode *inode)
f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRECACHE);
f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
- if (err)
+ if (err || !map.m_len)
return err;
map.m_lblk = m_next_extent;
@@ -4005,6 +4006,15 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg)
F2FS_I(inode)->i_compress_algorithm = option.algorithm;
F2FS_I(inode)->i_log_cluster_size = option.log_cluster_size;
F2FS_I(inode)->i_cluster_size = BIT(option.log_cluster_size);
+ /* Set default level */
+ if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD)
+ F2FS_I(inode)->i_compress_level = F2FS_ZSTD_DEFAULT_CLEVEL;
+ else
+ F2FS_I(inode)->i_compress_level = 0;
+ /* Adjust mount option level */
+ if (option.algorithm == F2FS_OPTION(sbi).compress_algorithm &&
+ F2FS_OPTION(sbi).compress_level)
+ F2FS_I(inode)->i_compress_level = F2FS_OPTION(sbi).compress_level;
f2fs_mark_inode_dirty_sync(inode, true);
if (!f2fs_is_compress_backend_ready(inode))
@@ -4849,6 +4859,9 @@ static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len,
filp->f_mode &= ~FMODE_RANDOM;
spin_unlock(&filp->f_lock);
return 0;
+ } else if (advice == POSIX_FADV_WILLNEED && offset == 0) {
+ /* Load extent cache at the first readahead. */
+ f2fs_precache_extents(inode);
}
err = generic_fadvise(filp, offset, len, advice);
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 2fe25619ccb5..ac00423f117b 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -699,7 +699,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
set_page_dirty(page);
f2fs_put_page(page, 1);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
f2fs_mark_inode_dirty_sync(dir, false);
if (inode)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index cde243840abd..560bfcad1af2 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -315,7 +315,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
f2fs_has_inline_xattr(inode) &&
(!fi->i_inline_xattr_size ||
fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) {
- f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %zu",
+ f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %lu",
__func__, inode->i_ino, fi->i_inline_xattr_size,
MAX_INLINE_XATTR_SIZE);
return false;
@@ -386,9 +386,9 @@ static void init_idisk_time(struct inode *inode)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
- fi->i_disk_time[0] = inode->i_atime;
+ fi->i_disk_time[0] = inode_get_atime(inode);
fi->i_disk_time[1] = inode_get_ctime(inode);
- fi->i_disk_time[2] = inode->i_mtime;
+ fi->i_disk_time[2] = inode_get_mtime(inode);
}
static int do_read_inode(struct inode *inode)
@@ -417,12 +417,12 @@ static int do_read_inode(struct inode *inode)
inode->i_size = le64_to_cpu(ri->i_size);
inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1);
- inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime);
+ inode_set_atime(inode, le64_to_cpu(ri->i_atime),
+ le32_to_cpu(ri->i_atime_nsec));
inode_set_ctime(inode, le64_to_cpu(ri->i_ctime),
le32_to_cpu(ri->i_ctime_nsec));
- inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime);
- inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec);
- inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
+ inode_set_mtime(inode, le64_to_cpu(ri->i_mtime),
+ le32_to_cpu(ri->i_mtime_nsec));
inode->i_generation = le32_to_cpu(ri->i_generation);
if (S_ISDIR(inode->i_mode))
fi->i_current_depth = le32_to_cpu(ri->i_current_depth);
@@ -698,12 +698,12 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
}
set_raw_inline(inode, ri);
- ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
- ri->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
- ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
- ri->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
- ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ri->i_atime = cpu_to_le64(inode_get_atime_sec(inode));
+ ri->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ ri->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode));
+ ri->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
+ ri->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+ ri->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
if (S_ISDIR(inode->i_mode))
ri->i_current_depth =
cpu_to_le32(F2FS_I(inode)->i_current_depth);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 193b22a2d6bf..d0053b0284d8 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -243,8 +243,8 @@ static struct inode *f2fs_new_inode(struct mnt_idmap *idmap,
inode->i_ino = ino;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
- F2FS_I(inode)->i_crtime = inode->i_mtime;
+ simple_inode_init_ts(inode);
+ F2FS_I(inode)->i_crtime = inode_get_mtime(inode);
inode->i_generation = get_random_u32();
if (S_ISDIR(inode->i_mode))
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index ee2e1dd64f25..6c7f6a649d27 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -633,7 +633,7 @@ static void f2fs_ra_node_pages(struct page *parent, int start, int n)
/* Then, try readahead for siblings of the desired node */
end = start + n;
- end = min(end, NIDS_PER_BLOCK);
+ end = min(end, (int)NIDS_PER_BLOCK);
for (i = start; i < end; i++) {
nid = get_nid(parent, i, false);
f2fs_ra_node_page(sbi, nid);
@@ -1467,7 +1467,8 @@ page_hit:
ofs_of_node(page), cpver_of_node(page),
next_blkaddr_of_node(page));
set_sbi_flag(sbi, SBI_NEED_FSCK);
- err = -EINVAL;
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER);
+ err = -EFSCORRUPTED;
out_err:
ClearPageUptodate(page);
out_put_err:
@@ -2389,7 +2390,7 @@ static int scan_nat_page(struct f2fs_sb_info *sbi,
blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
if (blk_addr == NEW_ADDR)
- return -EINVAL;
+ return -EFSCORRUPTED;
if (blk_addr == NULL_ADDR) {
add_free_nid(sbi, start_nid, true, true);
@@ -2504,7 +2505,14 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
if (ret) {
f2fs_up_read(&nm_i->nat_tree_lock);
- f2fs_err(sbi, "NAT is corrupt, run fsck to fix it");
+
+ if (ret == -EFSCORRUPTED) {
+ f2fs_err(sbi, "NAT is corrupt, run fsck to fix it");
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_handle_error(sbi,
+ ERROR_INCONSISTENT_NAT);
+ }
+
return ret;
}
}
@@ -2743,7 +2751,9 @@ recover_xnid:
f2fs_update_inode_page(inode);
/* 3: update and set xattr node page dirty */
- memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE);
+ if (page)
+ memcpy(F2FS_NODE(xpage), F2FS_NODE(page),
+ VALID_XATTR_BLOCK_SIZE);
set_page_dirty(xpage);
f2fs_put_page(xpage, 1);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 7be60df277a5..b56d0f1078a7 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -320,12 +320,12 @@ static int recover_inode(struct inode *inode, struct page *page)
}
f2fs_i_size_write(inode, le64_to_cpu(raw->i_size));
- inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime);
+ inode_set_atime(inode, le64_to_cpu(raw->i_atime),
+ le32_to_cpu(raw->i_atime_nsec));
inode_set_ctime(inode, le64_to_cpu(raw->i_ctime),
le32_to_cpu(raw->i_ctime_nsec));
- inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
- inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec);
- inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
+ inode_set_mtime(inode, le64_to_cpu(raw->i_mtime),
+ le32_to_cpu(raw->i_mtime_nsec));
F2FS_I(inode)->i_advise = raw->i_advise;
F2FS_I(inode)->i_flags = le32_to_cpu(raw->i_flags);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index d05b41608fc0..727d016318f9 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -4910,22 +4910,31 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
}
/*
- * The write pointer matches with the valid blocks or
- * already points to the end of the zone.
+ * When safely unmounted in the previous mount, we can trust write
+ * pointers. Otherwise, finish zones.
*/
- if ((last_valid_block + 1 == wp_block) ||
- (zone->wp == zone->start + zone->len))
- return 0;
-
- if (last_valid_block + 1 == zone_block) {
+ if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
/*
- * If there is no valid block in the zone and if write pointer
- * is not at zone start, reset the write pointer.
+ * The write pointer matches with the valid blocks or
+ * already points to the end of the zone.
*/
- f2fs_notice(sbi,
- "Zone without valid block has non-zero write "
- "pointer. Reset the write pointer: wp[0x%x,0x%x]",
- wp_segno, wp_blkoff);
+ if ((last_valid_block + 1 == wp_block) ||
+ (zone->wp == zone->start + zone->len))
+ return 0;
+ }
+
+ if (last_valid_block + 1 == zone_block) {
+ if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
+ /*
+ * If there is no valid block in the zone and if write
+ * pointer is not at zone start, reset the write
+ * pointer.
+ */
+ f2fs_notice(sbi,
+ "Zone without valid block has non-zero write "
+ "pointer. Reset the write pointer: wp[0x%x,0x%x]",
+ wp_segno, wp_blkoff);
+ }
ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block,
zone->len >> log_sectors_per_block);
if (ret)
@@ -4935,18 +4944,20 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
return ret;
}
- /*
- * If there are valid blocks and the write pointer doesn't
- * match with them, we need to report the inconsistency and
- * fill the zone till the end to close the zone. This inconsistency
- * does not cause write error because the zone will not be selected
- * for write operation until it get discarded.
- */
- f2fs_notice(sbi, "Valid blocks are not aligned with write pointer: "
- "valid block[0x%x,0x%x] wp[0x%x,0x%x]",
- GET_SEGNO(sbi, last_valid_block),
- GET_BLKOFF_FROM_SEG0(sbi, last_valid_block),
- wp_segno, wp_blkoff);
+ if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
+ /*
+ * If there are valid blocks and the write pointer doesn't match
+ * with them, we need to report the inconsistency and fill
+ * the zone till the end to close the zone. This inconsistency
+ * does not cause write error because the zone will not be
+ * selected for write operation until it gets discarded.
+ */
+ f2fs_notice(sbi, "Valid blocks are not aligned with write "
+ "pointer: valid block[0x%x,0x%x] wp[0x%x,0x%x]",
+ GET_SEGNO(sbi, last_valid_block),
+ GET_BLKOFF_FROM_SEG0(sbi, last_valid_block),
+ wp_segno, wp_blkoff);
+ }
ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH,
zone->start, zone->len, GFP_NOFS);
@@ -5020,18 +5031,27 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type)
if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ)
return 0;
- wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block);
- wp_segno = GET_SEGNO(sbi, wp_block);
- wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno);
- wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0);
-
- if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff &&
- wp_sector_off == 0)
- return 0;
+ /*
+ * When safely unmounted in the previous mount, we could use current
+ * segments. Otherwise, allocate new sections.
+ */
+ if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
+ wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block);
+ wp_segno = GET_SEGNO(sbi, wp_block);
+ wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno);
+ wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0);
+
+ if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff &&
+ wp_sector_off == 0)
+ return 0;
- f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: "
- "curseg[0x%x,0x%x] wp[0x%x,0x%x]",
- type, cs->segno, cs->next_blkoff, wp_segno, wp_blkoff);
+ f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: "
+ "curseg[0x%x,0x%x] wp[0x%x,0x%x]", type, cs->segno,
+ cs->next_blkoff, wp_segno, wp_blkoff);
+ } else {
+ f2fs_notice(sbi, "Not successfully unmounted in the previous "
+ "mount");
+ }
f2fs_notice(sbi, "Assign new section to curseg[%d]: "
"curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 2ca8fb5d0dc4..8129be788bd5 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -108,11 +108,11 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
((sbi)->segs_per_sec - ((sbi)->unusable_blocks_per_sec >>\
(sbi)->log_blocks_per_seg))
#define GET_SEC_FROM_SEG(sbi, segno) \
- (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec)
+ (((segno) == -1) ? -1 : (segno) / (sbi)->segs_per_sec)
#define GET_SEG_FROM_SEC(sbi, secno) \
((secno) * (sbi)->segs_per_sec)
#define GET_ZONE_FROM_SEC(sbi, secno) \
- (((secno) == -1) ? -1: (secno) / (sbi)->secs_per_zone)
+ (((secno) == -1) ? -1 : (secno) / (sbi)->secs_per_zone)
#define GET_ZONE_FROM_SEG(sbi, segno) \
GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno))
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index a8c8232852bb..033af907c3b1 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -83,11 +83,26 @@ void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
#endif
/* f2fs-wide shrinker description */
-static struct shrinker f2fs_shrinker_info = {
- .scan_objects = f2fs_shrink_scan,
- .count_objects = f2fs_shrink_count,
- .seeks = DEFAULT_SEEKS,
-};
+static struct shrinker *f2fs_shrinker_info;
+
+static int __init f2fs_init_shrinker(void)
+{
+ f2fs_shrinker_info = shrinker_alloc(0, "f2fs-shrinker");
+ if (!f2fs_shrinker_info)
+ return -ENOMEM;
+
+ f2fs_shrinker_info->count_objects = f2fs_shrink_count;
+ f2fs_shrinker_info->scan_objects = f2fs_shrink_scan;
+
+ shrinker_register(f2fs_shrinker_info);
+
+ return 0;
+}
+
+static void f2fs_exit_shrinker(void)
+{
+ shrinker_free(f2fs_shrinker_info);
+}
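The hunk above converts f2fs from a statically declared struct shrinker to the dynamic shrinker_alloc()/shrinker_register()/shrinker_free() API. A minimal registration sketch under that API; the demo_* names are hypothetical and the callback bodies are placeholders (count_objects reports how many objects could be freed, scan_objects frees some and returns the number freed, or SHRINK_STOP):

#include <linux/shrinker.h>

static unsigned long demo_shrink_count(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	return 0;			/* placeholder: nothing to reclaim */
}

static unsigned long demo_shrink_scan(struct shrinker *shrink,
				      struct shrink_control *sc)
{
	return SHRINK_STOP;		/* placeholder: nothing was freed */
}

static struct shrinker *demo_shrinker;

static int __init demo_init_shrinker(void)
{
	demo_shrinker = shrinker_alloc(0, "demo-shrinker");
	if (!demo_shrinker)
		return -ENOMEM;

	demo_shrinker->count_objects = demo_shrink_count;
	demo_shrinker->scan_objects = demo_shrink_scan;

	shrinker_register(demo_shrinker);	/* visible to reclaim from here on */
	return 0;
}

static void demo_exit_shrinker(void)
{
	shrinker_free(demo_shrinker);		/* unregister and free */
}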
enum {
Opt_gc_background,
@@ -547,6 +562,29 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb,
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
+static bool is_compress_extension_exist(struct f2fs_sb_info *sbi,
+ const char *new_ext, bool is_ext)
+{
+ unsigned char (*ext)[F2FS_EXTENSION_LEN];
+ int ext_cnt;
+ int i;
+
+ if (is_ext) {
+ ext = F2FS_OPTION(sbi).extensions;
+ ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
+ } else {
+ ext = F2FS_OPTION(sbi).noextensions;
+ ext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt;
+ }
+
+ for (i = 0; i < ext_cnt; i++) {
+ if (!strcasecmp(new_ext, ext[i]))
+ return true;
+ }
+
+ return false;
+}
+
/*
 * 1. The same extension name cannot appear in both compress and non-compress extension
* at the same time.
@@ -1149,6 +1187,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
return -EINVAL;
}
+ if (is_compress_extension_exist(sbi, name, true)) {
+ kfree(name);
+ break;
+ }
+
strcpy(ext[ext_cnt], name);
F2FS_OPTION(sbi).compress_ext_cnt++;
kfree(name);
@@ -1173,6 +1216,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
return -EINVAL;
}
+ if (is_compress_extension_exist(sbi, name, false)) {
+ kfree(name);
+ break;
+ }
+
strcpy(noext[noext_cnt], name);
F2FS_OPTION(sbi).nocompress_ext_cnt++;
kfree(name);
@@ -1562,7 +1610,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi)
for (i = 0; i < sbi->s_ndevs; i++) {
if (i > 0)
- blkdev_put(FDEV(i).bdev, sbi->sb);
+ bdev_release(FDEV(i).bdev_handle);
#ifdef CONFIG_BLK_DEV_ZONED
kvfree(FDEV(i).blkz_seq);
#endif
@@ -1629,7 +1677,7 @@ static void f2fs_put_super(struct super_block *sb)
f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
- if (err) {
+ if (err || f2fs_cp_error(sbi)) {
truncate_inode_pages_final(NODE_MAPPING(sbi));
truncate_inode_pages_final(META_MAPPING(sbi));
}
@@ -2286,9 +2334,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
unsigned long old_sb_flags;
int err;
bool need_restart_gc = false, need_stop_gc = false;
- bool need_restart_ckpt = false, need_stop_ckpt = false;
bool need_restart_flush = false, need_stop_flush = false;
bool need_restart_discard = false, need_stop_discard = false;
+ bool need_enable_checkpoint = false, need_disable_checkpoint = false;
bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE);
bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE);
bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT);
@@ -2452,24 +2500,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
clear_sbi_flag(sbi, SBI_IS_CLOSE);
}
- if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) ||
- !test_opt(sbi, MERGE_CHECKPOINT)) {
- f2fs_stop_ckpt_thread(sbi);
- need_restart_ckpt = true;
- } else {
- /* Flush if the prevous checkpoint, if exists. */
- f2fs_flush_ckpt_thread(sbi);
-
- err = f2fs_start_ckpt_thread(sbi);
- if (err) {
- f2fs_err(sbi,
- "Failed to start F2FS issue_checkpoint_thread (%d)",
- err);
- goto restore_gc;
- }
- need_stop_ckpt = true;
- }
-
/*
* We stop issue flush thread if FS is mounted as RO
* or if flush_merge is not passed in mount option.
@@ -2481,7 +2511,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
} else {
err = f2fs_create_flush_cmd_control(sbi);
if (err)
- goto restore_ckpt;
+ goto restore_gc;
need_stop_flush = true;
}
@@ -2503,8 +2533,31 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
err = f2fs_disable_checkpoint(sbi);
if (err)
goto restore_discard;
+ need_enable_checkpoint = true;
} else {
f2fs_enable_checkpoint(sbi);
+ need_disable_checkpoint = true;
+ }
+ }
+
+ /*
+ * Place this routine at the end, since a new checkpoint would be
+ * triggered during remount, and we need to take care of it before
+ * returning from remount.
+ */
+ if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) ||
+ !test_opt(sbi, MERGE_CHECKPOINT)) {
+ f2fs_stop_ckpt_thread(sbi);
+ } else {
+ /* Flush the previous checkpoint, if it exists. */
+ f2fs_flush_ckpt_thread(sbi);
+
+ err = f2fs_start_ckpt_thread(sbi);
+ if (err) {
+ f2fs_err(sbi,
+ "Failed to start F2FS issue_checkpoint_thread (%d)",
+ err);
+ goto restore_checkpoint;
}
}
@@ -2522,6 +2575,13 @@ skip:
adjust_unusable_cap_perc(sbi);
*flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
return 0;
+restore_checkpoint:
+ if (need_enable_checkpoint) {
+ f2fs_enable_checkpoint(sbi);
+ } else if (need_disable_checkpoint) {
+ if (f2fs_disable_checkpoint(sbi))
+ f2fs_warn(sbi, "checkpoint has not been disabled");
+ }
restore_discard:
if (need_restart_discard) {
if (f2fs_start_discard_thread(sbi))
@@ -2537,13 +2597,6 @@ restore_flush:
clear_opt(sbi, FLUSH_MERGE);
f2fs_destroy_flush_cmd_control(sbi, false);
}
-restore_ckpt:
- if (need_restart_ckpt) {
- if (f2fs_start_ckpt_thread(sbi))
- f2fs_warn(sbi, "background ckpt thread has stopped");
- } else if (need_stop_ckpt) {
- f2fs_stop_ckpt_thread(sbi);
- }
restore_gc:
if (need_restart_gc) {
if (f2fs_start_gc_thread(sbi))
@@ -2710,7 +2763,7 @@ retry:
if (len == towrite)
return err;
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
f2fs_mark_inode_dirty_sync(inode, false);
return len - towrite;
}
@@ -3203,13 +3256,6 @@ static bool f2fs_has_stable_inodes(struct super_block *sb)
return true;
}
-static void f2fs_get_ino_and_lblk_bits(struct super_block *sb,
- int *ino_bits_ret, int *lblk_bits_ret)
-{
- *ino_bits_ret = 8 * sizeof(nid_t);
- *lblk_bits_ret = 8 * sizeof(block_t);
-}
-
static struct block_device **f2fs_get_devices(struct super_block *sb,
unsigned int *num_devs)
{
@@ -3231,13 +3277,15 @@ static struct block_device **f2fs_get_devices(struct super_block *sb,
}
static const struct fscrypt_operations f2fs_cryptops = {
- .key_prefix = "f2fs:",
+ .needs_bounce_pages = 1,
+ .has_32bit_inodes = 1,
+ .supports_subblock_data_units = 1,
+ .legacy_key_prefix = "f2fs:",
.get_context = f2fs_get_context,
.set_context = f2fs_set_context,
.get_dummy_policy = f2fs_get_dummy_policy,
.empty_dir = f2fs_empty_dir,
.has_stable_inodes = f2fs_has_stable_inodes,
- .get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits,
.get_devices = f2fs_get_devices,
};
#endif
@@ -3282,6 +3330,7 @@ static struct dentry *f2fs_fh_to_parent(struct super_block *sb, struct fid *fid,
}
static const struct export_operations f2fs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = f2fs_fh_to_dentry,
.fh_to_parent = f2fs_fh_to_parent,
.get_parent = f2fs_get_parent,
@@ -3469,7 +3518,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
return -EFSCORRUPTED;
}
- /* Currently, support 512/1024/2048/4096 bytes sector size */
+ /* Currently, 512/1024/2048/4096/16K-byte sector sizes are supported */
if (le32_to_cpu(raw_super->log_sectorsize) >
F2FS_MAX_LOG_SECTOR_SIZE ||
le32_to_cpu(raw_super->log_sectorsize) <
@@ -4198,7 +4247,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
for (i = 0; i < max_devices; i++) {
if (i == 0)
- FDEV(0).bdev = sbi->sb->s_bdev;
+ FDEV(0).bdev_handle = sbi->sb->s_bdev_handle;
else if (!RDEV(i).path[0])
break;
@@ -4218,13 +4267,14 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
FDEV(i).end_blk = FDEV(i).start_blk +
(FDEV(i).total_segments <<
sbi->log_blocks_per_seg) - 1;
- FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path,
- mode, sbi->sb, NULL);
+ FDEV(i).bdev_handle = bdev_open_by_path(
+ FDEV(i).path, mode, sbi->sb, NULL);
}
}
- if (IS_ERR(FDEV(i).bdev))
- return PTR_ERR(FDEV(i).bdev);
+ if (IS_ERR(FDEV(i).bdev_handle))
+ return PTR_ERR(FDEV(i).bdev_handle);
+ FDEV(i).bdev = FDEV(i).bdev_handle->bdev;
/* to release errored devices */
sbi->s_ndevs = i + 1;
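The device-scan hunk above now keeps a struct bdev_handle next to the raw block_device, opening with bdev_open_by_path() and closing with bdev_release() instead of blkdev_get_by_path()/blkdev_put(). A sketch of that lifecycle as it stands at this point in the series; struct demo_dev and the parameters are hypothetical:

#include <linux/blkdev.h>
#include <linux/err.h>

struct demo_dev {
	struct bdev_handle *bdev_handle;	/* owns the open */
	struct block_device *bdev;		/* raw bdev for the I/O paths */
};

static int demo_open_device(struct demo_dev *dev, const char *path,
			    blk_mode_t mode, void *holder)
{
	dev->bdev_handle = bdev_open_by_path(path, mode, holder, NULL);
	if (IS_ERR(dev->bdev_handle))
		return PTR_ERR(dev->bdev_handle);

	dev->bdev = dev->bdev_handle->bdev;
	return 0;
}

static void demo_close_device(struct demo_dev *dev)
{
	bdev_release(dev->bdev_handle);		/* replaces blkdev_put() */
}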
@@ -4915,7 +4965,7 @@ static int __init init_f2fs_fs(void)
int err;
if (PAGE_SIZE != F2FS_BLKSIZE) {
- printk("F2FS not supported on PAGE_SIZE(%lu) != %d\n",
+ printk("F2FS not supported on PAGE_SIZE(%lu) != BLOCK_SIZE(%lu)\n",
PAGE_SIZE, F2FS_BLKSIZE);
return -EINVAL;
}
@@ -4944,7 +4994,7 @@ static int __init init_f2fs_fs(void)
err = f2fs_init_sysfs();
if (err)
goto free_garbage_collection_cache;
- err = register_shrinker(&f2fs_shrinker_info, "f2fs-shrinker");
+ err = f2fs_init_shrinker();
if (err)
goto free_sysfs;
err = register_filesystem(&f2fs_fs_type);
@@ -4989,7 +5039,7 @@ free_root_stats:
f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);
free_shrinker:
- unregister_shrinker(&f2fs_shrinker_info);
+ f2fs_exit_shrinker();
free_sysfs:
f2fs_exit_sysfs();
free_garbage_collection_cache:
@@ -5021,7 +5071,7 @@ static void __exit exit_f2fs_fs(void)
f2fs_destroy_post_read_processing();
f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);
- unregister_shrinker(&f2fs_shrinker_info);
+ f2fs_exit_shrinker();
f2fs_exit_sysfs();
f2fs_destroy_garbage_collection_cache();
f2fs_destroy_extent_cache();
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index a657284faee3..47e88b4d4e7d 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -189,7 +189,7 @@ const struct xattr_handler f2fs_xattr_security_handler = {
.set = f2fs_xattr_generic_set,
};
-static const struct xattr_handler *f2fs_xattr_handler_map[] = {
+static const struct xattr_handler * const f2fs_xattr_handler_map[] = {
[F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
#ifdef CONFIG_F2FS_FS_POSIX_ACL
[F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
@@ -202,7 +202,7 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = {
[F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler,
};
-const struct xattr_handler *f2fs_xattr_handlers[] = {
+const struct xattr_handler * const f2fs_xattr_handlers[] = {
&f2fs_xattr_user_handler,
&f2fs_xattr_trusted_handler,
#ifdef CONFIG_F2FS_FS_SECURITY
@@ -364,10 +364,10 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
*xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name);
if (!*xe) {
- f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr",
+ f2fs_err(F2FS_I_SB(inode), "lookup inode (%lu) has corrupted xattr",
inode->i_ino);
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
- err = -EFSCORRUPTED;
+ err = -ENODATA;
f2fs_handle_error(F2FS_I_SB(inode),
ERROR_CORRUPTED_XATTR);
goto out;
@@ -584,13 +584,12 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
if ((void *)(entry) + sizeof(__u32) > last_base_addr ||
(void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) {
- f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr",
+ f2fs_err(F2FS_I_SB(inode), "list inode (%lu) has corrupted xattr",
inode->i_ino);
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
- error = -EFSCORRUPTED;
f2fs_handle_error(F2FS_I_SB(inode),
ERROR_CORRUPTED_XATTR);
- goto cleanup;
+ break;
}
if (!prefix)
@@ -650,7 +649,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (size > MAX_VALUE_LEN(inode))
return -E2BIG;
-
+retry:
error = read_all_xattrs(inode, ipage, &base_addr);
if (error)
return error;
@@ -660,7 +659,14 @@ static int __f2fs_setxattr(struct inode *inode, int index,
/* find entry with wanted name. */
here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name);
if (!here) {
- f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr",
+ if (!F2FS_I(inode)->i_xattr_nid) {
+ f2fs_notice(F2FS_I_SB(inode),
+ "recover xattr in inode (%lu)", inode->i_ino);
+ f2fs_recover_xattr_data(inode, NULL);
+ kfree(base_addr);
+ goto retry;
+ }
+ f2fs_err(F2FS_I_SB(inode), "set inode (%lu) has corrupted xattr",
inode->i_ino);
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
error = -EFSCORRUPTED;
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index b1811c392e6f..a005ffdcf717 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -125,7 +125,7 @@ extern const struct xattr_handler f2fs_xattr_trusted_handler;
extern const struct xattr_handler f2fs_xattr_advise_handler;
extern const struct xattr_handler f2fs_xattr_security_handler;
-extern const struct xattr_handler *f2fs_xattr_handlers[];
+extern const struct xattr_handler * const f2fs_xattr_handlers[];
extern int f2fs_setxattr(struct inode *, int, const char *,
const void *, size_t, struct page *, int);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index cdd39b6020f3..1fac3dabf130 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -512,6 +512,7 @@ static int fat_validate_dir(struct inode *dir)
int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
{
struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+ struct timespec64 mtime;
int error;
MSDOS_I(inode)->i_pos = 0;
@@ -561,14 +562,18 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
& ~((loff_t)sbi->cluster_size - 1)) >> 9;
- fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0);
- inode_set_ctime_to_ts(inode, inode->i_mtime);
+ fat_time_fat2unix(sbi, &mtime, de->time, de->date, 0);
+ inode_set_mtime_to_ts(inode, mtime);
+ inode_set_ctime_to_ts(inode, mtime);
if (sbi->options.isvfat) {
- fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0);
+ struct timespec64 atime;
+
+ fat_time_fat2unix(sbi, &atime, 0, de->adate, 0);
+ inode_set_atime_to_ts(inode, atime);
fat_time_fat2unix(sbi, &MSDOS_I(inode)->i_crtime, de->ctime,
de->cdate, de->ctime_cs);
} else
- inode->i_atime = fat_truncate_atime(sbi, &inode->i_mtime);
+ inode_set_atime_to_ts(inode, fat_truncate_atime(sbi, &mtime));
return 0;
}
@@ -849,6 +854,7 @@ static int __fat_write_inode(struct inode *inode, int wait)
struct msdos_sb_info *sbi = MSDOS_SB(sb);
struct buffer_head *bh;
struct msdos_dir_entry *raw_entry;
+ struct timespec64 mtime;
loff_t i_pos;
sector_t blocknr;
int err, offset;
@@ -882,12 +888,14 @@ retry:
raw_entry->size = cpu_to_le32(inode->i_size);
raw_entry->attr = fat_make_attrs(inode);
fat_set_start(raw_entry, MSDOS_I(inode)->i_logstart);
- fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time,
+ mtime = inode_get_mtime(inode);
+ fat_time_unix2fat(sbi, &mtime, &raw_entry->time,
&raw_entry->date, NULL);
if (sbi->options.isvfat) {
+ struct timespec64 ts = inode_get_atime(inode);
__le16 atime;
- fat_time_unix2fat(sbi, &inode->i_atime, &atime,
- &raw_entry->adate, NULL);
+
+ fat_time_unix2fat(sbi, &ts, &atime, &raw_entry->adate, NULL);
fat_time_unix2fat(sbi, &MSDOS_I(inode)->i_crtime, &raw_entry->ctime,
&raw_entry->cdate, &raw_entry->ctime_cs);
}
@@ -1407,7 +1415,8 @@ static int fat_read_root(struct inode *inode)
MSDOS_I(inode)->mmu_private = inode->i_size;
fat_save_attrs(inode, ATTR_DIR);
- inode->i_mtime = inode->i_atime = inode_set_ctime(inode, 0, 0);
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime(inode, 0, 0)));
set_nlink(inode, fat_subdirs(inode)+2);
return 0;
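The FAT hunks above rely on the inode time setters returning the timespec64 they store, so one value can be pushed into ctime, atime and mtime in a single expression. A small sketch of that chaining (resetting all three timestamps to the epoch, as fat_read_root() does):

#include <linux/fs.h>

static void demo_reset_times(struct inode *inode)
{
	/* inode_set_ctime() returns the value it stored; feed it through
	 * inode_set_atime_to_ts() and inode_set_mtime_to_ts() in turn. */
	inode_set_mtime_to_ts(inode,
		inode_set_atime_to_ts(inode, inode_set_ctime(inode, 0, 0)));
}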
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index f2304a1054aa..c7a2d27120ba 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -325,15 +325,15 @@ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags)
}
if (flags & S_ATIME)
- inode->i_atime = fat_truncate_atime(sbi, now);
+ inode_set_atime_to_ts(inode, fat_truncate_atime(sbi, now));
/*
* ctime and mtime share the same on-disk field, and should be
* identical in memory. all mtime updates will be applied to ctime,
* but ctime updates are ignored.
*/
if (flags & S_MTIME)
- inode->i_mtime = inode_set_ctime_to_ts(inode,
- fat_truncate_mtime(sbi, now));
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_to_ts(inode, fat_truncate_mtime(sbi, now)));
return 0;
}
diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c
index 3626eb585a98..c52e63e10d35 100644
--- a/fs/fat/nfs.c
+++ b/fs/fat/nfs.c
@@ -279,6 +279,7 @@ static struct dentry *fat_get_parent(struct dentry *child_dir)
}
const struct export_operations fat_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = fat_fh_to_dentry,
.fh_to_parent = fat_fh_to_parent,
.get_parent = fat_get_parent,
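fat (and fuse further down) now opt into NFS file-handle encoding explicitly by setting .encode_fh = generic_encode_ino32_fh, which pairs with the exportfs_can_encode_fh() check added in fs/fhandle.c below. A sketch of the wiring for a filesystem with 32-bit inode numbers; the demo_* callbacks are stubs standing in for the filesystem's own lookup code:

#include <linux/exportfs.h>

static struct dentry *demo_fh_to_dentry(struct super_block *sb, struct fid *fid,
					int fh_len, int fh_type)
{
	return NULL;	/* stub: look up the inode encoded in @fid */
}

static struct dentry *demo_fh_to_parent(struct super_block *sb, struct fid *fid,
					int fh_len, int fh_type)
{
	return NULL;	/* stub: look up the parent encoded in @fid */
}

/* generic_encode_ino32_fh packs (ino32, gen) handles; the new
 * exportfs_can_encode_fh() check in fs/fhandle.c below gates
 * name_to_handle_at() on what the export ops advertise. */
static const struct export_operations demo_export_ops = {
	.encode_fh	= generic_encode_ino32_fh,
	.fh_to_dentry	= demo_fh_to_dentry,
	.fh_to_parent	= demo_fh_to_parent,
};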
diff --git a/fs/fcntl.c b/fs/fcntl.c
index e871009f6c88..c80a6acad742 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -844,7 +844,7 @@ int send_sigurg(struct fown_struct *fown)
}
static DEFINE_SPINLOCK(fasync_lock);
-static struct kmem_cache *fasync_cache __read_mostly;
+static struct kmem_cache *fasync_cache __ro_after_init;
static void fasync_free_rcu(struct rcu_head *head)
{
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 6ea8d35a9382..18b3ba8dc8ea 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -26,12 +26,8 @@ static long do_sys_name_to_handle(const struct path *path,
/*
* We need to make sure whether the file system support decoding of
* the file handle if decodeable file handle was requested.
- * Otherwise, even empty export_operations are sufficient to opt-in
- * to encoding FIDs.
*/
- if (!path->dentry->d_sb->s_export_op ||
- (!(fh_flags & EXPORT_FH_FID) &&
- !path->dentry->d_sb->s_export_op->fh_to_dentry))
+ if (!exportfs_can_encode_fh(path->dentry->d_sb->s_export_op, fh_flags))
return -EOPNOTSUPP;
if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
diff --git a/fs/file.c b/fs/file.c
index 3e4a4dfa38fc..5fb0b146e79e 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -604,6 +604,9 @@ void fd_install(unsigned int fd, struct file *file)
struct files_struct *files = current->files;
struct fdtable *fdt;
+ if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING)))
+ return;
+
rcu_read_lock_sched();
if (unlikely(files->resize_in_progress)) {
@@ -853,8 +856,104 @@ void do_close_on_exec(struct files_struct *files)
spin_unlock(&files->file_lock);
}
+static struct file *__get_file_rcu(struct file __rcu **f)
+{
+ struct file __rcu *file;
+ struct file __rcu *file_reloaded;
+ struct file __rcu *file_reloaded_cmp;
+
+ file = rcu_dereference_raw(*f);
+ if (!file)
+ return NULL;
+
+ if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+ return ERR_PTR(-EAGAIN);
+
+ file_reloaded = rcu_dereference_raw(*f);
+
+ /*
+ * Ensure that all accesses have a dependency on the load from
+ * rcu_dereference_raw() above so we get correct ordering
+ * between reuse/allocation and the pointer check below.
+ */
+ file_reloaded_cmp = file_reloaded;
+ OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
+
+ /*
+ * atomic_long_inc_not_zero() above provided a full memory
+ * barrier when we acquired a reference.
+ *
+ * This is paired with the write barrier from assigning to the
+ * __rcu protected file pointer so that if that pointer still
+ * matches the current file, we know we have successfully
+ * acquired a reference to the right file.
+ *
+ * If the pointers don't match the file has been reallocated by
+ * SLAB_TYPESAFE_BY_RCU.
+ */
+ if (file == file_reloaded_cmp)
+ return file_reloaded;
+
+ fput(file);
+ return ERR_PTR(-EAGAIN);
+}
+
+/**
+ * get_file_rcu - try to get a reference to a file under rcu
+ * @f: the file to get a reference on
+ *
+ * This function tries to get a reference on @f, carefully verifying that
+ * @f hasn't been reused.
+ *
+ * This function should rarely have to be used and only by users who
+ * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
+ *
+ * Return: Returns @f with the reference count increased or NULL.
+ */
+struct file *get_file_rcu(struct file __rcu **f)
+{
+ for (;;) {
+ struct file __rcu *file;
+
+ file = __get_file_rcu(f);
+ if (unlikely(!file))
+ return NULL;
+
+ if (unlikely(IS_ERR(file)))
+ continue;
+
+ return file;
+ }
+}
+EXPORT_SYMBOL_GPL(get_file_rcu);
+
+/**
+ * get_file_active - try to get a reference to a file
+ * @f: the file to get a reference on
+ *
+ * In contrast to get_file_rcu() the pointer itself isn't part of the
+ * reference counting.
+ *
+ * This function should rarely have to be used and only by users who
+ * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
+ *
+ * Return: Returns @f with the reference count increased or NULL.
+ */
+struct file *get_file_active(struct file **f)
+{
+ struct file __rcu *file;
+
+ rcu_read_lock();
+ file = __get_file_rcu(f);
+ rcu_read_unlock();
+ if (IS_ERR(file))
+ file = NULL;
+ return file;
+}
+EXPORT_SYMBOL_GPL(get_file_active);
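get_file_rcu() and get_file_active() above exist because the filp cache becomes SLAB_TYPESAFE_BY_RCU later in this diff (see fs/file_table.c): a struct file looked up locklessly may have been recycled, so the pointer has to be re-checked after the refcount bump. A hedged usage sketch; the slot structure is hypothetical, only get_file_active() and fput() come from the source:

#include <linux/fs.h>
#include <linux/file.h>

struct demo_slot {
	struct file *file;	/* published and cleared by other contexts */
};

static void demo_use_published_file(struct demo_slot *slot)
{
	struct file *f;

	/* Returns the file with an extra reference, or NULL if the slot is
	 * empty or the file is already going away. */
	f = get_file_active(&slot->file);
	if (!f)
		return;

	/* ... use f ... */

	fput(f);		/* drop the reference taken above */
}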
+
static inline struct file *__fget_files_rcu(struct files_struct *files,
- unsigned int fd, fmode_t mask)
+ unsigned int fd, fmode_t mask)
{
for (;;) {
struct file *file;
@@ -865,12 +964,6 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
return NULL;
fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
- file = rcu_dereference_raw(*fdentry);
- if (unlikely(!file))
- return NULL;
-
- if (unlikely(file->f_mode & mask))
- return NULL;
/*
* Ok, we have a file pointer. However, because we do
@@ -879,10 +972,15 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
*
* Such a race can take two forms:
*
- * (a) the file ref already went down to zero,
- * and get_file_rcu() fails. Just try again:
+ * (a) the file ref already went down to zero and the
+ * file hasn't been reused yet, or the file count
+ * isn't zero but the file has already been reused.
*/
- if (unlikely(!get_file_rcu(file)))
+ file = __get_file_rcu(fdentry);
+ if (unlikely(!file))
+ return NULL;
+
+ if (unlikely(IS_ERR(file)))
continue;
/*
@@ -893,13 +991,21 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
*
* If so, we need to put our ref and try again.
*/
- if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
- unlikely(rcu_dereference_raw(*fdentry) != file)) {
+ if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
fput(file);
continue;
}
/*
+ * This isn't the file we're looking for or we're not
+ * allowed to get a reference to it.
+ */
+ if (unlikely(file->f_mode & mask)) {
+ fput(file);
+ return NULL;
+ }
+
+ /*
* Ok, we have a ref to the file, and checked that it
* still exists.
*/
@@ -948,7 +1054,14 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
return file;
}
-struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
+struct file *lookup_fdget_rcu(unsigned int fd)
+{
+ return __fget_files_rcu(current->files, fd, 0);
+
+}
+EXPORT_SYMBOL_GPL(lookup_fdget_rcu);
+
+struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd)
{
/* Must be called with rcu_read_lock held */
struct files_struct *files;
@@ -957,13 +1070,13 @@ struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
task_lock(task);
files = task->files;
if (files)
- file = files_lookup_fd_rcu(files, fd);
+ file = __fget_files_rcu(files, fd, 0);
task_unlock(task);
return file;
}
-struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd)
+struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd)
{
/* Must be called with rcu_read_lock held */
struct files_struct *files;
@@ -974,7 +1087,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
files = task->files;
if (files) {
for (; fd < files_fdtable(files)->max_fds; fd++) {
- file = files_lookup_fd_rcu(files, fd);
+ file = __fget_files_rcu(files, fd, 0);
if (file)
break;
}
@@ -983,7 +1096,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
*ret_fd = fd;
return file;
}
-EXPORT_SYMBOL(task_lookup_next_fd_rcu);
+EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
/*
* Lightweight file lookup - no refcnt increment if fd table isn't shared.
@@ -1272,12 +1385,16 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
if (unlikely(newfd == oldfd)) { /* corner case */
struct files_struct *files = current->files;
+ struct file *f;
int retval = oldfd;
rcu_read_lock();
- if (!files_lookup_fd_rcu(files, oldfd))
+ f = __fget_files_rcu(files, oldfd, 0);
+ if (!f)
retval = -EBADF;
rcu_read_unlock();
+ if (f)
+ fput(f);
return retval;
}
return ksys_dup3(oldfd, newfd, 0);
diff --git a/fs/file_table.c b/fs/file_table.c
index ee21b3da9d08..de4a2915bfd4 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -40,14 +40,14 @@ static struct files_stat_struct files_stat = {
};
/* SLAB cache for file structures */
-static struct kmem_cache *filp_cachep __read_mostly;
+static struct kmem_cache *filp_cachep __ro_after_init;
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
-/* Container for backing file with optional real path */
+/* Container for backing file with optional user path */
struct backing_file {
struct file file;
- struct path real_path;
+ struct path user_path;
};
static inline struct backing_file *backing_file(struct file *f)
@@ -55,31 +55,36 @@ static inline struct backing_file *backing_file(struct file *f)
return container_of(f, struct backing_file, file);
}
-struct path *backing_file_real_path(struct file *f)
+struct path *backing_file_user_path(struct file *f)
{
- return &backing_file(f)->real_path;
+ return &backing_file(f)->user_path;
}
-EXPORT_SYMBOL_GPL(backing_file_real_path);
+EXPORT_SYMBOL_GPL(backing_file_user_path);
-static void file_free_rcu(struct rcu_head *head)
+static inline void file_free(struct file *f)
{
- struct file *f = container_of(head, struct file, f_rcuhead);
-
+ security_file_free(f);
+ if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
+ percpu_counter_dec(&nr_files);
put_cred(f->f_cred);
- if (unlikely(f->f_mode & FMODE_BACKING))
+ if (unlikely(f->f_mode & FMODE_BACKING)) {
+ path_put(backing_file_user_path(f));
kfree(backing_file(f));
- else
+ } else {
kmem_cache_free(filp_cachep, f);
+ }
}
-static inline void file_free(struct file *f)
+void release_empty_file(struct file *f)
{
- security_file_free(f);
- if (unlikely(f->f_mode & FMODE_BACKING))
- path_put(backing_file_real_path(f));
- if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
- percpu_counter_dec(&nr_files);
- call_rcu(&f->f_rcuhead, file_free_rcu);
+ WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED));
+ if (atomic_long_dec_and_test(&f->f_count)) {
+ security_file_free(f);
+ put_cred(f->f_cred);
+ if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
+ percpu_counter_dec(&nr_files);
+ kmem_cache_free(filp_cachep, f);
+ }
}
/*
@@ -164,7 +169,6 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
return error;
}
- atomic_long_set(&f->f_count, 1);
rwlock_init(&f->f_owner.lock);
spin_lock_init(&f->f_lock);
mutex_init(&f->f_pos_lock);
@@ -172,6 +176,12 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
f->f_mode = OPEN_FMODE(flags);
/* f->f_version: 0 */
+ /*
+ * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
+ * fget-rcu pattern users need to be able to handle spurious
+ * refcount bumps, we should reinitialize the reused file first.
+ */
+ atomic_long_set(&f->f_count, 1);
return 0;
}
@@ -471,7 +481,8 @@ EXPORT_SYMBOL(__fput_sync);
void __init files_init(void)
{
filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
- SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
+ SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index ac5d43b164b5..20600e9ea202 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -109,11 +109,9 @@ static inline void dip2vip_cpy(struct vxfs_sb_info *sbi,
set_nlink(inode, vip->vii_nlink);
inode->i_size = vip->vii_size;
- inode->i_atime.tv_sec = vip->vii_atime;
+ inode_set_atime(inode, vip->vii_atime, 0);
inode_set_ctime(inode, vip->vii_ctime, 0);
- inode->i_mtime.tv_sec = vip->vii_mtime;
- inode->i_atime.tv_nsec = 0;
- inode->i_mtime.tv_nsec = 0;
+ inode_set_mtime(inode, vip->vii_mtime, 0);
inode->i_blocks = vip->vii_blocks;
inode->i_generation = vip->vii_gen;
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 310d73e254df..e6e2a2185e7c 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -76,6 +76,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
{
struct vxfs_sb_info *infp = VXFS_SBI(dentry->d_sb);
struct vxfs_sb *raw_sb = infp->vsi_raw;
+ u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev);
bufp->f_type = VXFS_SUPER_MAGIC;
bufp->f_bsize = dentry->d_sb->s_blocksize;
@@ -84,6 +85,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
bufp->f_bavail = 0;
bufp->f_files = 0;
bufp->f_ffree = fs32_to_cpu(infp, raw_sb->vs_ifree);
+ bufp->f_fsid = u64_to_fsid(id);
bufp->f_namelen = VXFS_NAMELEN;
return 0;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c1af01b2c42d..1767493dffda 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -613,6 +613,24 @@ out_free:
kfree(isw);
}
+static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
+ struct list_head *list, int *nr)
+{
+ struct inode *inode;
+
+ list_for_each_entry(inode, list, i_io_list) {
+ if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+ continue;
+
+ isw->inodes[*nr] = inode;
+ (*nr)++;
+
+ if (*nr >= WB_MAX_INODES_PER_ISW - 1)
+ return true;
+ }
+ return false;
+}
+
/**
* cleanup_offline_cgwb - detach associated inodes
* @wb: target wb
@@ -625,7 +643,6 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
{
struct cgroup_subsys_state *memcg_css;
struct inode_switch_wbs_context *isw;
- struct inode *inode;
int nr;
bool restart = false;
@@ -647,17 +664,17 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
nr = 0;
spin_lock(&wb->list_lock);
- list_for_each_entry(inode, &wb->b_attached, i_io_list) {
- if (!inode_prepare_wbs_switch(inode, isw->new_wb))
- continue;
-
- isw->inodes[nr++] = inode;
-
- if (nr >= WB_MAX_INODES_PER_ISW - 1) {
- restart = true;
- break;
- }
- }
+ /*
+ * In addition to the inodes that have completed writeback, also switch
+ * cgwbs for those inodes with only dirty timestamps. Otherwise, those
+ * inodes won't be written back for a long time when lazytime is
+ * enabled, thus pinning the dying cgwbs. This won't break the
+ * bandwidth restrictions, as writeback of inode metadata is not
+ * accounted for.
+ */
+ restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
+ if (!restart)
+ restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
spin_unlock(&wb->list_lock);
/* no attached inodes? bail out */
diff --git a/fs/fsopen.c b/fs/fsopen.c
index ce03f6521c88..6593ae518115 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -465,6 +465,7 @@ SYSCALL_DEFINE5(fsconfig,
param.file = fget(aux);
if (!param.file)
goto out_key;
+ param.dirfd = aux;
break;
default:
break;
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index ab62e4624256..284a35006462 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -235,7 +235,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
inode->i_mode = mode;
inode->i_uid = fc->user_id;
inode->i_gid = fc->group_id;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
/* setting ->i_op to NULL is not allowed */
if (iop)
inode->i_op = iop;
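fuse_ctl_add_dentry() (and f2fs_new_inode() earlier) now call simple_inode_init_ts() instead of assigning i_atime/i_mtime from inode_set_ctime_current(). A minimal sketch, assuming simple_inode_init_ts() sets all three timestamps from one current-time reading and returns the timespec64 it stored (which is how the f2fs hunk's follow-up inode_get_mtime() read uses it):

#include <linux/fs.h>

static void demo_init_new_inode_times(struct inode *inode)
{
	/* atime = mtime = ctime = "now", in one call. */
	struct timespec64 now = simple_inode_init_ts(inode);

	/* A filesystem with a creation-time field could record it here,
	 * e.g. my_info(inode)->crtime = now;  (hypothetical field) */
	(void)now;
}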
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 23904a6a9a96..12ef91d170bb 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -1222,6 +1222,7 @@ void fuse_dax_conn_free(struct fuse_conn *fc)
if (fc->dax) {
fuse_free_dax_mem_ranges(&fc->dax->free_ranges);
kfree(fc->dax);
+ fc->dax = NULL;
}
}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index d707e6987da9..d19cbf34c634 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1812,12 +1812,12 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
memset(&outarg, 0, sizeof(outarg));
inarg.valid = FATTR_MTIME;
- inarg.mtime = inode->i_mtime.tv_sec;
- inarg.mtimensec = inode->i_mtime.tv_nsec;
+ inarg.mtime = inode_get_mtime_sec(inode);
+ inarg.mtimensec = inode_get_mtime_nsec(inode);
if (fm->fc->minor >= 23) {
inarg.valid |= FATTR_CTIME;
- inarg.ctime = inode_get_ctime(inode).tv_sec;
- inarg.ctimensec = inode_get_ctime(inode).tv_nsec;
+ inarg.ctime = inode_get_ctime_sec(inode);
+ inarg.ctimensec = inode_get_ctime_nsec(inode);
}
if (ff) {
inarg.valid |= FATTR_FH;
@@ -1956,7 +1956,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
/* the kernel maintains i_mtime locally */
if (trust_local_cmtime) {
if (attr->ia_valid & ATTR_MTIME)
- inode->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
if (attr->ia_valid & ATTR_CTIME)
inode_set_ctime_to_ts(inode, attr->ia_ctime);
/* FIXME: clear I_DIRTY_SYNC? */
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 1cdb6327511e..a660f1f21540 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1448,7 +1448,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
if (!ia)
return -ENOMEM;
- if (fopen_direct_io && fc->direct_io_relax) {
+ if (fopen_direct_io && fc->direct_io_allow_mmap) {
res = filemap_write_and_wait_range(mapping, pos, pos + count - 1);
if (res) {
fuse_io_free(ia);
@@ -1574,6 +1574,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
ssize_t res;
bool exclusive_lock =
!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) ||
+ get_fuse_conn(inode)->direct_io_allow_mmap ||
iocb->ki_flags & IOCB_APPEND ||
fuse_direct_write_extending_i_size(iocb, from);
@@ -1581,6 +1582,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
* Take exclusive lock if
* - Parallel direct writes are disabled - a user space decision
* - Parallel direct writes are enabled and i_size is being extended.
+ * - Shared mmap on direct_io file is supported (FUSE_DIRECT_IO_ALLOW_MMAP).
* This might not be needed at all, but needs further investigation.
*/
if (exclusive_lock)
@@ -2466,9 +2468,9 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
if (ff->open_flags & FOPEN_DIRECT_IO) {
/* Can't provide the coherency needed for MAP_SHARED
- * if FUSE_DIRECT_IO_RELAX isn't set.
+ * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set.
*/
- if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_relax)
+ if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap)
return -ENODEV;
invalidate_inode_pages2(file->f_mapping);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index bf0b85d0b95c..1df83eebda92 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -63,6 +63,19 @@ struct fuse_forget_link {
struct fuse_forget_link *next;
};
+/* Submount lookup tracking */
+struct fuse_submount_lookup {
+ /** Refcount */
+ refcount_t count;
+
+ /** Unique ID, which identifies the inode between userspace
+ * and kernel */
+ u64 nodeid;
+
+ /** The request used for sending the FORGET message */
+ struct fuse_forget_link *forget;
+};
+
/** FUSE inode */
struct fuse_inode {
/** Inode data */
@@ -158,6 +171,8 @@ struct fuse_inode {
*/
struct fuse_inode_dax *dax;
#endif
+ /** Submount specific lookup tracking */
+ struct fuse_submount_lookup *submount_lookup;
};
/** FUSE inode state bits */
@@ -797,8 +812,8 @@ struct fuse_conn {
/* Is tmpfile not implemented by fs? */
unsigned int no_tmpfile:1;
- /* relax restrictions in FOPEN_DIRECT_IO mode */
- unsigned int direct_io_relax:1;
+ /* Relax restrictions to allow shared mmap in FOPEN_DIRECT_IO mode */
+ unsigned int direct_io_allow_mmap:1;
/* Is statx not implemented by fs? */
unsigned int no_statx:1;
@@ -1284,7 +1299,7 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
size_t size);
ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size);
int fuse_removexattr(struct inode *inode, const char *name);
-extern const struct xattr_handler *fuse_xattr_handlers[];
+extern const struct xattr_handler * const fuse_xattr_handlers[];
struct posix_acl;
struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu);
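The new struct fuse_submount_lookup above lets a submount share its parent's lookup state so that the final FORGET for the nodeid is sent only once every user is gone. Stripped of the fuse specifics, the lifecycle in the fs/fuse/inode.c hunk below is the usual refcount_t pattern; a sketch with a hypothetical demo type:

#include <linux/refcount.h>
#include <linux/slab.h>

struct demo_lookup {
	refcount_t count;
	u64 nodeid;
};

static struct demo_lookup *demo_lookup_create(u64 nodeid)
{
	struct demo_lookup *sl = kzalloc(sizeof(*sl), GFP_KERNEL);

	if (!sl)
		return NULL;
	sl->nodeid = nodeid;
	refcount_set(&sl->count, 1);	/* creator's reference */
	return sl;
}

static void demo_lookup_share(struct demo_lookup *sl)
{
	refcount_inc(&sl->count);	/* each submount takes its own reference */
}

static void demo_lookup_put(struct demo_lookup *sl)
{
	if (!refcount_dec_and_test(&sl->count))
		return;
	/* Last user: this is the point where fuse queues the final FORGET. */
	kfree(sl);
}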
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2e4eb7cf26fb..2a6d44f91729 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -68,6 +68,24 @@ struct fuse_forget_link *fuse_alloc_forget(void)
return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL_ACCOUNT);
}
+static struct fuse_submount_lookup *fuse_alloc_submount_lookup(void)
+{
+ struct fuse_submount_lookup *sl;
+
+ sl = kzalloc(sizeof(struct fuse_submount_lookup), GFP_KERNEL_ACCOUNT);
+ if (!sl)
+ return NULL;
+ sl->forget = fuse_alloc_forget();
+ if (!sl->forget)
+ goto out_free;
+
+ return sl;
+
+out_free:
+ kfree(sl);
+ return NULL;
+}
+
static struct inode *fuse_alloc_inode(struct super_block *sb)
{
struct fuse_inode *fi;
@@ -83,6 +101,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
fi->attr_version = 0;
fi->orig_ino = 0;
fi->state = 0;
+ fi->submount_lookup = NULL;
mutex_init(&fi->mutex);
spin_lock_init(&fi->lock);
fi->forget = fuse_alloc_forget();
@@ -113,6 +132,17 @@ static void fuse_free_inode(struct inode *inode)
kmem_cache_free(fuse_inode_cachep, fi);
}
+static void fuse_cleanup_submount_lookup(struct fuse_conn *fc,
+ struct fuse_submount_lookup *sl)
+{
+ if (!refcount_dec_and_test(&sl->count))
+ return;
+
+ fuse_queue_forget(fc, sl->forget, sl->nodeid, 1);
+ sl->forget = NULL;
+ kfree(sl);
+}
+
static void fuse_evict_inode(struct inode *inode)
{
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -132,6 +162,11 @@ static void fuse_evict_inode(struct inode *inode)
fi->nlookup);
fi->forget = NULL;
}
+
+ if (fi->submount_lookup) {
+ fuse_cleanup_submount_lookup(fc, fi->submount_lookup);
+ fi->submount_lookup = NULL;
+ }
}
if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) {
WARN_ON(!list_empty(&fi->write_files));
@@ -188,12 +223,10 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1);
attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1);
- inode->i_atime.tv_sec = attr->atime;
- inode->i_atime.tv_nsec = attr->atimensec;
+ inode_set_atime(inode, attr->atime, attr->atimensec);
/* mtime from server may be stale due to local buffered write */
if (!(cache_mask & STATX_MTIME)) {
- inode->i_mtime.tv_sec = attr->mtime;
- inode->i_mtime.tv_nsec = attr->mtimensec;
+ inode_set_mtime(inode, attr->mtime, attr->mtimensec);
}
if (!(cache_mask & STATX_CTIME)) {
inode_set_ctime(inode, attr->ctime, attr->ctimensec);
@@ -276,12 +309,12 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
attr->size = i_size_read(inode);
if (cache_mask & STATX_MTIME) {
- attr->mtime = inode->i_mtime.tv_sec;
- attr->mtimensec = inode->i_mtime.tv_nsec;
+ attr->mtime = inode_get_mtime_sec(inode);
+ attr->mtimensec = inode_get_mtime_nsec(inode);
}
if (cache_mask & STATX_CTIME) {
- attr->ctime = inode_get_ctime(inode).tv_sec;
- attr->ctimensec = inode_get_ctime(inode).tv_nsec;
+ attr->ctime = inode_get_ctime_sec(inode);
+ attr->ctimensec = inode_get_ctime_nsec(inode);
}
if ((attr_version != 0 && fi->attr_version > attr_version) ||
@@ -290,7 +323,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
return;
}
- old_mtime = inode->i_mtime;
+ old_mtime = inode_get_mtime(inode);
fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask);
oldsize = inode->i_size;
@@ -332,13 +365,19 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
fuse_dax_dontcache(inode, attr->flags);
}
+static void fuse_init_submount_lookup(struct fuse_submount_lookup *sl,
+ u64 nodeid)
+{
+ sl->nodeid = nodeid;
+ refcount_set(&sl->count, 1);
+}
+
static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr,
struct fuse_conn *fc)
{
inode->i_mode = attr->mode & S_IFMT;
inode->i_size = attr->size;
- inode->i_mtime.tv_sec = attr->mtime;
- inode->i_mtime.tv_nsec = attr->mtimensec;
+ inode_set_mtime(inode, attr->mtime, attr->mtimensec);
inode_set_ctime(inode, attr->ctime, attr->ctimensec);
if (S_ISREG(inode->i_mode)) {
fuse_init_common(inode);
@@ -395,12 +434,22 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
*/
if (fc->auto_submounts && (attr->flags & FUSE_ATTR_SUBMOUNT) &&
S_ISDIR(attr->mode)) {
+ struct fuse_inode *fi;
+
inode = new_inode(sb);
if (!inode)
return NULL;
fuse_init_inode(inode, attr, fc);
- get_fuse_inode(inode)->nodeid = nodeid;
+ fi = get_fuse_inode(inode);
+ fi->nodeid = nodeid;
+ fi->submount_lookup = fuse_alloc_submount_lookup();
+ if (!fi->submount_lookup) {
+ iput(inode);
+ return NULL;
+ }
+ /* Sets count = 1 on fi->submount_lookup (the shared lookup reference) */
+ fuse_init_submount_lookup(fi->submount_lookup, nodeid);
inode->i_flags |= S_AUTOMOUNT;
goto done;
}
@@ -423,11 +472,11 @@ retry:
iput(inode);
goto retry;
}
-done:
fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
fi->nlookup++;
spin_unlock(&fi->lock);
+done:
fuse_change_attributes(inode, attr, NULL, attr_valid, attr_version);
return inode;
@@ -1002,7 +1051,7 @@ static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len,
}
*max_len = len;
- return parent ? 0x82 : 0x81;
+ return parent ? FILEID_INO64_GEN_PARENT : FILEID_INO64_GEN;
}
static struct dentry *fuse_fh_to_dentry(struct super_block *sb,
@@ -1010,7 +1059,8 @@ static struct dentry *fuse_fh_to_dentry(struct super_block *sb,
{
struct fuse_inode_handle handle;
- if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3)
+ if ((fh_type != FILEID_INO64_GEN &&
+ fh_type != FILEID_INO64_GEN_PARENT) || fh_len < 3)
return NULL;
handle.nodeid = (u64) fid->raw[0] << 32;
@@ -1024,7 +1074,7 @@ static struct dentry *fuse_fh_to_parent(struct super_block *sb,
{
struct fuse_inode_handle parent;
- if (fh_type != 0x82 || fh_len < 6)
+ if (fh_type != FILEID_INO64_GEN_PARENT || fh_len < 6)
return NULL;
parent.nodeid = (u64) fid->raw[3] << 32;
@@ -1232,8 +1282,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
fc->init_security = 1;
if (flags & FUSE_CREATE_SUPP_GROUP)
fc->create_supp_group = 1;
- if (flags & FUSE_DIRECT_IO_RELAX)
- fc->direct_io_relax = 1;
+ if (flags & FUSE_DIRECT_IO_ALLOW_MMAP)
+ fc->direct_io_allow_mmap = 1;
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
@@ -1280,7 +1330,7 @@ void fuse_send_init(struct fuse_mount *fm)
FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA |
FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT |
FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP |
- FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_RELAX;
+ FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP;
#ifdef CONFIG_FUSE_DAX
if (fm->fc->dax)
flags |= FUSE_MAP_ALIGNMENT;
@@ -1423,17 +1473,19 @@ EXPORT_SYMBOL_GPL(fuse_dev_free);
static void fuse_fill_attr_from_inode(struct fuse_attr *attr,
const struct fuse_inode *fi)
{
+ struct timespec64 atime = inode_get_atime(&fi->inode);
+ struct timespec64 mtime = inode_get_mtime(&fi->inode);
struct timespec64 ctime = inode_get_ctime(&fi->inode);
*attr = (struct fuse_attr){
.ino = fi->inode.i_ino,
.size = fi->inode.i_size,
.blocks = fi->inode.i_blocks,
- .atime = fi->inode.i_atime.tv_sec,
- .mtime = fi->inode.i_mtime.tv_sec,
+ .atime = atime.tv_sec,
+ .mtime = mtime.tv_sec,
.ctime = ctime.tv_sec,
- .atimensec = fi->inode.i_atime.tv_nsec,
- .mtimensec = fi->inode.i_mtime.tv_nsec,
+ .atimensec = atime.tv_nsec,
+ .mtimensec = mtime.tv_nsec,
.ctimensec = ctime.tv_nsec,
.mode = fi->inode.i_mode,
.nlink = fi->inode.i_nlink,
@@ -1465,6 +1517,8 @@ static int fuse_fill_super_submount(struct super_block *sb,
struct super_block *parent_sb = parent_fi->inode.i_sb;
struct fuse_attr root_attr;
struct inode *root;
+ struct fuse_submount_lookup *sl;
+ struct fuse_inode *fi;
fuse_sb_defaults(sb);
fm->sb = sb;
@@ -1487,12 +1541,27 @@ static int fuse_fill_super_submount(struct super_block *sb,
* its nlookup should not be incremented. fuse_iget() does
* that, though, so undo it here.
*/
- get_fuse_inode(root)->nlookup--;
+ fi = get_fuse_inode(root);
+ fi->nlookup--;
+
sb->s_d_op = &fuse_dentry_operations;
sb->s_root = d_make_root(root);
if (!sb->s_root)
return -ENOMEM;
+ /*
+ * Grab the parent's submount_lookup pointer and take a
+ * reference on the shared nlookup from the parent. This is to
+ * prevent the last forget for this nodeid from getting
+ * triggered until all users have finished with it.
+ */
+ sl = parent_fi->submount_lookup;
+ WARN_ON(!sl);
+ if (sl) {
+ refcount_inc(&sl->count);
+ fi->submount_lookup = sl;
+ }
+
return 0;
}
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 9e6d587b3e67..c66a54d6c7d3 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -476,7 +476,7 @@ retry_locked:
if (!fi->rdc.cached) {
/* Starting cache? Set cache mtime. */
if (!ctx->pos && !fi->rdc.size) {
- fi->rdc.mtime = inode->i_mtime;
+ fi->rdc.mtime = inode_get_mtime(inode);
fi->rdc.iversion = inode_query_iversion(inode);
}
spin_unlock(&fi->rdc.lock);
@@ -488,8 +488,10 @@ retry_locked:
* changed, and reset the cache if so.
*/
if (!ctx->pos) {
+ struct timespec64 mtime = inode_get_mtime(inode);
+
if (inode_peek_iversion(inode) != fi->rdc.iversion ||
- !timespec64_equal(&fi->rdc.mtime, &inode->i_mtime)) {
+ !timespec64_equal(&fi->rdc.mtime, &mtime)) {
fuse_rdc_reset(inode);
goto retry_locked;
}
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 49c01559580f..5b423fdbb13f 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -209,7 +209,7 @@ static const struct xattr_handler fuse_xattr_handler = {
.set = fuse_xattr_set,
};
-const struct xattr_handler *fuse_xattr_handlers[] = {
+const struct xattr_handler * const fuse_xattr_handlers[] = {
&fuse_xattr_handler,
NULL
};
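Both the f2fs and fuse hunks constify their xattr handler tables ('const struct xattr_handler * const'), letting the arrays live in read-only data. A minimal sketch of a table in that shape; the handler itself is a stub with its .get/.set callbacks omitted and a made-up prefix:

#include <linux/xattr.h>

static const struct xattr_handler demo_xattr_handler = {
	.prefix	= "demo.",		/* namespace handled by this entry */
	/* .get / .set would go here in a real handler */
};

/* Neither the array slots nor the handlers they point to are writable,
 * matching the new declarations above. */
static const struct xattr_handler * const demo_xattr_handlers[] = {
	&demo_xattr_handler,
	NULL,
};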
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index d4deb2b19959..82f5b09c04e6 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -11,9 +11,9 @@
#define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12)
-extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu);
-extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-extern int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
- struct posix_acl *acl, int type);
+struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu);
+int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct posix_acl *acl, int type);
#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index c26d48355cc2..9611bfceda4b 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -130,7 +130,7 @@ static int __gfs2_jdata_write_folio(struct folio *folio,
if (folio_test_checked(folio)) {
folio_clear_checked(folio);
if (!folio_buffers(folio)) {
- folio_create_empty_buffers(folio,
+ create_empty_buffers(folio,
inode->i_sb->s_blocksize,
BIT(BH_Dirty)|BIT(BH_Uptodate));
}
@@ -155,7 +155,7 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
+ if (gfs2_assert_withdraw(sdp, ip->i_gl->gl_state == LM_ST_EXCLUSIVE))
goto out;
if (folio_test_checked(folio) || current->journal_info)
goto out_ignore;
@@ -214,12 +214,12 @@ static int gfs2_write_jdata_batch(struct address_space *mapping,
unsigned nrblocks;
int i;
int ret;
- int nr_pages = 0;
+ size_t size = 0;
int nr_folios = folio_batch_count(fbatch);
for (i = 0; i < nr_folios; i++)
- nr_pages += folio_nr_pages(fbatch->folios[i]);
- nrblocks = nr_pages * (PAGE_SIZE >> inode->i_blkbits);
+ size += folio_size(fbatch->folios[i]);
+ nrblocks = size >> inode->i_blkbits;
ret = gfs2_trans_begin(sdp, nrblocks, nrblocks);
if (ret < 0)
@@ -403,27 +403,27 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
}
/**
- * stuffed_readpage - Fill in a Linux page with stuffed file data
+ * stuffed_readpage - Fill in a Linux folio with stuffed file data
* @ip: the inode
- * @page: the page
+ * @folio: the folio
*
* Returns: errno
*/
-static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
+static int stuffed_readpage(struct gfs2_inode *ip, struct folio *folio)
{
struct buffer_head *dibh;
- u64 dsize = i_size_read(&ip->i_inode);
- void *kaddr;
+ size_t i_size = i_size_read(&ip->i_inode);
+ void *data;
int error;
/*
* Due to the order of unstuffing files and ->fault(), we can be
- * asked for a zero page in the case of a stuffed file being extended,
+ * asked for a zero folio in the case of a stuffed file being extended,
* so we need to supply one here. It doesn't happen often.
*/
- if (unlikely(page->index)) {
- zero_user(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
+ if (unlikely(folio->index)) {
+ folio_zero_range(folio, 0, folio_size(folio));
+ folio_mark_uptodate(folio);
return 0;
}
@@ -431,13 +431,11 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
if (error)
return error;
- kaddr = kmap_local_page(page);
- memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
- memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
- kunmap_local(kaddr);
- flush_dcache_page(page);
+ data = dibh->b_data + sizeof(struct gfs2_dinode);
+ memcpy_to_folio(folio, 0, data, i_size);
+ folio_zero_range(folio, i_size, folio_size(folio) - i_size);
brelse(dibh);
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
return 0;
}
@@ -458,7 +456,7 @@ static int gfs2_read_folio(struct file *file, struct folio *folio)
(i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) {
error = iomap_read_folio(folio, &gfs2_iomap_ops);
} else if (gfs2_is_stuffed(ip)) {
- error = stuffed_readpage(ip, &folio->page);
+ error = stuffed_readpage(ip, folio);
folio_unlock(folio);
} else {
error = mpage_read_folio(folio, gfs2_block_map);
@@ -479,31 +477,29 @@ static int gfs2_read_folio(struct file *file, struct folio *folio)
*
*/
-int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
- unsigned size)
+ssize_t gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
+ size_t size)
{
struct address_space *mapping = ip->i_inode.i_mapping;
unsigned long index = *pos >> PAGE_SHIFT;
- unsigned offset = *pos & (PAGE_SIZE - 1);
- unsigned copied = 0;
- unsigned amt;
- struct page *page;
+ size_t copied = 0;
do {
- page = read_cache_page(mapping, index, gfs2_read_folio, NULL);
- if (IS_ERR(page)) {
- if (PTR_ERR(page) == -EINTR)
+ size_t offset, chunk;
+ struct folio *folio;
+
+ folio = read_cache_folio(mapping, index, gfs2_read_folio, NULL);
+ if (IS_ERR(folio)) {
+ if (PTR_ERR(folio) == -EINTR)
continue;
- return PTR_ERR(page);
+ return PTR_ERR(folio);
}
- amt = size - copied;
- if (offset + size > PAGE_SIZE)
- amt = PAGE_SIZE - offset;
- memcpy_from_page(buf + copied, page, offset, amt);
- put_page(page);
- copied += amt;
- index++;
- offset = 0;
+ offset = *pos + copied - folio_pos(folio);
+ chunk = min(size - copied, folio_size(folio) - offset);
+ memcpy_from_folio(buf + copied, folio, offset, chunk);
+ index = folio_next_index(folio);
+ folio_put(folio);
+ copied += chunk;
} while(copied < size);
(*pos) += size;
return size;
diff --git a/fs/gfs2/aops.h b/fs/gfs2/aops.h
index f08322ef41cf..a10c4334d248 100644
--- a/fs/gfs2/aops.h
+++ b/fs/gfs2/aops.h
@@ -8,8 +8,8 @@
#include "incore.h"
-extern void adjust_fs_space(struct inode *inode);
-extern void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio,
- size_t from, size_t len);
+void adjust_fs_space(struct inode *inode);
+void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio,
+ size_t from, size_t len);
#endif /* __AOPS_DOT_H__ */
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index ef7017fb6951..d9ccfd27e4f1 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -43,53 +43,51 @@ struct metapath {
static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);
/**
- * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
+ * gfs2_unstuffer_folio - unstuff a stuffed inode into a block cached by a folio
* @ip: the inode
* @dibh: the dinode buffer
* @block: the block number that was allocated
- * @page: The (optional) page. This is looked up if @page is NULL
+ * @folio: The folio.
*
* Returns: errno
*/
-
-static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
- u64 block, struct page *page)
+static int gfs2_unstuffer_folio(struct gfs2_inode *ip, struct buffer_head *dibh,
+ u64 block, struct folio *folio)
{
struct inode *inode = &ip->i_inode;
- if (!PageUptodate(page)) {
- void *kaddr = kmap(page);
+ if (!folio_test_uptodate(folio)) {
+ void *kaddr = kmap_local_folio(folio, 0);
u64 dsize = i_size_read(inode);
memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
- memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
- kunmap(page);
+ memset(kaddr + dsize, 0, folio_size(folio) - dsize);
+ kunmap_local(kaddr);
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
if (gfs2_is_jdata(ip)) {
- struct buffer_head *bh;
+ struct buffer_head *bh = folio_buffers(folio);
- if (!page_has_buffers(page))
- create_empty_buffers(page, BIT(inode->i_blkbits),
- BIT(BH_Uptodate));
+ if (!bh)
+ bh = create_empty_buffers(folio,
+ BIT(inode->i_blkbits), BIT(BH_Uptodate));
- bh = page_buffers(page);
if (!buffer_mapped(bh))
map_bh(bh, inode->i_sb, block);
set_buffer_uptodate(bh);
gfs2_trans_add_data(ip->i_gl, bh);
} else {
- set_page_dirty(page);
+ folio_mark_dirty(folio);
gfs2_ordered_add_inode(ip);
}
return 0;
}
-static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page)
+static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct folio *folio)
{
struct buffer_head *bh, *dibh;
struct gfs2_dinode *di;
@@ -106,7 +104,7 @@ static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page)
and write it out to disk */
unsigned int n = 1;
- error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
+ error = gfs2_alloc_blocks(ip, &block, &n, 0);
if (error)
goto out_brelse;
if (isdir) {
@@ -118,7 +116,7 @@ static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page)
dibh, sizeof(struct gfs2_dinode));
brelse(bh);
} else {
- error = gfs2_unstuffer_page(ip, dibh, block, page);
+ error = gfs2_unstuffer_folio(ip, dibh, block, folio);
if (error)
goto out_brelse;
}
@@ -157,17 +155,17 @@ out_brelse:
int gfs2_unstuff_dinode(struct gfs2_inode *ip)
{
struct inode *inode = &ip->i_inode;
- struct page *page;
+ struct folio *folio;
int error;
down_write(&ip->i_rw_mutex);
- page = grab_cache_page(inode->i_mapping, 0);
- error = -ENOMEM;
- if (!page)
+ folio = filemap_grab_folio(inode->i_mapping, 0);
+ error = PTR_ERR(folio);
+ if (IS_ERR(folio))
goto out;
- error = __gfs2_unstuff_inode(ip, page);
- unlock_page(page);
- put_page(page);
+ error = __gfs2_unstuff_inode(ip, folio);
+ folio_unlock(folio);
+ folio_put(folio);
out:
up_write(&ip->i_rw_mutex);
return error;
@@ -317,6 +315,12 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
}
}
+static inline struct buffer_head *
+metapath_dibh(struct metapath *mp)
+{
+ return mp->mp_bh[0];
+}
+
static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
unsigned int x, unsigned int h)
{
@@ -415,13 +419,12 @@ static void release_metapath(struct metapath *mp)
* gfs2_extent_length - Returns length of an extent of blocks
* @bh: The metadata block
* @ptr: Current position in @bh
- * @limit: Max extent length to return
* @eob: Set to 1 if we hit "end of block"
*
* Returns: The length of the extent (minimum of one block)
*/
-static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob)
+static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, int *eob)
{
const __be64 *end = (__be64 *)(bh->b_data + bh->b_size);
const __be64 *first = ptr;
@@ -660,7 +663,7 @@ static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct buffer_head *dibh = mp->mp_bh[0];
+ struct buffer_head *dibh = metapath_dibh(mp);
u64 bn;
unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
size_t dblks = iomap->length >> inode->i_blkbits;
@@ -702,7 +705,7 @@ static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
i = mp->mp_aheight;
do {
n = blks - alloced;
- ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
+ ret = gfs2_alloc_blocks(ip, &bn, &n, 0);
if (ret)
goto out;
alloced += n;
@@ -913,7 +916,7 @@ unstuff:
goto do_alloc;
bh = mp->mp_bh[ip->i_height - 1];
- len = gfs2_extent_length(bh, ptr, len, &eob);
+ len = gfs2_extent_length(bh, ptr, &eob);
iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
iomap->length = len << inode->i_blkbits;
@@ -1386,7 +1389,7 @@ static int trunc_start(struct inode *inode, u64 newsize)
ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
i_size_write(inode, newsize);
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
gfs2_dinode_out(ip, dibh->b_data);
if (journaled)
@@ -1583,7 +1586,7 @@ out_unlock:
/* Every transaction boundary, we rewrite the dinode
to keep its di_blocks current in case of failure. */
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -1949,7 +1952,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
gfs2_statfs_change(sdp, 0, +btotal, 0);
gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
ip->i_inode.i_gid);
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
up_write(&ip->i_rw_mutex);
@@ -1992,7 +1995,7 @@ static int trunc_end(struct gfs2_inode *ip)
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
gfs2_ordered_del_inode(ip);
}
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
gfs2_trans_add_meta(ip->i_gl, dibh);
@@ -2093,7 +2096,7 @@ static int do_grow(struct inode *inode, u64 size)
goto do_end_trans;
truncate_setsize(inode, size);
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
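
Illustrative sketch (not part of the patch): the buffer-attach idiom this file now uses. create_empty_buffers() returns the first buffer head, so the caller asks the folio for its buffers and only creates them when none are attached yet, as in gfs2_unstuffer_folio() above; the helper name below is hypothetical.

#include <linux/buffer_head.h>
#include <linux/pagemap.h>

/* Return the folio's first buffer head, creating the buffers if needed. */
static struct buffer_head *get_or_create_buffers(struct folio *folio,
						 unsigned long blocksize)
{
	struct buffer_head *bh = folio_buffers(folio);

	if (!bh)
		bh = create_empty_buffers(folio, blocksize, 0);
	return bh;
}
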
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index e5b7d17131ed..4e8b1e8ebdf3 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -46,24 +46,24 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
extern const struct iomap_ops gfs2_iomap_ops;
extern const struct iomap_writeback_ops gfs2_writeback_ops;
-extern int gfs2_unstuff_dinode(struct gfs2_inode *ip);
-extern int gfs2_block_map(struct inode *inode, sector_t lblock,
- struct buffer_head *bh, int create);
-extern int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
- struct iomap *iomap);
-extern int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length,
- struct iomap *iomap);
-extern int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock,
- unsigned int *extlen);
-extern int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock,
- unsigned *extlen, bool *new);
-extern int gfs2_setattr_size(struct inode *inode, u64 size);
-extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
-extern int gfs2_file_dealloc(struct gfs2_inode *ip);
-extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
- unsigned int len);
-extern int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd);
-extern void gfs2_free_journal_extents(struct gfs2_jdesc *jd);
-extern int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length);
+int gfs2_unstuff_dinode(struct gfs2_inode *ip);
+int gfs2_block_map(struct inode *inode, sector_t lblock,
+ struct buffer_head *bh, int create);
+int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
+ struct iomap *iomap);
+int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length,
+ struct iomap *iomap);
+int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock,
+ unsigned int *extlen);
+int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock,
+ unsigned *extlen, bool *new);
+int gfs2_setattr_size(struct inode *inode, u64 size);
+int gfs2_truncatei_resume(struct gfs2_inode *ip);
+int gfs2_file_dealloc(struct gfs2_inode *ip);
+int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
+ unsigned int len);
+int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd);
+void gfs2_free_journal_extents(struct gfs2_jdesc *jd);
+int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length);
#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 1a2afa88f8be..560e4624c09f 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -130,7 +130,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
if (ip->i_inode.i_size < offset + size)
i_size_write(&ip->i_inode, offset + size);
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -227,7 +227,7 @@ out:
if (ip->i_inode.i_size < offset + copied)
i_size_write(&ip->i_inode, offset + copied);
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
@@ -868,7 +868,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
struct gfs2_dirent *dent;
struct timespec64 tv = current_time(inode);
- error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
+ error = gfs2_alloc_blocks(ip, &bn, &n, 0);
if (error)
return NULL;
bh = gfs2_meta_new(ip->i_gl, bn);
@@ -1825,7 +1825,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
da->bh = NULL;
brelse(bh);
ip->i_entries++;
- ip->i_inode.i_mtime = tv;
+ inode_set_mtime_to_ts(&ip->i_inode, tv);
if (S_ISDIR(nip->i_inode.i_mode))
inc_nlink(&ip->i_inode);
mark_inode_dirty(inode);
@@ -1911,7 +1911,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
if (!dip->i_entries)
gfs2_consist_inode(dip);
dip->i_entries--;
- dip->i_inode.i_mtime = tv;
+ inode_set_mtime_to_ts(&dip->i_inode, tv);
if (d_is_dir(dentry))
drop_nlink(&dip->i_inode);
mark_inode_dirty(&dip->i_inode);
@@ -1952,7 +1952,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
dent->de_type = cpu_to_be16(new_type);
brelse(bh);
- dip->i_inode.i_mtime = inode_set_ctime_current(&dip->i_inode);
+ inode_set_mtime_to_ts(&dip->i_inode, inode_set_ctime_current(&dip->i_inode));
mark_inode_dirty_sync(&dip->i_inode);
return 0;
}
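
Illustrative sketch (not part of the patch): the timestamp idiom repeated throughout this series. inode_set_ctime_current() stamps ctime and returns the new value, which is then handed to inode_set_mtime_to_ts() so mtime and ctime stay identical; touch_mtime_and_ctime() is a hypothetical helper name.

#include <linux/fs.h>

/* Stamp mtime and ctime with the same "now" value, then mark the inode dirty. */
static void touch_mtime_and_ctime(struct inode *inode)
{
	struct timespec64 now = inode_set_ctime_current(inode);

	inode_set_mtime_to_ts(inode, now);
	mark_inode_dirty_sync(inode);
}
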
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 5b76480c17c9..25a857c78b53 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -23,32 +23,32 @@ struct gfs2_diradd {
int save_loc;
};
-extern struct inode *gfs2_dir_search(struct inode *dir,
- const struct qstr *filename,
- bool fail_on_exist);
-extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
- const struct gfs2_inode *ip);
-extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
- const struct gfs2_inode *ip, struct gfs2_diradd *da);
+struct inode *gfs2_dir_search(struct inode *dir,
+ const struct qstr *filename,
+ bool fail_on_exist);
+int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
+ const struct gfs2_inode *ip);
+int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
+ const struct gfs2_inode *ip, struct gfs2_diradd *da);
static inline void gfs2_dir_no_add(struct gfs2_diradd *da)
{
brelse(da->bh);
da->bh = NULL;
}
-extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
-extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
- struct file_ra_state *f_ra);
-extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
- const struct gfs2_inode *nip, unsigned int new_type);
+int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
+int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
+ struct file_ra_state *f_ra);
+int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
+ const struct gfs2_inode *nip, unsigned int new_type);
-extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
+int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
-extern int gfs2_diradd_alloc_required(struct inode *dir,
- const struct qstr *filename,
- struct gfs2_diradd *da);
-extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
- struct buffer_head **bhp);
-extern void gfs2_dir_hash_inval(struct gfs2_inode *ip);
+int gfs2_diradd_alloc_required(struct inode *dir,
+ const struct qstr *filename,
+ struct gfs2_diradd *da);
+int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
+ struct buffer_head **bhp);
+void gfs2_dir_hash_inval(struct gfs2_inode *ip);
static inline u32 gfs2_disk_hash(const char *data, int len)
{
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index f2700477a300..4b66efc1a82a 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -418,7 +418,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
struct inode *inode = file_inode(vmf->vma->vm_file);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct gfs2_alloc_parms ap = { .aflags = 0, };
+ struct gfs2_alloc_parms ap = {};
u64 offset = page_offset(page);
unsigned int data_blocks, ind_blocks, rblocks;
vm_fault_t ret = VM_FAULT_LOCKED;
@@ -1120,14 +1120,16 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ret)
goto out_unlock;
- ret = file_update_time(file);
- if (ret)
- goto out_unlock;
-
if (iocb->ki_flags & IOCB_DIRECT) {
struct address_space *mapping = file->f_mapping;
ssize_t buffered, ret2;
+ /*
+ * Note that under direct I/O, we don't allow any inode
+ * timestamp updates, so we're not calling file_update_time()
+ * here.
+ */
+
ret = gfs2_file_direct_write(iocb, from, &gh);
if (ret < 0 || !iov_iter_count(from))
goto out_unlock;
@@ -1154,6 +1156,10 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (!ret || ret2 > 0)
ret += ret2;
} else {
+ ret = file_update_time(file);
+ if (ret)
+ goto out_unlock;
+
ret = gfs2_file_buffered_write(iocb, from, &gh);
if (likely(ret > 0))
ret = generic_write_sync(iocb, ret);
@@ -1245,7 +1251,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
struct inode *inode = file_inode(file);
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_alloc_parms ap = { .aflags = 0, };
+ struct gfs2_alloc_parms ap = {};
unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
loff_t bytes, max_bytes, max_blks;
int error;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 4a280be229a6..d6bf1f8c25dc 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1524,7 +1524,6 @@ fail:
return;
}
list_add_tail(&gh->gh_list, insert_pt);
- gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list);
spin_unlock(&gl->gl_lockref.lock);
if (sdp->sd_lockstruct.ls_ops->lm_cancel)
sdp->sd_lockstruct.ls_ops->lm_cancel(gl);
@@ -2041,11 +2040,7 @@ static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink,
return vfs_pressure_ratio(atomic_read(&lru_count));
}
-static struct shrinker glock_shrinker = {
- .seeks = DEFAULT_SEEKS,
- .count_objects = gfs2_glock_shrink_count,
- .scan_objects = gfs2_glock_shrink_scan,
-};
+static struct shrinker *glock_shrinker;
/**
* glock_hash_walk - Call a function for glock in a hash bucket
@@ -2465,13 +2460,18 @@ int __init gfs2_glock_init(void)
return -ENOMEM;
}
- ret = register_shrinker(&glock_shrinker, "gfs2-glock");
- if (ret) {
+ glock_shrinker = shrinker_alloc(0, "gfs2-glock");
+ if (!glock_shrinker) {
destroy_workqueue(glock_workqueue);
rhashtable_destroy(&gl_hash_table);
- return ret;
+ return -ENOMEM;
}
+ glock_shrinker->count_objects = gfs2_glock_shrink_count;
+ glock_shrinker->scan_objects = gfs2_glock_shrink_scan;
+
+ shrinker_register(glock_shrinker);
+
for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++)
init_waitqueue_head(glock_wait_table + i);
@@ -2480,7 +2480,7 @@ int __init gfs2_glock_init(void)
void gfs2_glock_exit(void)
{
- unregister_shrinker(&glock_shrinker);
+ shrinker_free(glock_shrinker);
rhashtable_destroy(&gl_hash_table);
destroy_workqueue(glock_workqueue);
}
@@ -2719,16 +2719,19 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i)
for(;; i->fd++) {
struct inode *inode;
- i->file = task_lookup_next_fd_rcu(i->task, &i->fd);
+ i->file = task_lookup_next_fdget_rcu(i->task, &i->fd);
if (!i->file) {
i->fd = 0;
break;
}
+
inode = file_inode(i->file);
- if (inode->i_sb != i->sb)
- continue;
- if (get_file_rcu(i->file))
+ if (inode->i_sb == i->sb)
break;
+
+ rcu_read_unlock();
+ fput(i->file);
+ rcu_read_lock();
}
rcu_read_unlock();
return i->file;
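
Illustrative sketch (not part of the patch): the dynamically allocated shrinker pattern the glock code switches to above. The example_* names and the no-op callbacks are placeholders; the flow is allocate, fill in the callbacks, register, and free on teardown.

#include <linux/shrinker.h>

static struct shrinker *example_shrinker;

static unsigned long example_count(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	return 0;	/* nothing to reclaim in this sketch */
}

static unsigned long example_scan(struct shrinker *shrink,
				  struct shrink_control *sc)
{
	return SHRINK_STOP;
}

static int __init example_shrinker_init(void)
{
	example_shrinker = shrinker_alloc(0, "example");
	if (!example_shrinker)
		return -ENOMEM;

	example_shrinker->count_objects = example_count;
	example_shrinker->scan_objects = example_scan;
	shrinker_register(example_shrinker);
	return 0;
}

static void example_shrinker_exit(void)
{
	shrinker_free(example_shrinker);
}
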
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index c8685ca7d2a2..61197598abfd 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -156,21 +156,6 @@ out:
return gh;
}
-static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
-{
- return gl->gl_state == LM_ST_EXCLUSIVE;
-}
-
-static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl)
-{
- return gl->gl_state == LM_ST_DEFERRED;
-}
-
-static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
-{
- return gl->gl_state == LM_ST_SHARED;
-}
-
static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
{
if (gl->gl_ops->go_flags & GLOF_ASPACE) {
@@ -181,40 +166,40 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
return NULL;
}
-extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
- const struct gfs2_glock_operations *glops,
- int create, struct gfs2_glock **glp);
-extern struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl);
-extern void gfs2_glock_put(struct gfs2_glock *gl);
-extern void gfs2_glock_queue_put(struct gfs2_glock *gl);
+int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
+ const struct gfs2_glock_operations *glops,
+ int create, struct gfs2_glock **glp);
+struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl);
+void gfs2_glock_put(struct gfs2_glock *gl);
+void gfs2_glock_queue_put(struct gfs2_glock *gl);
-extern void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
- u16 flags, struct gfs2_holder *gh,
- unsigned long ip);
+void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
+ u16 flags, struct gfs2_holder *gh,
+ unsigned long ip);
static inline void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
u16 flags, struct gfs2_holder *gh) {
__gfs2_holder_init(gl, state, flags, gh, _RET_IP_);
}
-extern void gfs2_holder_reinit(unsigned int state, u16 flags,
- struct gfs2_holder *gh);
-extern void gfs2_holder_uninit(struct gfs2_holder *gh);
-extern int gfs2_glock_nq(struct gfs2_holder *gh);
-extern int gfs2_glock_poll(struct gfs2_holder *gh);
-extern int gfs2_instantiate(struct gfs2_holder *gh);
-extern int gfs2_glock_holder_ready(struct gfs2_holder *gh);
-extern int gfs2_glock_wait(struct gfs2_holder *gh);
-extern int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs);
-extern void gfs2_glock_dq(struct gfs2_holder *gh);
-extern void gfs2_glock_dq_wait(struct gfs2_holder *gh);
-extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
-extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
- const struct gfs2_glock_operations *glops,
- unsigned int state, u16 flags,
- struct gfs2_holder *gh);
-extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
-extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
-extern void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl,
+void gfs2_holder_reinit(unsigned int state, u16 flags,
+ struct gfs2_holder *gh);
+void gfs2_holder_uninit(struct gfs2_holder *gh);
+int gfs2_glock_nq(struct gfs2_holder *gh);
+int gfs2_glock_poll(struct gfs2_holder *gh);
+int gfs2_instantiate(struct gfs2_holder *gh);
+int gfs2_glock_holder_ready(struct gfs2_holder *gh);
+int gfs2_glock_wait(struct gfs2_holder *gh);
+int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_glock_dq(struct gfs2_holder *gh);
+void gfs2_glock_dq_wait(struct gfs2_holder *gh);
+void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
+int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
+ const struct gfs2_glock_operations *glops,
+ unsigned int state, u16 flags,
+ struct gfs2_holder *gh);
+int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl,
bool fsid);
#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { \
gfs2_dump_glock(NULL, gl, true); \
@@ -228,7 +213,7 @@ extern void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl,
gfs2_assert_withdraw((gl)->gl_name.ln_sbd, (x)); } } \
while (0)
-extern __printf(2, 3)
+__printf(2, 3)
void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
/**
@@ -256,27 +241,27 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
return error;
}
-extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
-extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
-extern bool gfs2_queue_try_to_evict(struct gfs2_glock *gl);
-extern void gfs2_cancel_delete_work(struct gfs2_glock *gl);
-extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp);
-extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
-extern void gfs2_gl_dq_holders(struct gfs2_sbd *sdp);
-extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
-extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl);
-extern void gfs2_glock_free(struct gfs2_glock *gl);
+void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
+void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
+bool gfs2_queue_try_to_evict(struct gfs2_glock *gl);
+void gfs2_cancel_delete_work(struct gfs2_glock *gl);
+void gfs2_flush_delete_work(struct gfs2_sbd *sdp);
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
+void gfs2_gl_dq_holders(struct gfs2_sbd *sdp);
+void gfs2_glock_thaw(struct gfs2_sbd *sdp);
+void gfs2_glock_add_to_lru(struct gfs2_glock *gl);
+void gfs2_glock_free(struct gfs2_glock *gl);
-extern int __init gfs2_glock_init(void);
-extern void gfs2_glock_exit(void);
+int __init gfs2_glock_init(void);
+void gfs2_glock_exit(void);
-extern void gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
-extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
-extern void gfs2_register_debugfs(void);
-extern void gfs2_unregister_debugfs(void);
+void gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
+void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
+void gfs2_register_debugfs(void);
+void gfs2_unregister_debugfs(void);
-extern void glock_set_object(struct gfs2_glock *gl, void *object);
-extern void glock_clear_object(struct gfs2_glock *gl, void *object);
+void glock_set_object(struct gfs2_glock *gl, void *object);
+void glock_clear_object(struct gfs2_glock *gl, void *object);
extern const struct lm_lockops gfs2_dlm_ops;
@@ -295,7 +280,7 @@ static inline bool gfs2_holder_queued(struct gfs2_holder *gh)
return !list_empty(&gh->gh_list);
}
-extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation);
-extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation);
+void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation);
+bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation);
#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index f41ca89d216b..b41c78bd2cc0 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -403,7 +403,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
const struct gfs2_dinode *str = buf;
- struct timespec64 atime;
+ struct timespec64 atime, iatime;
u16 height, depth;
umode_t mode = be32_to_cpu(str->di_mode);
struct inode *inode = &ip->i_inode;
@@ -433,10 +433,11 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
gfs2_set_inode_blocks(inode, be64_to_cpu(str->di_blocks));
atime.tv_sec = be64_to_cpu(str->di_atime);
atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
- if (timespec64_compare(&inode->i_atime, &atime) < 0)
- inode->i_atime = atime;
- inode->i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
- inode->i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
+ iatime = inode_get_atime(inode);
+ if (timespec64_compare(&iatime, &atime) < 0)
+ inode_set_atime_to_ts(inode, atime);
+ inode_set_mtime(inode, be64_to_cpu(str->di_mtime),
+ be32_to_cpu(str->di_mtime_nsec));
inode_set_ctime(inode, be64_to_cpu(str->di_ctime),
be32_to_cpu(str->di_ctime_nsec));
@@ -614,18 +615,6 @@ static int freeze_go_xmote_bh(struct gfs2_glock *gl)
}
/**
- * freeze_go_demote_ok
- * @gl: the glock
- *
- * Always returns 0
- */
-
-static int freeze_go_demote_ok(const struct gfs2_glock *gl)
-{
- return 0;
-}
-
-/**
* iopen_go_callback - schedule the dcache entry for the inode to be deleted
* @gl: the glock
* @remote: true if this came from a different cluster node
@@ -744,7 +733,6 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
const struct gfs2_glock_operations gfs2_freeze_glops = {
.go_xmote_bh = freeze_go_xmote_bh,
- .go_demote_ok = freeze_go_demote_ok,
.go_callback = freeze_go_callback,
.go_type = LM_TYPE_NONDISK,
.go_flags = GLOF_NONDISK,
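
Illustrative sketch (not part of the patch): the atime merge done by gfs2_dinode_in() above, expressed with the new accessors. The in-memory atime is only moved forward, never back to an older on-disk value; merge_ondisk_atime() is a hypothetical name.

#include <linux/fs.h>

/* Adopt the on-disk atime only if it is newer than the cached one. */
static void merge_ondisk_atime(struct inode *inode, struct timespec64 disk_atime)
{
	struct timespec64 mem_atime = inode_get_atime(inode);

	if (timespec64_compare(&mem_atime, &disk_atime) < 0)
		inode_set_atime_to_ts(inode, disk_atime);
}
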
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index 695898afcaf1..9341423798df 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -22,7 +22,7 @@ extern const struct gfs2_glock_operations gfs2_quota_glops;
extern const struct gfs2_glock_operations gfs2_journal_glops;
extern const struct gfs2_glock_operations *gfs2_glops_list[];
-extern int gfs2_inode_metasync(struct gfs2_glock *gl);
-extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync);
+int gfs2_inode_metasync(struct gfs2_glock *gl);
+void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync);
#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index a8c95c5293c6..95a334d64da2 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -863,7 +863,7 @@ static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which)
preempt_enable();
}
-extern struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl);
+struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl);
static inline unsigned gfs2_max_stuffed_size(const struct gfs2_inode *ip)
{
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 0eac04507904..1b95db2c3aac 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -185,8 +185,9 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
set_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags);
/* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */
- inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1);
- inode->i_atime.tv_nsec = 0;
+ inode_set_atime(inode,
+ 1LL << (8 * sizeof(inode_get_atime_sec(inode)) - 1),
+ 0);
glock_set_object(ip->i_gl, ip);
@@ -265,17 +266,18 @@ fail_iput:
}
-struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
+/**
+ * gfs2_lookup_meta - Look up an inode in a metadata directory
+ * @dip: The directory
+ * @name: The name of the inode
+ */
+struct inode *gfs2_lookup_meta(struct inode *dip, const char *name)
{
struct qstr qstr;
struct inode *inode;
+
gfs2_str2qstr(&qstr, name);
inode = gfs2_lookupi(dip, &qstr, 1);
- /* gfs2_lookupi has inconsistent callers: vfs
- * related routines expect NULL for no entry found,
- * gfs2_lookup_simple callers expect ENOENT
- * and do not check for NULL.
- */
if (IS_ERR_OR_NULL(inode))
return inode ? inode : ERR_PTR(-ENOENT);
@@ -417,7 +419,7 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks)
if (error)
goto out_ipreserv;
- error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation);
+ error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1);
if (error)
goto out_trans_end;
@@ -696,7 +698,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
set_nlink(inode, S_ISDIR(mode) ? 2 : 1);
inode->i_rdev = dev;
inode->i_size = size;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
munge_mode_uid_gid(dip, inode);
check_and_update_goal(dip);
ip->i_goal = dip->i_goal;
@@ -1866,16 +1868,24 @@ out:
int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode,
int mask)
{
+ int may_not_block = mask & MAY_NOT_BLOCK;
struct gfs2_inode *ip;
struct gfs2_holder i_gh;
+ struct gfs2_glock *gl;
int error;
gfs2_holder_mark_uninitialized(&i_gh);
ip = GFS2_I(inode);
- if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
- if (mask & MAY_NOT_BLOCK)
+ gl = rcu_dereference_check(ip->i_gl, !may_not_block);
+ if (unlikely(!gl)) {
+ /* inode is getting torn down, must be RCU mode */
+ WARN_ON_ONCE(!may_not_block);
+ return -ECHILD;
+ }
+ if (gfs2_glock_is_locked_by_me(gl) == NULL) {
+ if (may_not_block)
return -ECHILD;
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+ error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
if (error)
return error;
}
@@ -1920,7 +1930,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
kuid_t ouid, nuid;
kgid_t ogid, ngid;
int error;
- struct gfs2_alloc_parms ap;
+ struct gfs2_alloc_parms ap = {};
ouid = inode->i_uid;
ogid = inode->i_gid;
@@ -2153,7 +2163,7 @@ static int gfs2_update_time(struct inode *inode, int flags)
int error;
gh = gfs2_glock_is_locked_by_me(gl);
- if (gh && !gfs2_glock_is_held_excl(gl)) {
+ if (gh && gl->gl_state != LM_ST_EXCLUSIVE) {
gfs2_glock_dq(gh);
gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, gh);
error = gfs2_glock_nq(gh);
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c8c5814e7295..fd15d1c6b6fb 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -13,9 +13,9 @@
#include "util.h"
bool gfs2_release_folio(struct folio *folio, gfp_t gfp_mask);
-extern int gfs2_internal_read(struct gfs2_inode *ip,
- char *buf, loff_t *pos, unsigned size);
-extern void gfs2_set_aops(struct inode *inode);
+ssize_t gfs2_internal_read(struct gfs2_inode *ip,
+ char *buf, loff_t *pos, size_t size);
+void gfs2_set_aops(struct inode *inode);
static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
{
@@ -44,19 +44,17 @@ static inline int gfs2_is_dir(const struct gfs2_inode *ip)
static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks)
{
- inode->i_blocks = blocks <<
- (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
+ inode->i_blocks = blocks << (inode->i_blkbits - 9);
}
static inline u64 gfs2_get_inode_blocks(const struct inode *inode)
{
- return inode->i_blocks >>
- (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
+ return inode->i_blocks >> (inode->i_blkbits - 9);
}
static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change)
{
- change <<= inode->i_blkbits - GFS2_BASIC_BLOCK_SHIFT;
+ change <<= inode->i_blkbits - 9;
gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks >= -change));
inode->i_blocks += change;
}
@@ -88,33 +86,33 @@ err:
return -EIO;
}
-extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
- u64 no_addr, u64 no_formal_ino,
- unsigned int blktype);
-extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
- u64 no_formal_ino,
- unsigned int blktype);
-
-extern int gfs2_inode_refresh(struct gfs2_inode *ip);
-
-extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
- int is_root);
-extern int gfs2_permission(struct mnt_idmap *idmap,
- struct inode *inode, int mask);
-extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
-extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
-extern int gfs2_open_common(struct inode *inode, struct file *file);
-extern loff_t gfs2_seek_data(struct file *file, loff_t offset);
-extern loff_t gfs2_seek_hole(struct file *file, loff_t offset);
+struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
+ u64 no_addr, u64 no_formal_ino,
+ unsigned int blktype);
+struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
+ u64 no_formal_ino,
+ unsigned int blktype);
+
+int gfs2_inode_refresh(struct gfs2_inode *ip);
+
+struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
+ int is_root);
+int gfs2_permission(struct mnt_idmap *idmap,
+ struct inode *inode, int mask);
+struct inode *gfs2_lookup_meta(struct inode *dip, const char *name);
+void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
+int gfs2_open_common(struct inode *inode, struct file *file);
+loff_t gfs2_seek_data(struct file *file, loff_t offset);
+loff_t gfs2_seek_hole(struct file *file, loff_t offset);
extern const struct file_operations gfs2_file_fops_nolock;
extern const struct file_operations gfs2_dir_fops_nolock;
-extern int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa);
-extern int gfs2_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
-extern void gfs2_set_inode_flags(struct inode *inode);
-
+int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int gfs2_fileattr_set(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct fileattr *fa);
+void gfs2_set_inode_flags(struct inode *inode);
+
#ifdef CONFIG_GFS2_FS_LOCKING_DLM
extern const struct file_operations gfs2_file_fops;
extern const struct file_operations gfs2_dir_fops;
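
Illustrative sketch (not part of the patch): the unit conversion behind the (i_blkbits - 9) shifts above. inode->i_blocks counts 512-byte sectors, so a filesystem block of 1 << i_blkbits bytes spans 1 << (i_blkbits - 9) sectors; with 4096-byte blocks (i_blkbits == 12), 10 blocks become 10 << 3 == 80 sectors. Standalone arithmetic only.

#include <stdint.h>

static uint64_t fs_blocks_to_sectors(uint64_t blocks, unsigned int blkbits)
{
	return blocks << (blkbits - 9);	/* 512-byte sectors per fs block */
}

static uint64_t sectors_to_fs_blocks(uint64_t sectors, unsigned int blkbits)
{
	return sectors >> (blkbits - 9);
}
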
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 653cffcbf869..c27b05099c1e 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -70,29 +70,29 @@ static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip)
}
}
-extern void gfs2_ordered_del_inode(struct gfs2_inode *ip);
-extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct);
-extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
-extern bool gfs2_log_is_empty(struct gfs2_sbd *sdp);
-extern void gfs2_log_release_revokes(struct gfs2_sbd *sdp, unsigned int revokes);
-extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
-extern bool gfs2_log_try_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
- unsigned int *extra_revokes);
-extern void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
- unsigned int *extra_revokes);
-extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
- u64 seq, u32 tail, u32 lblock, u32 flags,
- blk_opf_t op_flags);
-extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
- u32 type);
-extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
-extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc);
-extern void log_flush_wait(struct gfs2_sbd *sdp);
+void gfs2_ordered_del_inode(struct gfs2_inode *ip);
+unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct);
+void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
+bool gfs2_log_is_empty(struct gfs2_sbd *sdp);
+void gfs2_log_release_revokes(struct gfs2_sbd *sdp, unsigned int revokes);
+void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
+bool gfs2_log_try_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
+ unsigned int *extra_revokes);
+void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
+ unsigned int *extra_revokes);
+void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
+ u64 seq, u32 tail, u32 lblock, u32 flags,
+ blk_opf_t op_flags);
+void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
+ u32 type);
+void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
+void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc);
+void log_flush_wait(struct gfs2_sbd *sdp);
-extern int gfs2_logd(void *data);
-extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
-extern void gfs2_glock_remove_revoke(struct gfs2_glock *gl);
-extern void gfs2_flush_revokes(struct gfs2_sbd *sdp);
-extern void gfs2_ail_drain(struct gfs2_sbd *sdp);
+int gfs2_logd(void *data);
+void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
+void gfs2_glock_remove_revoke(struct gfs2_glock *gl);
+void gfs2_flush_revokes(struct gfs2_sbd *sdp);
+void gfs2_ail_drain(struct gfs2_sbd *sdp);
#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 1412ffba1d44..07890c7b145d 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -11,16 +11,18 @@
#include "incore.h"
extern const struct gfs2_log_operations *gfs2_log_ops[];
-extern void gfs2_log_incr_head(struct gfs2_sbd *sdp);
-extern u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lbn);
-extern void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
- struct page *page, unsigned size, unsigned offset,
- u64 blkno);
-extern void gfs2_log_submit_bio(struct bio **biop, blk_opf_t opf);
-extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
-extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
- struct gfs2_log_header_host *head, bool keep_cache);
-extern void gfs2_drain_revokes(struct gfs2_sbd *sdp);
+
+void gfs2_log_incr_head(struct gfs2_sbd *sdp);
+u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lbn);
+void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
+ struct page *page, unsigned size, unsigned offset,
+ u64 blkno);
+void gfs2_log_submit_bio(struct bio **biop, blk_opf_t opf);
+void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
+int gfs2_find_jhead(struct gfs2_jdesc *jd,
+ struct gfs2_log_header_host *head, bool keep_cache);
+void gfs2_drain_revokes(struct gfs2_sbd *sdp);
+
static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
{
return sdp->sd_ldptrs;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 66eb98b690a2..79be0cdc730c 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -147,7 +147,7 @@ static int __init init_gfs2_fs(void)
if (!gfs2_trans_cachep)
goto fail_cachep8;
- error = register_shrinker(&gfs2_qd_shrinker, "gfs2-qd");
+ error = gfs2_qd_shrinker_init();
if (error)
goto fail_shrinker;
@@ -196,7 +196,7 @@ fail_wq3:
fail_wq2:
destroy_workqueue(gfs2_recovery_wq);
fail_wq1:
- unregister_shrinker(&gfs2_qd_shrinker);
+ gfs2_qd_shrinker_exit();
fail_shrinker:
kmem_cache_destroy(gfs2_trans_cachep);
fail_cachep8:
@@ -229,7 +229,7 @@ fail_lru:
static void __exit exit_gfs2_fs(void)
{
- unregister_shrinker(&gfs2_qd_shrinker);
+ gfs2_qd_shrinker_exit();
gfs2_glock_exit();
gfs2_unregister_debugfs();
unregister_filesystem(&gfs2_fs_type);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 924361fa510b..25ceb0805df2 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -115,7 +115,7 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
{
struct address_space *mapping = gfs2_glock2aspace(gl);
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- struct page *page;
+ struct folio *folio;
struct buffer_head *bh;
unsigned int shift;
unsigned long index;
@@ -129,36 +129,31 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
bufnum = blkno - (index << shift); /* block buf index within page */
if (create) {
- for (;;) {
- page = grab_cache_page(mapping, index);
- if (page)
- break;
- yield();
- }
- if (!page_has_buffers(page))
- create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0);
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mapping_gfp_mask(mapping) | __GFP_NOFAIL);
+ bh = folio_buffers(folio);
+ if (!bh)
+ bh = create_empty_buffers(folio,
+ sdp->sd_sb.sb_bsize, 0);
} else {
- page = find_get_page_flags(mapping, index,
- FGP_LOCK|FGP_ACCESSED);
- if (!page)
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_ACCESSED, 0);
+ if (IS_ERR(folio))
return NULL;
- if (!page_has_buffers(page)) {
- bh = NULL;
- goto out_unlock;
- }
+ bh = folio_buffers(folio);
}
- /* Locate header for our buffer within our page */
- for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
- /* Do nothing */;
- get_bh(bh);
+ if (!bh)
+ goto out_unlock;
+ bh = get_nth_bh(bh, bufnum);
if (!buffer_mapped(bh))
map_bh(bh, sdp->sd_vfs, blkno);
out_unlock:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return bh;
}
@@ -405,26 +400,20 @@ static struct buffer_head *gfs2_getjdatabuf(struct gfs2_inode *ip, u64 blkno)
{
struct address_space *mapping = ip->i_inode.i_mapping;
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct page *page;
+ struct folio *folio;
struct buffer_head *bh;
unsigned int shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift;
unsigned long index = blkno >> shift; /* convert block to page */
unsigned int bufnum = blkno - (index << shift);
- page = find_get_page_flags(mapping, index, FGP_LOCK|FGP_ACCESSED);
- if (!page)
+ folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED, 0);
+ if (IS_ERR(folio))
return NULL;
- if (!page_has_buffers(page)) {
- unlock_page(page);
- put_page(page);
- return NULL;
- }
- /* Locate header for our buffer within our page */
- for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
- /* Do nothing */;
- get_bh(bh);
- unlock_page(page);
- put_page(page);
+ bh = folio_buffers(folio);
+ if (bh)
+ bh = get_nth_bh(bh, bufnum);
+ folio_unlock(folio);
+ folio_put(folio);
return bh;
}
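
Illustrative sketch (not part of the patch): the read-side lookup gfs2_getjdatabuf() now performs: find the folio without creating it, take the nth buffer head if buffers are attached, and always drop the folio lock and reference. lookup_nth_bh() is a hypothetical name.

#include <linux/buffer_head.h>
#include <linux/pagemap.h>

static struct buffer_head *lookup_nth_bh(struct address_space *mapping,
					 pgoff_t index, unsigned int bufnum)
{
	struct folio *folio;
	struct buffer_head *bh;

	folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED, 0);
	if (IS_ERR(folio))
		return NULL;

	bh = folio_buffers(folio);
	if (bh)
		bh = get_nth_bh(bh, bufnum);	/* takes its own reference */
	folio_unlock(folio);
	folio_put(folio);
	return bh;
}
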
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index d0a58cdd433a..831d988c2ceb 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -50,21 +50,21 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
return inode->i_sb->s_fs_info;
}
-extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
-extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
- int rahead, struct buffer_head **bhp);
-extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
-extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
- int create);
+struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
+int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
+ int rahead, struct buffer_head **bhp);
+int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
+struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
+ int create);
enum {
REMOVE_JDATA = 0,
REMOVE_META = 1,
};
-extern void gfs2_remove_from_journal(struct buffer_head *bh, int meta);
-extern void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
-extern int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num,
- struct buffer_head **bhp);
+void gfs2_remove_from_journal(struct buffer_head *bh, int meta);
+void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
+int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num,
+ struct buffer_head **bhp);
static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
struct buffer_head **bhp)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 33ca04733e93..b108c5d26839 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -292,8 +292,7 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
return error;
}
- sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
- GFS2_BASIC_BLOCK_SHIFT;
+ sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - 9;
sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift);
sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
sizeof(struct gfs2_dinode)) / sizeof(u64);
@@ -648,7 +647,7 @@ static int init_statfs(struct gfs2_sbd *sdp)
struct gfs2_jdesc *jd;
struct gfs2_inode *ip;
- sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
+ sdp->sd_statfs_inode = gfs2_lookup_meta(master, "statfs");
if (IS_ERR(sdp->sd_statfs_inode)) {
error = PTR_ERR(sdp->sd_statfs_inode);
fs_err(sdp, "can't read in statfs inode: %d\n", error);
@@ -657,7 +656,7 @@ static int init_statfs(struct gfs2_sbd *sdp)
if (sdp->sd_args.ar_spectator)
goto out;
- pn = gfs2_lookup_simple(master, "per_node");
+ pn = gfs2_lookup_meta(master, "per_node");
if (IS_ERR(pn)) {
error = PTR_ERR(pn);
fs_err(sdp, "can't find per_node directory: %d\n", error);
@@ -674,7 +673,7 @@ static int init_statfs(struct gfs2_sbd *sdp)
goto free_local;
}
sprintf(buf, "statfs_change%u", jd->jd_jid);
- lsi->si_sc_inode = gfs2_lookup_simple(pn, buf);
+ lsi->si_sc_inode = gfs2_lookup_meta(pn, buf);
if (IS_ERR(lsi->si_sc_inode)) {
error = PTR_ERR(lsi->si_sc_inode);
fs_err(sdp, "can't find local \"sc\" file#%u: %d\n",
@@ -739,7 +738,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
if (undo)
goto fail_statfs;
- sdp->sd_jindex = gfs2_lookup_simple(master, "jindex");
+ sdp->sd_jindex = gfs2_lookup_meta(master, "jindex");
if (IS_ERR(sdp->sd_jindex)) {
fs_err(sdp, "can't lookup journal index: %d\n", error);
return PTR_ERR(sdp->sd_jindex);
@@ -888,7 +887,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
goto fail;
/* Read in the resource index inode */
- sdp->sd_rindex = gfs2_lookup_simple(master, "rindex");
+ sdp->sd_rindex = gfs2_lookup_meta(master, "rindex");
if (IS_ERR(sdp->sd_rindex)) {
error = PTR_ERR(sdp->sd_rindex);
fs_err(sdp, "can't get resource index inode: %d\n", error);
@@ -897,7 +896,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
sdp->sd_rindex_uptodate = 0;
/* Read in the quota inode */
- sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota");
+ sdp->sd_quota_inode = gfs2_lookup_meta(master, "quota");
if (IS_ERR(sdp->sd_quota_inode)) {
error = PTR_ERR(sdp->sd_quota_inode);
fs_err(sdp, "can't get quota file inode: %d\n", error);
@@ -941,7 +940,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
if (undo)
goto fail_qc_gh;
- pn = gfs2_lookup_simple(master, "per_node");
+ pn = gfs2_lookup_meta(master, "per_node");
if (IS_ERR(pn)) {
error = PTR_ERR(pn);
fs_err(sdp, "can't find per_node directory: %d\n", error);
@@ -949,7 +948,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
}
sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
- sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf);
+ sdp->sd_qc_inode = gfs2_lookup_meta(pn, buf);
if (IS_ERR(sdp->sd_qc_inode)) {
error = PTR_ERR(sdp->sd_qc_inode);
fs_err(sdp, "can't find local \"qc\" file: %d\n", error);
@@ -1126,8 +1125,7 @@ static int init_threads(struct gfs2_sbd *sdp)
return 0;
fail:
- kthread_stop(sdp->sd_logd_process);
- put_task_struct(sdp->sd_logd_process);
+ kthread_stop_put(sdp->sd_logd_process);
sdp->sd_logd_process = NULL;
return error;
}
@@ -1135,13 +1133,11 @@ fail:
void gfs2_destroy_threads(struct gfs2_sbd *sdp)
{
if (sdp->sd_logd_process) {
- kthread_stop(sdp->sd_logd_process);
- put_task_struct(sdp->sd_logd_process);
+ kthread_stop_put(sdp->sd_logd_process);
sdp->sd_logd_process = NULL;
}
if (sdp->sd_quotad_process) {
- kthread_stop(sdp->sd_quotad_process);
- put_task_struct(sdp->sd_quotad_process);
+ kthread_stop_put(sdp->sd_quotad_process);
sdp->sd_quotad_process = NULL;
}
}
@@ -1190,10 +1186,9 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
/* Set up the buffer cache and fill in some fake block size values
to allow us to read-in the on-disk superblock. */
- sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK);
+ sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, 512);
sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits;
- sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
- GFS2_BASIC_BLOCK_SHIFT;
+ sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - 9;
sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift);
sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit;
@@ -1281,10 +1276,8 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
if (!sb_rdonly(sb)) {
error = init_threads(sdp);
- if (error) {
- gfs2_withdraw_delayed(sdp);
+ if (error)
goto fail_per_node;
- }
}
error = gfs2_freeze_lock_shared(sdp);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 171b2713d2e5..95dae7838b4e 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -196,13 +196,26 @@ static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc));
}
-struct shrinker gfs2_qd_shrinker = {
- .count_objects = gfs2_qd_shrink_count,
- .scan_objects = gfs2_qd_shrink_scan,
- .seeks = DEFAULT_SEEKS,
- .flags = SHRINKER_NUMA_AWARE,
-};
+static struct shrinker *gfs2_qd_shrinker;
+
+int __init gfs2_qd_shrinker_init(void)
+{
+ gfs2_qd_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "gfs2-qd");
+ if (!gfs2_qd_shrinker)
+ return -ENOMEM;
+
+ gfs2_qd_shrinker->count_objects = gfs2_qd_shrink_count;
+ gfs2_qd_shrinker->scan_objects = gfs2_qd_shrink_scan;
+
+ shrinker_register(gfs2_qd_shrinker);
+ return 0;
+}
+
+void gfs2_qd_shrinker_exit(void)
+{
+ shrinker_free(gfs2_qd_shrinker);
+}
static u64 qd2index(struct gfs2_quota_data *qd)
{
@@ -457,6 +470,17 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
(sync_gen && (qd->qd_sync_gen >= *sync_gen)))
return 0;
+ /*
+ * If qd_change is 0 it means a pending quota change was negated.
+ * We should not sync it, but we still have a qd reference and slot
+ * reference taken by gfs2_quota_change -> do_qc that need to be put.
+ */
+ if (!qd->qd_change && test_and_clear_bit(QDF_CHANGE, &qd->qd_flags)) {
+ slot_put(qd);
+ qd_put(qd);
+ return 0;
+ }
+
if (!lockref_get_not_dead(&qd->qd_lockref))
return 0;
@@ -736,7 +760,7 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index,
struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
struct inode *inode = &ip->i_inode;
struct address_space *mapping = inode->i_mapping;
- struct page *page;
+ struct folio *folio;
struct buffer_head *bh;
u64 blk;
unsigned bsize = sdp->sd_sb.sb_bsize, bnum = 0, boff = 0;
@@ -745,15 +769,15 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index,
blk = index << (PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift);
boff = off % bsize;
- page = grab_cache_page(mapping, index);
- if (!page)
- return -ENOMEM;
- if (!page_has_buffers(page))
- create_empty_buffers(page, bsize, 0);
+ folio = filemap_grab_folio(mapping, index);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ bh = folio_buffers(folio);
+ if (!bh)
+ bh = create_empty_buffers(folio, bsize, 0);
- bh = page_buffers(page);
- for(;;) {
- /* Find the beginning block within the page */
+ for (;;) {
+ /* Find the beginning block within the folio */
if (pg_off >= ((bnum * bsize) + bsize)) {
bh = bh->b_this_page;
bnum++;
@@ -766,9 +790,10 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index,
goto unlock_out;
/* If it's a newly allocated disk block, zero it */
if (buffer_new(bh))
- zero_user(page, bnum * bsize, bh->b_size);
+ folio_zero_range(folio, bnum * bsize,
+ bh->b_size);
}
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
set_buffer_uptodate(bh);
if (bh_read(bh, REQ_META | REQ_PRIO) < 0)
goto unlock_out;
@@ -784,17 +809,17 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index,
break;
}
- /* Write to the page, now that we have setup the buffer(s) */
- memcpy_to_page(page, off, buf, bytes);
- flush_dcache_page(page);
- unlock_page(page);
- put_page(page);
+ /* Write to the folio, now that we have set up the buffer(s) */
+ memcpy_to_folio(folio, off, buf, bytes);
+ flush_dcache_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
return 0;
unlock_out:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return -EIO;
}
@@ -886,7 +911,7 @@ static int gfs2_adjust_quota(struct gfs2_sbd *sdp, loff_t loc,
size = loc + sizeof(struct gfs2_quota);
if (size > inode->i_size)
i_size_write(inode, size);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
set_bit(QDF_REFRESH, &qd->qd_flags);
}
@@ -898,7 +923,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
{
struct gfs2_sbd *sdp = (*qda)->qd_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
- struct gfs2_alloc_parms ap = { .aflags = 0, };
+ struct gfs2_alloc_parms ap = {};
unsigned int data_blocks, ind_blocks;
struct gfs2_holder *ghs, i_gh;
unsigned int qx, x;
@@ -1072,8 +1097,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
u32 x;
int error;
- if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON &&
- sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET)
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return 0;
error = gfs2_quota_hold(ip, uid, gid);
@@ -1180,17 +1204,16 @@ void gfs2_quota_unlock(struct gfs2_inode *ip)
#define MAX_LINE 256
-static int print_message(struct gfs2_quota_data *qd, char *type)
+static void print_message(struct gfs2_quota_data *qd, char *type)
{
struct gfs2_sbd *sdp = qd->qd_sbd;
- if (sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET)
+ if (sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) {
fs_info(sdp, "quota %s for %s %u\n",
type,
(qd->qd_id.type == USRQUOTA) ? "user" : "group",
from_kqid(&init_user_ns, qd->qd_id));
-
- return 0;
+ }
}
/**
@@ -1260,7 +1283,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
* HZ)) {
quota_send_warning(qd->qd_id,
sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN);
- error = print_message(qd, "warning");
+ print_message(qd, "warning");
+ error = 0;
qd->qd_last_warn = jiffies;
}
}
@@ -1274,8 +1298,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
u32 x;
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- if ((sdp->sd_args.ar_quota != GFS2_QUOTA_ON &&
- sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) ||
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ||
gfs2_assert_warn(sdp, change))
return;
if (ip->i_diskflags & GFS2_DIF_SYSTEM)
@@ -1732,7 +1755,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
if (gfs2_is_stuffed(ip))
alloc_required = 1;
if (alloc_required) {
- struct gfs2_alloc_parms ap = { .aflags = 0, };
+ struct gfs2_alloc_parms ap = {};
gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
&data_blocks, &ind_blocks);
blocks = 1 + data_blocks + ind_blocks;
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 1429945215a0..f462d9cb3087 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -15,27 +15,27 @@ struct gfs2_sbd;
#define NO_UID_QUOTA_CHANGE INVALID_UID
#define NO_GID_QUOTA_CHANGE INVALID_GID
-extern int gfs2_qa_get(struct gfs2_inode *ip);
-extern void gfs2_qa_put(struct gfs2_inode *ip);
-extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
-extern void gfs2_quota_unhold(struct gfs2_inode *ip);
+int gfs2_qa_get(struct gfs2_inode *ip);
+void gfs2_qa_put(struct gfs2_inode *ip);
+int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
+void gfs2_quota_unhold(struct gfs2_inode *ip);
-extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
-extern void gfs2_quota_unlock(struct gfs2_inode *ip);
+int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
+void gfs2_quota_unlock(struct gfs2_inode *ip);
-extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
- struct gfs2_alloc_parms *ap);
-extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
- kuid_t uid, kgid_t gid);
+int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
+ struct gfs2_alloc_parms *ap);
+void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
+ kuid_t uid, kgid_t gid);
-extern int gfs2_quota_sync(struct super_block *sb, int type);
-extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid);
+int gfs2_quota_sync(struct super_block *sb, int type);
+int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid);
-extern int gfs2_quota_init(struct gfs2_sbd *sdp);
-extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
-extern int gfs2_quotad(void *data);
+int gfs2_quota_init(struct gfs2_sbd *sdp);
+void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
+int gfs2_quotad(void *data);
-extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp);
+void gfs2_wake_up_statfs(struct gfs2_sbd *sdp);
static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
struct gfs2_alloc_parms *ap)
@@ -50,8 +50,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (ret)
return ret;
- if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON &&
- sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET)
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT)
return 0;
ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, ap);
if (ret)
@@ -60,8 +59,10 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
}
extern const struct quotactl_ops gfs2_quotactl_ops;
-extern struct shrinker gfs2_qd_shrinker;
+int __init gfs2_qd_shrinker_init(void);
+void gfs2_qd_shrinker_exit(void);
extern struct list_lru gfs2_qd_lru;
-extern void __init gfs2_quota_hash_init(void);
+
+void __init gfs2_quota_hash_init(void);
#endif /* __QUOTA_DOT_H__ */
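The quota.h hunk above, and the recovery.h, rgrp.h, super.h, trans.h, util.h and xattr.h hunks that follow, are the same mechanical cleanup: the redundant "extern" keyword is dropped from function prototypes, since file-scope function declarations already have external linkage by default. A minimal stand-alone illustration (hypothetical function, not from the patch):

/* extern_demo.c - "extern" on a function prototype changes nothing. */
#include <stdio.h>

int add(int a, int b);		/* implicitly extern */
extern int add(int a, int b);	/* identical declaration; the keyword is redundant */

int add(int a, int b)
{
	return a + b;
}

int main(void)
{
	printf("%d\n", add(2, 3));	/* prints 5 either way */
	return 0;
}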
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 7a0c9d0b7503..6a0fd42e1120 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -17,18 +17,18 @@ static inline void gfs2_replay_incr_blk(struct gfs2_jdesc *jd, u32 *blk)
*blk = 0;
}
-extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
+int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
struct buffer_head **bh);
-extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
-extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
-extern void gfs2_revoke_clean(struct gfs2_jdesc *jd);
+int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
+int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
+void gfs2_revoke_clean(struct gfs2_jdesc *jd);
-extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait);
-extern void gfs2_recover_func(struct work_struct *work);
-extern int __get_log_header(struct gfs2_sbd *sdp,
- const struct gfs2_log_header *lh, unsigned int blkno,
- struct gfs2_log_header_host *head);
+int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait);
+void gfs2_recover_func(struct work_struct *work);
+int __get_log_header(struct gfs2_sbd *sdp,
+ const struct gfs2_log_header *lh, unsigned int blkno,
+ struct gfs2_log_header_host *head);
#endif /* __RECOVERY_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 9308190895c8..c2060203b98a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -2411,13 +2411,12 @@ static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
* @bn: Used to return the starting block number
* @nblocks: requested number of blocks/extent length (value/result)
* @dinode: 1 if we're allocating a dinode block, else 0
- * @generation: the generation number of the inode
*
* Returns: 0 or error
*/
int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
- bool dinode, u64 *generation)
+ bool dinode)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct buffer_head *dibh;
@@ -2477,10 +2476,13 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
rbm.rgd->rd_free -= *nblocks;
spin_unlock(&rbm.rgd->rd_rsspin);
if (dinode) {
+ u64 generation;
+
rbm.rgd->rd_dinodes++;
- *generation = rbm.rgd->rd_igeneration++;
- if (*generation == 0)
- *generation = rbm.rgd->rd_igeneration++;
+ generation = rbm.rgd->rd_igeneration++;
+ if (generation == 0)
+ generation = rbm.rgd->rd_igeneration++;
+ ip->i_generation = generation;
}
gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
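In the rgrp.c hunk above, the generation out-parameter of gfs2_alloc_blocks() is folded into the allocator: when a dinode is allocated, the resource group's generation counter is consumed and stored directly in ip->i_generation, which is why the xattr.c callers later in this patch simply drop their trailing NULL argument. A simplified user-space sketch of the resulting pattern (hypothetical types, not GFS2 code):

#include <stdio.h>

struct rgrp  { unsigned long long igeneration; };
struct inode { unsigned long long i_generation; };

/* The allocator now stores the generation in the inode it belongs to,
 * instead of returning it through a pointer the caller may not want. */
static void alloc_blocks(struct rgrp *rgd, struct inode *ip, int dinode)
{
	if (dinode) {
		unsigned long long generation = rgd->igeneration++;

		if (generation == 0)		/* skip 0, mirroring the check in the patch */
			generation = rgd->igeneration++;
		ip->i_generation = generation;
	}
}

int main(void)
{
	struct rgrp rgd = { 0 };
	struct inode ip = { 0 };

	alloc_blocks(&rgd, &ip, 1);
	printf("generation %llu\n", ip.i_generation);	/* 1: the value 0 was skipped */
	return 0;
}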
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 00b30cf893af..8d20e99385db 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -22,38 +22,38 @@ struct gfs2_rgrpd;
struct gfs2_sbd;
struct gfs2_holder;
-extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
+void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
-extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact);
-extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
-extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
+struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact);
+struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
+struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
-extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
-extern int gfs2_rindex_update(struct gfs2_sbd *sdp);
-extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
-extern int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl);
-extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd);
+void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
+int gfs2_rindex_update(struct gfs2_sbd *sdp);
+void gfs2_free_clones(struct gfs2_rgrpd *rgd);
+int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl);
+void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd);
-extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
+struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
#define GFS2_AF_ORLOV 1
-extern int gfs2_inplace_reserve(struct gfs2_inode *ip,
- struct gfs2_alloc_parms *ap);
-extern void gfs2_inplace_release(struct gfs2_inode *ip);
-
-extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
- bool dinode, u64 *generation);
-
-extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
-extern void gfs2_rs_delete(struct gfs2_inode *ip);
-extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
- u64 bstart, u32 blen, int meta);
-extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
- u64 bstart, u32 blen);
-extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
-extern void gfs2_unlink_di(struct inode *inode);
-extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr,
- unsigned int type);
+int gfs2_inplace_reserve(struct gfs2_inode *ip,
+ struct gfs2_alloc_parms *ap);
+void gfs2_inplace_release(struct gfs2_inode *ip);
+
+int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
+ bool dinode);
+
+void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
+void gfs2_rs_delete(struct gfs2_inode *ip);
+void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
+ u64 bstart, u32 blen, int meta);
+void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
+ u64 bstart, u32 blen);
+void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
+void gfs2_unlink_di(struct inode *inode);
+int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr,
+ unsigned int type);
struct gfs2_rgrp_list {
unsigned int rl_rgrps;
@@ -62,18 +62,19 @@ struct gfs2_rgrp_list {
struct gfs2_holder *rl_ghs;
};
-extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
- u64 block);
-extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist,
- unsigned int state, u16 flags);
-extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
-extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
-extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
- const char *fs_id_buf);
-extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
- struct buffer_head *bh,
- const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
-extern int gfs2_fitrim(struct file *filp, void __user *argp);
+void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
+ u64 block);
+void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist,
+ unsigned int state, u16 flags);
+void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
+u64 gfs2_ri_total(struct gfs2_sbd *sdp);
+void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
+ const char *fs_id_buf);
+int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
+ struct buffer_head *bh,
+ const struct gfs2_bitmap *bi, unsigned minlen,
+ u64 *ptrimmed);
+int gfs2_fitrim(struct file *filp, void __user *argp);
/* This is how to tell if a reservation is in the rgrp tree: */
static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs)
@@ -88,9 +89,9 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
return first <= block && block < last;
}
-extern void check_and_update_goal(struct gfs2_inode *ip);
+void check_and_update_goal(struct gfs2_inode *ip);
-extern void rgrp_lock_local(struct gfs2_rgrpd *rgd);
-extern void rgrp_unlock_local(struct gfs2_rgrpd *rgd);
+void rgrp_lock_local(struct gfs2_rgrpd *rgd);
+void rgrp_unlock_local(struct gfs2_rgrpd *rgd);
#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 02d93da21b2b..d21c04a22d73 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -410,9 +410,9 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_nlink = cpu_to_be32(inode->i_nlink);
str->di_size = cpu_to_be64(i_size_read(inode));
str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(inode));
- str->di_atime = cpu_to_be64(inode->i_atime.tv_sec);
- str->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec);
- str->di_ctime = cpu_to_be64(inode_get_ctime(inode).tv_sec);
+ str->di_atime = cpu_to_be64(inode_get_atime_sec(inode));
+ str->di_mtime = cpu_to_be64(inode_get_mtime_sec(inode));
+ str->di_ctime = cpu_to_be64(inode_get_ctime_sec(inode));
str->di_goal_meta = cpu_to_be64(ip->i_goal);
str->di_goal_data = cpu_to_be64(ip->i_goal);
@@ -427,9 +427,9 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_entries = cpu_to_be32(ip->i_entries);
str->di_eattr = cpu_to_be64(ip->i_eattr);
- str->di_atime_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
- str->di_mtime_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
- str->di_ctime_nsec = cpu_to_be32(inode_get_ctime(inode).tv_nsec);
+ str->di_atime_nsec = cpu_to_be32(inode_get_atime_nsec(inode));
+ str->di_mtime_nsec = cpu_to_be32(inode_get_mtime_nsec(inode));
+ str->di_ctime_nsec = cpu_to_be32(inode_get_ctime_nsec(inode));
}
/**
@@ -602,13 +602,15 @@ restart:
}
spin_unlock(&sdp->sd_jindex_spin);
- if (!sb_rdonly(sb)) {
+ if (!sb_rdonly(sb))
gfs2_make_fs_ro(sdp);
- }
- if (gfs2_withdrawn(sdp)) {
- gfs2_destroy_threads(sdp);
+ else {
+ if (gfs2_withdrawn(sdp))
+ gfs2_destroy_threads(sdp);
+
gfs2_quota_cleanup(sdp);
}
+
WARN_ON(gfs2_withdrawing(sdp));
/* At this point, we're through modifying the disk */
@@ -1006,6 +1008,7 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = sc.sc_dinodes + sc.sc_free;
buf->f_ffree = sc.sc_free;
buf->f_namelen = GFS2_FNAMESIZE;
+ buf->f_fsid = uuid_to_fsid(sb->s_uuid.b);
return 0;
}
@@ -1299,18 +1302,8 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode)
* As a last resort, if another node keeps holding the iopen glock
* without showing any activity on the inode glock, we will eventually
* time out and fail the iopen glock upgrade.
- *
- * Note that we're passing the LM_FLAG_TRY_1CB flag to the first
- * locking request as an optimization to notify lock holders as soon as
- * possible. Without that flag, they'd be notified implicitly by the
- * second locking request.
*/
- gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, gh);
- error = gfs2_glock_nq(gh);
- if (error != GLR_TRYFAILED)
- return !error;
-
gfs2_holder_reinit(LM_ST_EXCLUSIVE, GL_ASYNC | GL_NOCACHE, gh);
error = gfs2_glock_nq(gh);
if (error)
@@ -1550,7 +1543,7 @@ out:
wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE);
gfs2_glock_add_to_lru(ip->i_gl);
gfs2_glock_put_eventually(ip->i_gl);
- ip->i_gl = NULL;
+ rcu_assign_pointer(ip->i_gl, NULL);
}
}
@@ -1576,7 +1569,7 @@ static void gfs2_free_inode(struct inode *inode)
kmem_cache_free(gfs2_inode_cachep, GFS2_I(inode));
}
-extern void free_local_statfs_inodes(struct gfs2_sbd *sdp)
+void free_local_statfs_inodes(struct gfs2_sbd *sdp)
{
struct local_statfs_inode *lsi, *safe;
@@ -1591,8 +1584,8 @@ extern void free_local_statfs_inodes(struct gfs2_sbd *sdp)
}
}
-extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp,
- unsigned int index)
+struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp,
+ unsigned int index)
{
struct local_statfs_inode *lsi;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index ab9c83106932..b27a774d9580 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -15,7 +15,7 @@
#define GFS2_FS_FORMAT_MIN (1801)
#define GFS2_FS_FORMAT_MAX (1802)
-extern void gfs2_lm_unmount(struct gfs2_sbd *sdp);
+void gfs2_lm_unmount(struct gfs2_sbd *sdp);
static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
{
@@ -26,33 +26,33 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
return x;
}
-extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
+void gfs2_jindex_free(struct gfs2_sbd *sdp);
-extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
-extern int gfs2_jdesc_check(struct gfs2_jdesc *jd);
-extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
- struct gfs2_inode **ipp);
+struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
+int gfs2_jdesc_check(struct gfs2_jdesc *jd);
+int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
+ struct gfs2_inode **ipp);
-extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
-extern void gfs2_make_fs_ro(struct gfs2_sbd *sdp);
-extern void gfs2_online_uevent(struct gfs2_sbd *sdp);
-extern void gfs2_destroy_threads(struct gfs2_sbd *sdp);
-extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
-extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
- s64 dinodes);
-extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
- const void *buf);
-extern void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc,
- void *buf);
-extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh);
-extern int gfs2_statfs_sync(struct super_block *sb, int type);
-extern void gfs2_freeze_func(struct work_struct *work);
-extern void gfs2_thaw_freeze_initiator(struct super_block *sb);
+int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
+void gfs2_make_fs_ro(struct gfs2_sbd *sdp);
+void gfs2_online_uevent(struct gfs2_sbd *sdp);
+void gfs2_destroy_threads(struct gfs2_sbd *sdp);
+int gfs2_statfs_init(struct gfs2_sbd *sdp);
+void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
+ s64 dinodes);
+void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc,
+ const void *buf);
+void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc,
+ void *buf);
+void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh);
+int gfs2_statfs_sync(struct super_block *sb, int type);
+void gfs2_freeze_func(struct work_struct *work);
+void gfs2_thaw_freeze_initiator(struct super_block *sb);
-extern void free_local_statfs_inodes(struct gfs2_sbd *sdp);
-extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp,
- unsigned int index);
-extern void free_sbd(struct gfs2_sbd *sdp);
+void free_local_statfs_inodes(struct gfs2_sbd *sdp);
+struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp,
+ unsigned int index);
+void free_sbd(struct gfs2_sbd *sdp);
extern struct file_system_type gfs2_fs_type;
extern struct file_system_type gfs2meta_fs_type;
@@ -60,8 +60,8 @@ extern const struct export_operations gfs2_export_ops;
extern const struct super_operations gfs2_super_ops;
extern const struct dentry_operations gfs2_dops;
-extern const struct xattr_handler *gfs2_xattr_handlers_max[];
-extern const struct xattr_handler **gfs2_xattr_handlers_min;
+extern const struct xattr_handler * const gfs2_xattr_handlers_max[];
+extern const struct xattr_handler * const *gfs2_xattr_handlers_min;
#endif /* __SUPER_DOT_H__ */
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index c76ad9a4c75a..f8ce5302280d 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -34,17 +34,17 @@ static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip, unsigned
return rgd->rd_length;
}
-extern int __gfs2_trans_begin(struct gfs2_trans *tr, struct gfs2_sbd *sdp,
- unsigned int blocks, unsigned int revokes,
- unsigned long ip);
-extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
- unsigned int revokes);
-
-extern void gfs2_trans_end(struct gfs2_sbd *sdp);
-extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh);
-extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh);
-extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
-extern void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
-extern void gfs2_trans_free(struct gfs2_sbd *sdp, struct gfs2_trans *tr);
+int __gfs2_trans_begin(struct gfs2_trans *tr, struct gfs2_sbd *sdp,
+ unsigned int blocks, unsigned int revokes,
+ unsigned long ip);
+int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
+ unsigned int revokes);
+
+void gfs2_trans_end(struct gfs2_sbd *sdp);
+void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh);
+void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh);
+void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
+void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
+void gfs2_trans_free(struct gfs2_sbd *sdp, struct gfs2_trans *tr);
#endif /* __TRANS_DOT_H__ */
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index cdb839529175..11c9d59b6889 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -147,10 +147,10 @@ static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type,
int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
char *file, unsigned int line);
-extern int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
- bool verbose);
-extern int gfs2_freeze_lock_shared(struct gfs2_sbd *sdp);
-extern void gfs2_freeze_unlock(struct gfs2_holder *freeze_gh);
+int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
+ bool verbose);
+int gfs2_freeze_lock_shared(struct gfs2_sbd *sdp);
+void gfs2_freeze_unlock(struct gfs2_holder *freeze_gh);
#define gfs2_io_error(sdp) \
gfs2_io_error_i((sdp), __func__, __FILE__, __LINE__)
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 4fea70c0fe3d..8c96ba6230d1 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -639,7 +639,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
u64 block;
int error;
- error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
+ error = gfs2_alloc_blocks(ip, &block, &n, 0);
if (error)
return error;
gfs2_trans_remove_revoke(sdp, block, 1);
@@ -701,7 +701,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
int mh_size = sizeof(struct gfs2_meta_header);
unsigned int n = 1;
- error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
+ error = gfs2_alloc_blocks(ip, &block, &n, 0);
if (error)
return error;
gfs2_trans_remove_revoke(sdp, block, 1);
@@ -1002,7 +1002,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
} else {
u64 blk;
unsigned int n = 1;
- error = gfs2_alloc_blocks(ip, &blk, &n, 0, NULL);
+ error = gfs2_alloc_blocks(ip, &blk, &n, 0);
if (error)
return error;
gfs2_trans_remove_revoke(sdp, blk, 1);
@@ -1494,7 +1494,7 @@ static const struct xattr_handler gfs2_xattr_trusted_handler = {
.set = gfs2_xattr_set,
};
-const struct xattr_handler *gfs2_xattr_handlers_max[] = {
+const struct xattr_handler * const gfs2_xattr_handlers_max[] = {
/* GFS2_FS_FORMAT_MAX */
&gfs2_xattr_trusted_handler,
@@ -1504,4 +1504,4 @@ const struct xattr_handler *gfs2_xattr_handlers_max[] = {
NULL,
};
-const struct xattr_handler **gfs2_xattr_handlers_min = gfs2_xattr_handlers_max + 1;
+const struct xattr_handler * const *gfs2_xattr_handlers_min = gfs2_xattr_handlers_max + 1;
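The gfs2 xattr table change above (and the matching hfs and hfsplus hunks below) only tightens const-ness: the handler tables become arrays of const pointers to const objects, so neither a table slot nor the handler it points to can be modified through these names. A tiny stand-alone illustration of what each const covers (hypothetical struct, not the kernel one):

#include <stdio.h>

struct handler { const char *prefix; };

static const struct handler trusted = { "trusted." };
static const struct handler user    = { "user." };

/* Array of const pointers to const handlers: fully read-only. */
static const struct handler *const handlers[] = { &trusted, &user, NULL };

int main(void)
{
	/* handlers[0] = &user;        would not compile: the slot is const    */
	/* handlers[0]->prefix = "x";  would not compile: the pointee is const */
	for (const struct handler *const *h = handlers; *h; h++)
		printf("%s\n", (*h)->prefix);
	return 0;
}

The gfs2_xattr_handlers_min = gfs2_xattr_handlers_max + 1 assignment in the hunk works the same way with either declaration; it simply starts the table one slot later, skipping the first entry (the one tagged GFS2_FS_FORMAT_MAX).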
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
index 2aed9d7d483d..eb12eb7e37c1 100644
--- a/fs/gfs2/xattr.h
+++ b/fs/gfs2/xattr.h
@@ -50,14 +50,14 @@ struct gfs2_ea_location {
struct gfs2_ea_header *el_prev;
};
-extern int __gfs2_xattr_set(struct inode *inode, const char *name,
- const void *value, size_t size,
- int flags, int type);
-extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
-extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
+int __gfs2_xattr_set(struct inode *inode, const char *name,
+ const void *value, size_t size,
+ int flags, int type);
+ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
+int gfs2_ea_dealloc(struct gfs2_inode *ip);
/* Exported to acl.c */
-extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data);
+int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data);
#endif /* __EATTR_DOT_H__ */
diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c
index 6341bb248247..f8395cdd1adf 100644
--- a/fs/hfs/attr.c
+++ b/fs/hfs/attr.c
@@ -146,7 +146,7 @@ static const struct xattr_handler hfs_type_handler = {
.set = hfs_xattr_set,
};
-const struct xattr_handler *hfs_xattr_handlers[] = {
+const struct xattr_handler * const hfs_xattr_handlers[] = {
&hfs_creator_handler,
&hfs_type_handler,
NULL
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index 632c226a3972..d63880e7d9d6 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -133,7 +133,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i
goto err1;
dir->i_size++;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
hfs_find_exit(&fd);
return 0;
@@ -269,7 +269,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str)
}
dir->i_size--;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
res = 0;
out:
@@ -337,7 +337,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
if (err)
goto out;
dst_dir->i_size++;
- dst_dir->i_mtime = inode_set_ctime_current(dst_dir);
+ inode_set_mtime_to_ts(dst_dir, inode_set_ctime_current(dst_dir));
mark_inode_dirty(dst_dir);
/* finally remove the old entry */
@@ -349,7 +349,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
if (err)
goto out;
src_dir->i_size--;
- src_dir->i_mtime = inode_set_ctime_current(src_dir);
+ inode_set_mtime_to_ts(src_dir, inode_set_ctime_current(src_dir));
mark_inode_dirty(src_dir);
type = entry.type;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 49d02524e667..b5a6ad5df357 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -215,7 +215,7 @@ extern void hfs_evict_inode(struct inode *);
extern void hfs_delete_inode(struct inode *);
/* attr.c */
-extern const struct xattr_handler *hfs_xattr_handlers[];
+extern const struct xattr_handler * const hfs_xattr_handlers[];
/* mdb.c */
extern int hfs_mdb_get(struct super_block *);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index ee349b72cfb3..a7bc4690a780 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -200,7 +200,7 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
set_nlink(inode, 1);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
HFS_I(inode)->flags = 0;
HFS_I(inode)->rsrc_inode = NULL;
HFS_I(inode)->fs_blocks = 0;
@@ -355,8 +355,8 @@ static int hfs_read_inode(struct inode *inode, void *data)
inode->i_mode |= S_IWUGO;
inode->i_mode &= ~hsb->s_file_umask;
inode->i_mode |= S_IFREG;
- inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode,
- hfs_m_to_utime(rec->file.MdDat));
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, hfs_m_to_utime(rec->file.MdDat))));
inode->i_op = &hfs_file_inode_operations;
inode->i_fop = &hfs_file_operations;
inode->i_mapping->a_ops = &hfs_aops;
@@ -366,8 +366,8 @@ static int hfs_read_inode(struct inode *inode, void *data)
inode->i_size = be16_to_cpu(rec->dir.Val) + 2;
HFS_I(inode)->fs_blocks = 0;
inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask);
- inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode,
- hfs_m_to_utime(rec->dir.MdDat));
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, hfs_m_to_utime(rec->dir.MdDat))));
inode->i_op = &hfs_dir_inode_operations;
inode->i_fop = &hfs_dir_operations;
break;
@@ -474,7 +474,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
be32_to_cpu(rec.dir.DirID) != inode->i_ino) {
}
- rec.dir.MdDat = hfs_u_to_mtime(inode->i_mtime);
+ rec.dir.MdDat = hfs_u_to_mtime(inode_get_mtime(inode));
rec.dir.Val = cpu_to_be16(inode->i_size - 2);
hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
@@ -502,7 +502,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
else
rec.file.Flags |= HFS_FIL_LOCK;
hfs_inode_write_fork(inode, rec.file.ExtRec, &rec.file.LgLen, &rec.file.PyLen);
- rec.file.MdDat = hfs_u_to_mtime(inode->i_mtime);
+ rec.file.MdDat = hfs_u_to_mtime(inode_get_mtime(inode));
hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
@@ -654,7 +654,7 @@ int hfs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
truncate_setsize(inode, attr->ia_size);
hfs_file_truncate(inode);
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
setattr_copy(&nop_mnt_idmap, inode, attr);
diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c
index dc27d418fbcd..76fa02e3835b 100644
--- a/fs/hfs/sysdep.c
+++ b/fs/hfs/sysdep.c
@@ -28,11 +28,13 @@ static int hfs_revalidate_dentry(struct dentry *dentry, unsigned int flags)
/* fix up inode on a timezone change */
diff = sys_tz.tz_minuteswest * 60 - HFS_I(inode)->tz_secondswest;
if (diff) {
- struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 ts = inode_get_ctime(inode);
- inode_set_ctime(inode, ctime.tv_sec + diff, ctime.tv_nsec);
- inode->i_atime.tv_sec += diff;
- inode->i_mtime.tv_sec += diff;
+ inode_set_ctime(inode, ts.tv_sec + diff, ts.tv_nsec);
+ ts = inode_get_atime(inode);
+ inode_set_atime(inode, ts.tv_sec + diff, ts.tv_nsec);
+ ts = inode_get_mtime(inode);
+ inode_set_mtime(inode, ts.tv_sec + diff, ts.tv_nsec);
HFS_I(inode)->tz_secondswest += diff;
}
return 1;
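The hfs, hfsplus, hpfs and isofs hunks in this stretch are one conversion applied over and over: direct reads and writes of inode->i_atime and inode->i_mtime are replaced with accessor helpers (inode_get_mtime(), inode_set_mtime_to_ts(), inode_set_atime(), and the *_sec/*_nsec variants), matching what was already done for ctime. A minimal user-space analogue of the accessor pattern, with a chained set like the ones used in hfs_read_inode() above (illustrative names, not the kernel API):

#include <stdio.h>
#include <time.h>

struct my_inode {
	struct timespec atime;	/* treated as private: only touched via helpers */
	struct timespec mtime;
};

static struct timespec my_get_mtime(const struct my_inode *inode)
{
	return inode->mtime;
}

static struct timespec my_set_mtime(struct my_inode *inode, struct timespec ts)
{
	inode->mtime = ts;
	return ts;		/* returning ts is what makes chained sets possible */
}

static struct timespec my_set_atime(struct my_inode *inode, struct timespec ts)
{
	inode->atime = ts;
	return ts;
}

int main(void)
{
	struct my_inode inode = { { 0, 0 }, { 0, 0 } };
	struct timespec now;

	clock_gettime(CLOCK_REALTIME, &now);
	/* mtime = atime = now in one expression, like
	 * inode_set_mtime_to_ts(inode, inode_set_atime_to_ts(inode, ...)). */
	my_set_mtime(&inode, my_set_atime(&inode, now));
	printf("mtime %lld\n", (long long)my_get_mtime(&inode).tv_sec);
	return 0;
}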
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index e71ae2537eaa..1995bafee839 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -312,7 +312,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
dir->i_size++;
if (S_ISDIR(inode->i_mode))
hfsplus_subfolders_inc(dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
hfs_find_exit(&fd);
@@ -417,7 +417,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str)
dir->i_size--;
if (type == HFSPLUS_FOLDER)
hfsplus_subfolders_dec(dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
if (type == HFSPLUS_FILE || type == HFSPLUS_FOLDER) {
@@ -494,7 +494,7 @@ int hfsplus_rename_cat(u32 cnid,
dst_dir->i_size++;
if (type == HFSPLUS_FOLDER)
hfsplus_subfolders_inc(dst_dir);
- dst_dir->i_mtime = inode_set_ctime_current(dst_dir);
+ inode_set_mtime_to_ts(dst_dir, inode_set_ctime_current(dst_dir));
/* finally remove the old entry */
err = hfsplus_cat_build_key(sb, src_fd.search_key,
@@ -511,7 +511,7 @@ int hfsplus_rename_cat(u32 cnid,
src_dir->i_size--;
if (type == HFSPLUS_FOLDER)
hfsplus_subfolders_dec(src_dir);
- src_dir->i_mtime = inode_set_ctime_current(src_dir);
+ inode_set_mtime_to_ts(src_dir, inode_set_ctime_current(src_dir));
/* remove old thread entry */
hfsplus_cat_build_key_with_cnid(sb, src_fd.search_key, cnid);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index c65c8c4b03dd..702a0663b1d8 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -267,7 +267,7 @@ static int hfsplus_setattr(struct mnt_idmap *idmap,
}
truncate_setsize(inode, attr->ia_size);
hfsplus_file_truncate(inode);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
}
setattr_copy(&nop_mnt_idmap, inode, attr);
@@ -392,7 +392,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir,
inode->i_ino = sbi->next_cnid++;
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
set_nlink(inode, 1);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
hip = HFSPLUS_I(inode);
INIT_LIST_HEAD(&hip->open_dir_list);
@@ -521,8 +521,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
hfsplus_get_perms(inode, &folder->permissions, 1);
set_nlink(inode, 1);
inode->i_size = 2 + be32_to_cpu(folder->valence);
- inode->i_atime = hfsp_mt2ut(folder->access_date);
- inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
+ inode_set_atime_to_ts(inode, hfsp_mt2ut(folder->access_date));
+ inode_set_mtime_to_ts(inode,
+ hfsp_mt2ut(folder->content_mod_date));
inode_set_ctime_to_ts(inode,
hfsp_mt2ut(folder->attribute_mod_date));
HFSPLUS_I(inode)->create_date = folder->create_date;
@@ -563,8 +564,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
init_special_inode(inode, inode->i_mode,
be32_to_cpu(file->permissions.dev));
}
- inode->i_atime = hfsp_mt2ut(file->access_date);
- inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
+ inode_set_atime_to_ts(inode, hfsp_mt2ut(file->access_date));
+ inode_set_mtime_to_ts(inode,
+ hfsp_mt2ut(file->content_mod_date));
inode_set_ctime_to_ts(inode,
hfsp_mt2ut(file->attribute_mod_date));
HFSPLUS_I(inode)->create_date = file->create_date;
@@ -609,8 +611,8 @@ int hfsplus_cat_write_inode(struct inode *inode)
sizeof(struct hfsplus_cat_folder));
/* simple node checks? */
hfsplus_cat_set_perms(inode, &folder->permissions);
- folder->access_date = hfsp_ut2mt(inode->i_atime);
- folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
+ folder->access_date = hfsp_ut2mt(inode_get_atime(inode));
+ folder->content_mod_date = hfsp_ut2mt(inode_get_mtime(inode));
folder->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode));
folder->valence = cpu_to_be32(inode->i_size - 2);
if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) {
@@ -644,8 +646,8 @@ int hfsplus_cat_write_inode(struct inode *inode)
file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
else
file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED);
- file->access_date = hfsp_ut2mt(inode->i_atime);
- file->content_mod_date = hfsp_ut2mt(inode->i_mtime);
+ file->access_date = hfsp_ut2mt(inode_get_atime(inode));
+ file->content_mod_date = hfsp_ut2mt(inode_get_mtime(inode));
file->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode));
hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
sizeof(struct hfsplus_cat_file));
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 58021e73c00b..9c9ff6b8c6f7 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -13,7 +13,7 @@
static int hfsplus_removexattr(struct inode *inode, const char *name);
-const struct xattr_handler *hfsplus_xattr_handlers[] = {
+const struct xattr_handler * const hfsplus_xattr_handlers[] = {
&hfsplus_xattr_osx_handler,
&hfsplus_xattr_user_handler,
&hfsplus_xattr_trusted_handler,
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
index d14e362b3eba..15cc55e41410 100644
--- a/fs/hfsplus/xattr.h
+++ b/fs/hfsplus/xattr.h
@@ -17,7 +17,7 @@ extern const struct xattr_handler hfsplus_xattr_user_handler;
extern const struct xattr_handler hfsplus_xattr_trusted_handler;
extern const struct xattr_handler hfsplus_xattr_security_handler;
-extern const struct xattr_handler *hfsplus_xattr_handlers[];
+extern const struct xattr_handler * const hfsplus_xattr_handlers[];
int __hfsplus_setxattr(struct inode *inode, const char *name,
const void *value, size_t size, int flags);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index dc5a5cea5fae..ea87f24c6c3f 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -513,10 +513,14 @@ static int hostfs_inode_update(struct inode *ino, const struct hostfs_stat *st)
set_nlink(ino, st->nlink);
i_uid_write(ino, st->uid);
i_gid_write(ino, st->gid);
- ino->i_atime =
- (struct timespec64){ st->atime.tv_sec, st->atime.tv_nsec };
- ino->i_mtime =
- (struct timespec64){ st->mtime.tv_sec, st->mtime.tv_nsec };
+ inode_set_atime_to_ts(ino, (struct timespec64){
+ st->atime.tv_sec,
+ st->atime.tv_nsec,
+ });
+ inode_set_mtime_to_ts(ino, (struct timespec64){
+ st->mtime.tv_sec,
+ st->mtime.tv_nsec,
+ });
inode_set_ctime(ino, st->ctime.tv_sec, st->ctime.tv_nsec);
ino->i_size = st->size;
ino->i_blocks = st->blocks;
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index f36566d61215..49dd585c2b17 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -277,14 +277,16 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned in
* inode.
*/
- if (!inode_get_ctime(result).tv_sec) {
+ if (!inode_get_ctime_sec(result)) {
time64_t csec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date));
inode_set_ctime(result, csec ? csec : 1, 0);
- result->i_mtime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date));
- result->i_mtime.tv_nsec = 0;
- result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date));
- result->i_atime.tv_nsec = 0;
+ inode_set_mtime(result,
+ local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date)),
+ 0);
+ inode_set_atime(result,
+ local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date)),
+ 0);
hpfs_result->i_ea_size = le32_to_cpu(de->ea_size);
if (!hpfs_result->i_ea_mode && de->read_only)
result->i_mode &= ~0222;
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 479166378bae..a59e8fa630db 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -37,8 +37,8 @@ void hpfs_init_inode(struct inode *i)
hpfs_inode->i_dirty = 0;
inode_set_ctime(i, 0, 0);
- i->i_mtime.tv_sec = i->i_mtime.tv_nsec = 0;
- i->i_atime.tv_sec = i->i_atime.tv_nsec = 0;
+ inode_set_mtime(i, 0, 0);
+ inode_set_atime(i, 0, 0);
}
void hpfs_read_inode(struct inode *i)
@@ -230,9 +230,9 @@ void hpfs_write_inode_nolock(struct inode *i)
}
hpfs_write_inode_ea(i, fnode);
if (de) {
- de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec));
- de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec));
- de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime(i).tv_sec));
+ de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_mtime_sec(i)));
+ de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_atime_sec(i)));
+ de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime_sec(i)));
de->read_only = !(i->i_mode & 0222);
de->ea_size = cpu_to_le32(hpfs_inode->i_ea_size);
hpfs_mark_4buffers_dirty(&qbh);
@@ -240,9 +240,9 @@ void hpfs_write_inode_nolock(struct inode *i)
}
if (S_ISDIR(i->i_mode)) {
if ((de = map_dirent(i, hpfs_inode->i_dno, "\001\001", 2, NULL, &qbh))) {
- de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec));
- de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec));
- de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime(i).tv_sec));
+ de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_mtime_sec(i)));
+ de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_atime_sec(i)));
+ de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime_sec(i)));
de->read_only = !(i->i_mode & 0222);
de->ea_size = cpu_to_le32(/*hpfs_inode->i_ea_size*/0);
de->file_size = cpu_to_le32(0);
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index f4eb8d6f5989..9184b4584b01 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -12,10 +12,10 @@
static void hpfs_update_directory_times(struct inode *dir)
{
time64_t t = local_to_gmt(dir->i_sb, local_get_seconds(dir->i_sb));
- if (t == dir->i_mtime.tv_sec &&
- t == inode_get_ctime(dir).tv_sec)
+ if (t == inode_get_mtime_sec(dir) &&
+ t == inode_get_ctime_sec(dir))
return;
- dir->i_mtime = inode_set_ctime(dir, t, 0);
+ inode_set_mtime_to_ts(dir, inode_set_ctime(dir, t, 0));
hpfs_write_inode_nolock(dir);
}
@@ -58,8 +58,8 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
result->i_ino = fno;
hpfs_i(result)->i_parent_dir = dir->i_ino;
hpfs_i(result)->i_dno = dno;
- result->i_mtime = result->i_atime =
- inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
+ inode_set_mtime_to_ts(result,
+ inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0)));
hpfs_i(result)->i_ea_size = 0;
result->i_mode |= S_IFDIR;
result->i_op = &hpfs_dir_iops;
@@ -164,8 +164,8 @@ static int hpfs_create(struct mnt_idmap *idmap, struct inode *dir,
result->i_fop = &hpfs_file_ops;
set_nlink(result, 1);
hpfs_i(result)->i_parent_dir = dir->i_ino;
- result->i_mtime = result->i_atime =
- inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
+ inode_set_mtime_to_ts(result,
+ inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0)));
hpfs_i(result)->i_ea_size = 0;
if (dee.read_only)
result->i_mode &= ~0222;
@@ -245,8 +245,8 @@ static int hpfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
hpfs_init_inode(result);
result->i_ino = fno;
hpfs_i(result)->i_parent_dir = dir->i_ino;
- result->i_mtime = result->i_atime =
- inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
+ inode_set_mtime_to_ts(result,
+ inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0)));
hpfs_i(result)->i_ea_size = 0;
result->i_uid = current_fsuid();
result->i_gid = current_fsgid();
@@ -319,8 +319,8 @@ static int hpfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
result->i_ino = fno;
hpfs_init_inode(result);
hpfs_i(result)->i_parent_dir = dir->i_ino;
- result->i_mtime = result->i_atime =
- inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
+ inode_set_mtime_to_ts(result,
+ inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0)));
hpfs_i(result)->i_ea_size = 0;
result->i_mode = S_IFLNK | 0777;
result->i_uid = current_fsuid();
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 758a51564124..6b0ba3c1efba 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -725,10 +725,12 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
if (!de)
hpfs_error(s, "unable to find root dir");
else {
- root->i_atime.tv_sec = local_to_gmt(s, le32_to_cpu(de->read_date));
- root->i_atime.tv_nsec = 0;
- root->i_mtime.tv_sec = local_to_gmt(s, le32_to_cpu(de->write_date));
- root->i_mtime.tv_nsec = 0;
+ inode_set_atime(root,
+ local_to_gmt(s, le32_to_cpu(de->read_date)),
+ 0);
+ inode_set_mtime(root,
+ local_to_gmt(s, le32_to_cpu(de->write_date)),
+ 0);
inode_set_ctime(root,
local_to_gmt(s, le32_to_cpu(de->creation_date)),
0);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 316c4cebd3f3..f757d4f7ad98 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -83,29 +83,6 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
{}
};
-#ifdef CONFIG_NUMA
-static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
- struct inode *inode, pgoff_t index)
-{
- vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
- index);
-}
-
-static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
-{
- mpol_cond_put(vma->vm_policy);
-}
-#else
-static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
- struct inode *inode, pgoff_t index)
-{
-}
-
-static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
-{
-}
-#endif
-
/*
* Mask used when checking the page offset value passed in via system
* calls. This value will be converted to a loff_t which is signed.
@@ -135,7 +112,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND);
vma->vm_ops = &hugetlb_vm_ops;
- ret = seal_check_future_write(info->seals, vma);
+ ret = seal_check_write(info->seals, vma);
if (ret)
return ret;
@@ -295,7 +272,7 @@ static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t byt
size_t res = 0;
/* First subpage to start the loop. */
- page += offset / PAGE_SIZE;
+ page = nth_page(page, offset / PAGE_SIZE);
offset %= PAGE_SIZE;
while (1) {
if (is_raw_hwpoison_page_in_hugepage(page))
@@ -309,7 +286,7 @@ static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t byt
break;
offset += n;
if (offset == PAGE_SIZE) {
- page++;
+ page = nth_page(page, 1);
offset = 0;
}
}
@@ -334,7 +311,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
ssize_t retval = 0;
while (iov_iter_count(to)) {
- struct page *page;
+ struct folio *folio;
size_t nr, copied, want;
/* nr is the maximum number of bytes to copy from this page */
@@ -352,18 +329,18 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
nr = nr - offset;
- /* Find the page */
- page = find_lock_page(mapping, index);
- if (unlikely(page == NULL)) {
+ /* Find the folio */
+ folio = filemap_lock_hugetlb_folio(h, mapping, index);
+ if (IS_ERR(folio)) {
/*
* We have a HOLE, zero out the user-buffer for the
* length of the hole or request.
*/
copied = iov_iter_zero(nr, to);
} else {
- unlock_page(page);
+ folio_unlock(folio);
- if (!PageHWPoison(page))
+ if (!folio_test_has_hwpoisoned(folio))
want = nr;
else {
/*
@@ -371,19 +348,19 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
* touching the 1st raw HWPOISON subpage after
* offset.
*/
- want = adjust_range_hwpoison(page, offset, nr);
+ want = adjust_range_hwpoison(&folio->page, offset, nr);
if (want == 0) {
- put_page(page);
+ folio_put(folio);
retval = -EIO;
break;
}
}
/*
- * We have the page, copy it to user space buffer.
+ * We have the folio, copy it to user space buffer.
*/
- copied = copy_page_to_iter(page, offset, want, to);
- put_page(page);
+ copied = copy_folio_to_iter(folio, offset, want, to);
+ folio_put(folio);
}
offset += copied;
retval += copied;
@@ -661,21 +638,20 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
{
struct hstate *h = hstate_inode(inode);
struct address_space *mapping = &inode->i_data;
- const pgoff_t start = lstart >> huge_page_shift(h);
- const pgoff_t end = lend >> huge_page_shift(h);
+ const pgoff_t end = lend >> PAGE_SHIFT;
struct folio_batch fbatch;
pgoff_t next, index;
int i, freed = 0;
bool truncate_op = (lend == LLONG_MAX);
folio_batch_init(&fbatch);
- next = start;
+ next = lstart >> PAGE_SHIFT;
while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
for (i = 0; i < folio_batch_count(&fbatch); ++i) {
struct folio *folio = fbatch.folios[i];
u32 hash = 0;
- index = folio->index;
+ index = folio->index >> huge_page_order(h);
hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -693,7 +669,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
}
if (truncate_op)
- (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
+ (void)hugetlb_unreserve_pages(inode,
+ lstart >> huge_page_shift(h),
+ LONG_MAX, freed);
}
static void hugetlbfs_evict_inode(struct inode *inode)
@@ -741,7 +719,7 @@ static void hugetlbfs_zero_partial_page(struct hstate *h,
pgoff_t idx = start >> huge_page_shift(h);
struct folio *folio;
- folio = filemap_lock_folio(mapping, idx);
+ folio = filemap_lock_hugetlb_folio(h, mapping, idx);
if (IS_ERR(folio))
return;
@@ -852,8 +830,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
/*
* Initialize a pseudo vma as this is required by the huge page
- * allocation routines. If NUMA is configured, use page index
- * as input to create an allocation policy.
+ * allocation routines.
*/
vma_init(&pseudo_vma, mm);
vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
@@ -886,7 +863,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/* See if already present in mapping to avoid alloc/free */
- folio = filemap_get_folio(mapping, index);
+ folio = filemap_get_folio(mapping, index << huge_page_order(h));
if (!IS_ERR(folio)) {
folio_put(folio);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
@@ -901,9 +878,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
* folios in these areas, we need to consume the reserves
* to keep reservation accounting consistent.
*/
- hugetlb_set_vma_policy(&pseudo_vma, inode, index);
folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0);
- hugetlb_drop_vma_policy(&pseudo_vma);
if (IS_ERR(folio)) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
error = PTR_ERR(folio);
@@ -980,7 +955,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
inode->i_mode = S_IFDIR | ctx->mode;
inode->i_uid = ctx->uid;
inode->i_gid = ctx->gid;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_op = &hugetlbfs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
@@ -1024,7 +999,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
&hugetlbfs_i_mmap_rwsem_key);
inode->i_mapping->a_ops = &hugetlbfs_aops;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mapping->private_data = resv_map;
info->seals = F_SEAL_SEAL;
switch (mode & S_IFMT) {
@@ -1067,7 +1042,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
if (!inode)
return -ENOSPC;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
d_instantiate(dentry, inode);
dget(dentry);/* Extra count - pin the dentry in core */
return 0;
@@ -1099,7 +1074,7 @@ static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0);
if (!inode)
return -ENOSPC;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
d_tmpfile(file, inode);
return finish_open_simple(file, 0);
}
@@ -1121,7 +1096,7 @@ static int hugetlbfs_symlink(struct mnt_idmap *idmap,
} else
iput(inode);
}
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
return error;
}
@@ -1204,7 +1179,9 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
struct hstate *h = hstate_inode(d_inode(dentry));
+ u64 id = huge_encode_dev(dentry->d_sb->s_dev);
+ buf->f_fsid = u64_to_fsid(id);
buf->f_type = HUGETLBFS_MAGIC;
buf->f_bsize = huge_page_size(h);
if (sbinfo) {
@@ -1282,18 +1259,6 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
hugetlbfs_inc_free_inodes(sbinfo);
return NULL;
}
-
- /*
- * Any time after allocation, hugetlbfs_destroy_inode can be called
- * for the inode. mpol_free_shared_policy is unconditionally called
- * as part of hugetlbfs_destroy_inode. So, initialize policy here
- * in case of a quick call to destroy.
- *
- * Note that the policy is initialized even if we are creating a
- * private inode. This simplifies hugetlbfs_destroy_inode.
- */
- mpol_shared_policy_init(&p->policy, NULL);
-
return &p->vfs_inode;
}
@@ -1305,7 +1270,6 @@ static void hugetlbfs_free_inode(struct inode *inode)
static void hugetlbfs_destroy_inode(struct inode *inode)
{
hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
- mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
}
static const struct address_space_operations hugetlbfs_aops = {
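Several hugetlbfs hunks above are conversions between huge-page and base-page (PAGE_SIZE) index units: the fault-mutex hash is computed from the huge-page index, so folio->index is shifted down by huge_page_order(), while filemap_get_folio() is handed the index shifted back up. The arithmetic in isolation (the 4 KiB / 2 MiB page sizes below are an assumption for the example, not kernel constants):

#include <stdio.h>

#define PAGE_SHIFT	12				/* 4 KiB base pages (assumption) */
#define HPAGE_SHIFT	21				/* 2 MiB huge pages (assumption) */
#define HPAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)	/* 9 */

int main(void)
{
	unsigned long long offset = 5ULL << HPAGE_SHIFT;	/* start of huge page number 5 */

	unsigned long long base_index = offset >> PAGE_SHIFT;		/* 2560 */
	unsigned long long huge_index = base_index >> HPAGE_ORDER;	/* 5    */

	/* Round trip: the page-cache lookup wants the base-page index back. */
	printf("base %llu huge %llu back %llu\n",
	       base_index, huge_index, huge_index << HPAGE_ORDER);
	return 0;
}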
diff --git a/fs/init.c b/fs/init.c
index 9684406a8416..e9387b6c4f30 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -153,8 +153,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev)
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- if (!IS_POSIXACL(path.dentry->d_inode))
- mode &= ~current_umask();
+ mode = mode_strip_umask(d_inode(path.dentry), mode);
error = security_path_mknod(&path, dentry, mode, dev);
if (!error)
error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode,
@@ -229,8 +228,7 @@ int __init init_mkdir(const char *pathname, umode_t mode)
dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- if (!IS_POSIXACL(path.dentry->d_inode))
- mode &= ~current_umask();
+ mode = mode_strip_umask(d_inode(path.dentry), mode);
error = security_path_mkdir(&path, dentry, mode);
if (!error)
error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
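The init_mknod() and init_mkdir() hunks above replace the open-coded umask handling with a helper; the underlying computation is simply masking the requested creation mode with the complement of the process umask, skipped when POSIX ACLs supply default permissions instead. The masking in isolation (hypothetical values):

#include <stdio.h>

/* Strip the umask bits from a requested creation mode. */
static unsigned int strip_umask(unsigned int mode, unsigned int umask)
{
	return mode & ~umask;
}

int main(void)
{
	/* 0777 requested under the common 022 umask yields 0755. */
	printf("%o\n", strip_umask(0777, 022));
	return 0;
}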
diff --git a/fs/inode.c b/fs/inode.c
index 84bc3c76e5cc..f238d987dec9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -54,9 +54,9 @@
* inode_hash_lock
*/
-static unsigned int i_hash_mask __read_mostly;
-static unsigned int i_hash_shift __read_mostly;
-static struct hlist_head *inode_hashtable __read_mostly;
+static unsigned int i_hash_mask __ro_after_init;
+static unsigned int i_hash_shift __ro_after_init;
+static struct hlist_head *inode_hashtable __ro_after_init;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
/*
@@ -70,7 +70,7 @@ EXPORT_SYMBOL(empty_aops);
static DEFINE_PER_CPU(unsigned long, nr_inodes);
static DEFINE_PER_CPU(unsigned long, nr_unused);
-static struct kmem_cache *inode_cachep __read_mostly;
+static struct kmem_cache *inode_cachep __ro_after_init;
static long get_nr_inodes(void)
{
@@ -215,6 +215,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
lockdep_set_class_and_name(&mapping->invalidate_lock,
&sb->s_type->invalidate_lock_key,
"mapping.invalidate_lock");
+ if (sb->s_iflags & SB_I_STABLE_WRITES)
+ mapping_set_stable_writes(mapping);
inode->i_private = NULL;
inode->i_mapping = mapping;
INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
@@ -1837,27 +1839,29 @@ EXPORT_SYMBOL(bmap);
static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
struct timespec64 now)
{
- struct timespec64 ctime;
+ struct timespec64 atime, mtime, ctime;
if (!(mnt->mnt_flags & MNT_RELATIME))
return 1;
/*
* Is mtime younger than or equal to atime? If yes, update atime:
*/
- if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0)
+ atime = inode_get_atime(inode);
+ mtime = inode_get_mtime(inode);
+ if (timespec64_compare(&mtime, &atime) >= 0)
return 1;
/*
* Is ctime younger than or equal to atime? If yes, update atime:
*/
ctime = inode_get_ctime(inode);
- if (timespec64_compare(&ctime, &inode->i_atime) >= 0)
+ if (timespec64_compare(&ctime, &atime) >= 0)
return 1;
/*
* Is the previous atime value older than a day? If yes,
* update atime:
*/
- if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
+ if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60)
return 1;
/*
* Good, we can skip the atime update:
@@ -1888,12 +1892,13 @@ int inode_update_timestamps(struct inode *inode, int flags)
if (flags & (S_MTIME|S_CTIME|S_VERSION)) {
struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 mtime = inode_get_mtime(inode);
now = inode_set_ctime_current(inode);
if (!timespec64_equal(&now, &ctime))
updated |= S_CTIME;
- if (!timespec64_equal(&now, &inode->i_mtime)) {
- inode->i_mtime = now;
+ if (!timespec64_equal(&now, &mtime)) {
+ inode_set_mtime_to_ts(inode, now);
updated |= S_MTIME;
}
if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated))
@@ -1903,8 +1908,10 @@ int inode_update_timestamps(struct inode *inode, int flags)
}
if (flags & S_ATIME) {
- if (!timespec64_equal(&now, &inode->i_atime)) {
- inode->i_atime = now;
+ struct timespec64 atime = inode_get_atime(inode);
+
+ if (!timespec64_equal(&now, &atime)) {
+ inode_set_atime_to_ts(inode, now);
updated |= S_ATIME;
}
}
@@ -1963,7 +1970,7 @@ EXPORT_SYMBOL(inode_update_time);
bool atime_needs_update(const struct path *path, struct inode *inode)
{
struct vfsmount *mnt = path->mnt;
- struct timespec64 now;
+ struct timespec64 now, atime;
if (inode->i_flags & S_NOATIME)
return false;
@@ -1989,7 +1996,8 @@ bool atime_needs_update(const struct path *path, struct inode *inode)
if (!relatime_need_update(mnt, inode, now))
return false;
- if (timespec64_equal(&inode->i_atime, &now))
+ atime = inode_get_atime(inode);
+ if (timespec64_equal(&atime, &now))
return false;
return true;
@@ -2006,7 +2014,7 @@ void touch_atime(const struct path *path)
if (!sb_start_write_trylock(inode->i_sb))
return;
- if (__mnt_want_write(mnt) != 0)
+ if (mnt_get_write_access(mnt) != 0)
goto skip_update;
/*
* File systems can error out when updating inodes if they need to
@@ -2018,7 +2026,7 @@ void touch_atime(const struct path *path)
* of the fs read only, e.g. subvolumes in Btrfs.
*/
inode_update_time(inode, S_ATIME);
- __mnt_drop_write(mnt);
+ mnt_put_write_access(mnt);
skip_update:
sb_end_write(inode->i_sb);
}
@@ -2106,17 +2114,18 @@ static int inode_needs_update_time(struct inode *inode)
{
int sync_it = 0;
struct timespec64 now = current_time(inode);
- struct timespec64 ctime;
+ struct timespec64 ts;
/* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
return 0;
- if (!timespec64_equal(&inode->i_mtime, &now))
+ ts = inode_get_mtime(inode);
+ if (!timespec64_equal(&ts, &now))
sync_it = S_MTIME;
- ctime = inode_get_ctime(inode);
- if (!timespec64_equal(&ctime, &now))
+ ts = inode_get_ctime(inode);
+ if (!timespec64_equal(&ts, &now))
sync_it |= S_CTIME;
if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
@@ -2131,9 +2140,9 @@ static int __file_update_time(struct file *file, int sync_mode)
struct inode *inode = file_inode(file);
/* try to update time settings */
- if (!__mnt_want_write_file(file)) {
+ if (!mnt_get_write_access_file(file)) {
ret = inode_update_time(inode, sync_mode);
- __mnt_drop_write_file(file);
+ mnt_put_write_access_file(file);
}
return ret;
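The relatime_need_update() hunk above only changes how the timestamps are read (via inode_get_atime()/inode_get_mtime() instead of the struct fields); the decision itself is unchanged: atime is updated when mtime or ctime is not older than atime, or when the stored atime is more than a day old. A stand-alone sketch of that rule over plain struct timespec values (simplified: the MNT_RELATIME mount-flag check is left out):

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static int ts_cmp(const struct timespec *a, const struct timespec *b)
{
	if (a->tv_sec != b->tv_sec)
		return a->tv_sec < b->tv_sec ? -1 : 1;
	if (a->tv_nsec != b->tv_nsec)
		return a->tv_nsec < b->tv_nsec ? -1 : 1;
	return 0;
}

/* The three tests from relatime_need_update(), in the same order. */
static bool relatime_needs_update(struct timespec atime, struct timespec mtime,
				  struct timespec ctime_ts, struct timespec now)
{
	if (ts_cmp(&mtime, &atime) >= 0)		/* mtime younger or equal */
		return true;
	if (ts_cmp(&ctime_ts, &atime) >= 0)		/* ctime younger or equal */
		return true;
	if (now.tv_sec - atime.tv_sec >= 24 * 60 * 60)	/* atime older than a day */
		return true;
	return false;
}

int main(void)
{
	struct timespec now = { 1000000, 0 };
	struct timespec old = { 1000000 - 2 * 24 * 60 * 60, 0 };

	printf("%d\n", relatime_needs_update(old, old, old, now));	/* 1 */
	printf("%d\n", relatime_needs_update(now, old, old, now));	/* 0 */
	return 0;
}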
diff --git a/fs/internal.h b/fs/internal.h
index d64ae03998cc..58e43341aebf 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -73,8 +73,8 @@ extern int sb_prepare_remount_readonly(struct super_block *);
extern void __init mnt_init(void);
-extern int __mnt_want_write_file(struct file *);
-extern void __mnt_drop_write_file(struct file *);
+int mnt_get_write_access_file(struct file *file);
+void mnt_put_write_access_file(struct file *file);
extern void dissolve_on_fput(struct vfsmount *);
extern bool may_mount(void);
@@ -94,14 +94,22 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
struct file *alloc_empty_file(int flags, const struct cred *cred);
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
struct file *alloc_empty_backing_file(int flags, const struct cred *cred);
+void release_empty_file(struct file *f);
+
+static inline void file_put_write_access(struct file *file)
+{
+ put_write_access(file->f_inode);
+ mnt_put_write_access(file->f_path.mnt);
+ if (unlikely(file->f_mode & FMODE_BACKING))
+ mnt_put_write_access(backing_file_user_path(file)->mnt);
+}
static inline void put_file_access(struct file *file)
{
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
i_readcount_dec(file->f_inode);
} else if (file->f_mode & FMODE_WRITER) {
- put_write_access(file->f_inode);
- __mnt_drop_write(file->f_path.mnt);
+ file_put_write_access(file);
}
}
@@ -130,9 +138,9 @@ static inline void sb_start_ro_state_change(struct super_block *sb)
* mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY
* cleared, it will see s_readonly_remount set.
* For RW->RO transition, the barrier pairs with the barrier in
- * __mnt_want_write() before the mnt_is_readonly() check. The barrier
- * makes sure if __mnt_want_write() sees MNT_WRITE_HOLD already
- * cleared, it will see s_readonly_remount set.
+ * mnt_get_write_access() before the mnt_is_readonly() check.
+ * The barrier makes sure if mnt_get_write_access() sees MNT_WRITE_HOLD
+ * already cleared, it will see s_readonly_remount set.
*/
smp_wmb();
}
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 2bc0aa23fde3..f72df2babe56 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -29,9 +29,9 @@ typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
* and I/O completions.
*/
struct iomap_folio_state {
- atomic_t read_bytes_pending;
- atomic_t write_bytes_pending;
spinlock_t state_lock;
+ unsigned int read_bytes_pending;
+ atomic_t write_bytes_pending;
/*
* Each block has two bits in this bitmap:
@@ -57,30 +57,32 @@ static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs,
return test_bit(block, ifs->state);
}
-static void ifs_set_range_uptodate(struct folio *folio,
+static bool ifs_set_range_uptodate(struct folio *folio,
struct iomap_folio_state *ifs, size_t off, size_t len)
{
struct inode *inode = folio->mapping->host;
unsigned int first_blk = off >> inode->i_blkbits;
unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
unsigned int nr_blks = last_blk - first_blk + 1;
- unsigned long flags;
- spin_lock_irqsave(&ifs->state_lock, flags);
bitmap_set(ifs->state, first_blk, nr_blks);
- if (ifs_is_fully_uptodate(folio, ifs))
- folio_mark_uptodate(folio);
- spin_unlock_irqrestore(&ifs->state_lock, flags);
+ return ifs_is_fully_uptodate(folio, ifs);
}
static void iomap_set_range_uptodate(struct folio *folio, size_t off,
size_t len)
{
struct iomap_folio_state *ifs = folio->private;
+ unsigned long flags;
+ bool uptodate = true;
- if (ifs)
- ifs_set_range_uptodate(folio, ifs, off, len);
- else
+ if (ifs) {
+ spin_lock_irqsave(&ifs->state_lock, flags);
+ uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
+ spin_unlock_irqrestore(&ifs->state_lock, flags);
+ }
+
+ if (uptodate)
folio_mark_uptodate(folio);
}
@@ -181,7 +183,7 @@ static void ifs_free(struct folio *folio)
if (!ifs)
return;
- WARN_ON_ONCE(atomic_read(&ifs->read_bytes_pending));
+ WARN_ON_ONCE(ifs->read_bytes_pending != 0);
WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending));
WARN_ON_ONCE(ifs_is_fully_uptodate(folio, ifs) !=
folio_test_uptodate(folio));
@@ -248,20 +250,28 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
*lenp = plen;
}
-static void iomap_finish_folio_read(struct folio *folio, size_t offset,
+static void iomap_finish_folio_read(struct folio *folio, size_t off,
size_t len, int error)
{
struct iomap_folio_state *ifs = folio->private;
+ bool uptodate = !error;
+ bool finished = true;
- if (unlikely(error)) {
- folio_clear_uptodate(folio);
- folio_set_error(folio);
- } else {
- iomap_set_range_uptodate(folio, offset, len);
+ if (ifs) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&ifs->state_lock, flags);
+ if (!error)
+ uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
+ ifs->read_bytes_pending -= len;
+ finished = !ifs->read_bytes_pending;
+ spin_unlock_irqrestore(&ifs->state_lock, flags);
}
- if (!ifs || atomic_sub_and_test(len, &ifs->read_bytes_pending))
- folio_unlock(folio);
+ if (error)
+ folio_set_error(folio);
+ if (finished)
+ folio_end_read(folio, uptodate);
}
static void iomap_read_end_io(struct bio *bio)
@@ -358,8 +368,11 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
}
ctx->cur_folio_in_bio = true;
- if (ifs)
- atomic_add(plen, &ifs->read_bytes_pending);
+ if (ifs) {
+ spin_lock_irq(&ifs->state_lock);
+ ifs->read_bytes_pending += plen;
+ spin_unlock_irq(&ifs->state_lock);
+ }
sector = iomap_sector(iomap, pos);
if (!ctx->bio ||
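With read_bytes_pending now a plain counter guarded by state_lock, submission and completion serialize on the same lock that protects the uptodate bitmap, so the final completion can mark the folio and end the read in one step. A condensed sketch of the two sides, using the names from the hunks above:

/* submission (iomap_readpage_iter): account for bytes about to be read */
spin_lock_irq(&ifs->state_lock);
ifs->read_bytes_pending += plen;
spin_unlock_irq(&ifs->state_lock);

/* completion (iomap_finish_folio_read): mark range, detect the last bio */
spin_lock_irqsave(&ifs->state_lock, flags);
if (!error)
        uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
ifs->read_bytes_pending -= len;
finished = !ifs->read_bytes_pending;
spin_unlock_irqrestore(&ifs->state_lock, flags);
if (finished)
        folio_end_read(folio, uptodate);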
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 2ee21286ac8f..3e4d53e26f94 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1422,8 +1422,8 @@ static int isofs_read_inode(struct inode *inode, int relocated)
inode->i_ino, de->flags[-high_sierra]);
}
#endif
- inode->i_mtime = inode->i_atime =
- inode_set_ctime(inode, iso_date(de->date, high_sierra), 0);
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime(inode, iso_date(de->date, high_sierra), 0)));
ei->i_first_extent = (isonum_733(de->extent) +
isonum_711(de->ext_attr_length));
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 348783a70f57..d6c17ad69dee 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -426,16 +426,14 @@ repeat:
0);
}
if (rr->u.TF.flags & TF_MODIFY) {
- inode->i_mtime.tv_sec =
- iso_date(rr->u.TF.times[cnt++].time,
- 0);
- inode->i_mtime.tv_nsec = 0;
+ inode_set_mtime(inode,
+ iso_date(rr->u.TF.times[cnt++].time, 0),
+ 0);
}
if (rr->u.TF.flags & TF_ACCESS) {
- inode->i_atime.tv_sec =
- iso_date(rr->u.TF.times[cnt++].time,
- 0);
- inode->i_atime.tv_nsec = 0;
+ inode_set_atime(inode,
+ iso_date(rr->u.TF.times[cnt++].time, 0),
+ 0);
}
if (rr->u.TF.flags & TF_ATTRIBUTES) {
inode_set_ctime(inode,
@@ -531,9 +529,9 @@ repeat:
inode->i_rdev = reloc->i_rdev;
inode->i_size = reloc->i_size;
inode->i_blocks = reloc->i_blocks;
- inode->i_atime = reloc->i_atime;
+ inode_set_atime_to_ts(inode, inode_get_atime(reloc));
inode_set_ctime_to_ts(inode, inode_get_ctime(reloc));
- inode->i_mtime = reloc->i_mtime;
+ inode_set_mtime_to_ts(inode, inode_get_mtime(reloc));
iput(reloc);
break;
#ifdef CONFIG_ZISOFS
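The isofs hunks follow the timestamp-accessor conversion applied throughout this series: direct stores to inode->i_atime and inode->i_mtime become calls to the inode_set_*time helpers, which return the timespec64 they stored so assignments can still be chained. The idiom as a minimal sketch (secs stands in for the decoded on-disk time):

/* old style: assign the timespec64 fields directly */
inode->i_mtime = inode->i_atime = inode_set_ctime(inode, secs, 0);

/* new style: go through the accessors and chain their return values */
inode_set_mtime_to_ts(inode,
        inode_set_atime_to_ts(inode, inode_set_ctime(inode, secs, 0)));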
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 8d6f934c3d95..5e122586e06e 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -119,7 +119,7 @@ static int journal_submit_commit_record(journal_t *journal,
struct commit_header *tmp;
struct buffer_head *bh;
struct timespec64 now;
- blk_opf_t write_flags = REQ_OP_WRITE | REQ_SYNC;
+ blk_opf_t write_flags = REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS;
*cbh = NULL;
@@ -270,6 +270,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
if (!ret)
ret = err;
}
+ cond_resched();
spin_lock(&journal->j_list_lock);
jinode->i_flags &= ~JI_COMMIT_RUNNING;
smp_mb();
@@ -395,8 +396,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
*/
jbd2_journal_update_sb_log_tail(journal,
journal->j_tail_sequence,
- journal->j_tail,
- REQ_SYNC);
+ journal->j_tail, 0);
mutex_unlock(&journal->j_checkpoint_mutex);
} else {
jbd2_debug(3, "superblock not updated\n");
@@ -715,6 +715,7 @@ start_journal_io:
for (i = 0; i < bufs; i++) {
struct buffer_head *bh = wbuf[i];
+
/*
* Compute checksum.
*/
@@ -727,7 +728,8 @@ start_journal_io:
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = journal_end_buffer_io_sync;
- submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
+ submit_bh(REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS,
+ bh);
}
cond_resched();
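Commit-record and descriptor-block writes now go out with JBD2_JOURNAL_REQ_FLAGS instead of a bare REQ_SYNC. The macro is defined in include/linux/jbd2.h, outside these hunks; assuming it expands to the REQ_META | REQ_SYNC | REQ_IDLE set implied by the "high priority flags" comment in jbd2_write_superblock() below, the submissions look like:

/* assumed definition -- not visible in these hunks */
#define JBD2_JOURNAL_REQ_FLAGS  (REQ_META | REQ_SYNC | REQ_IDLE)

/* journal block submission carries the full flag set */
submit_bh(REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS, bh);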
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 30dec2bd2ecc..206cb53ef2b0 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1100,8 +1100,7 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
* space and if we lose sb update during power failure we'd replay
* old transaction with possibly newly overwritten data.
*/
- ret = jbd2_journal_update_sb_log_tail(journal, tid, block,
- REQ_SYNC | REQ_FUA);
+ ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA);
if (ret)
goto out;
@@ -1290,7 +1289,7 @@ static int jbd2_min_tag_size(void)
static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- journal_t *journal = container_of(shrink, journal_t, j_shrinker);
+ journal_t *journal = shrink->private_data;
unsigned long nr_to_scan = sc->nr_to_scan;
unsigned long nr_shrunk;
unsigned long count;
@@ -1316,7 +1315,7 @@ static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- journal_t *journal = container_of(shrink, journal_t, j_shrinker);
+ journal_t *journal = shrink->private_data;
unsigned long count;
count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
@@ -1588,14 +1587,21 @@ static journal_t *journal_init_common(struct block_device *bdev,
goto err_cleanup;
journal->j_shrink_transaction = NULL;
- journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan;
- journal->j_shrinker.count_objects = jbd2_journal_shrink_count;
- journal->j_shrinker.seeks = DEFAULT_SEEKS;
- journal->j_shrinker.batch = journal->j_max_transaction_buffers;
- err = register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)",
- MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
- if (err)
+
+ journal->j_shrinker = shrinker_alloc(0, "jbd2-journal:(%u:%u)",
+ MAJOR(bdev->bd_dev),
+ MINOR(bdev->bd_dev));
+ if (!journal->j_shrinker) {
+ err = -ENOMEM;
goto err_cleanup;
+ }
+
+ journal->j_shrinker->scan_objects = jbd2_journal_shrink_scan;
+ journal->j_shrinker->count_objects = jbd2_journal_shrink_count;
+ journal->j_shrinker->batch = journal->j_max_transaction_buffers;
+ journal->j_shrinker->private_data = journal;
+
+ shrinker_register(journal->j_shrinker);
return journal;
@@ -1768,8 +1774,7 @@ static int journal_reset(journal_t *journal)
*/
jbd2_journal_update_sb_log_tail(journal,
journal->j_tail_sequence,
- journal->j_tail,
- REQ_SYNC | REQ_FUA);
+ journal->j_tail, REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
}
return jbd2_journal_start_thread(journal);
@@ -1791,9 +1796,16 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags)
return -EIO;
}
- trace_jbd2_write_superblock(journal, write_flags);
+ /*
+ * Always set the high-priority flags so journal I/O is exempted from
+ * the block layer's QoS policies, e.g. the writeback throttle.
+ */
+ write_flags |= JBD2_JOURNAL_REQ_FLAGS;
if (!(journal->j_flags & JBD2_BARRIER))
write_flags &= ~(REQ_FUA | REQ_PREFLUSH);
+
+ trace_jbd2_write_superblock(journal, write_flags);
+
if (buffer_write_io_error(bh)) {
/*
* Oh, dear. A previous attempt to write the journal
@@ -2043,7 +2055,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
jbd2_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
sb->s_errno = cpu_to_be32(errcode);
- jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA);
+ jbd2_write_superblock(journal, REQ_FUA);
}
EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
@@ -2164,17 +2176,16 @@ int jbd2_journal_destroy(journal_t *journal)
++journal->j_transaction_sequence;
write_unlock(&journal->j_state_lock);
- jbd2_mark_journal_empty(journal,
- REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
+ jbd2_mark_journal_empty(journal, REQ_PREFLUSH | REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
} else
err = -EIO;
brelse(journal->j_sb_buffer);
}
- if (journal->j_shrinker.flags & SHRINKER_REGISTERED) {
+ if (journal->j_shrinker) {
percpu_counter_destroy(&journal->j_checkpoint_jh_count);
- unregister_shrinker(&journal->j_shrinker);
+ shrinker_free(journal->j_shrinker);
}
if (journal->j_proc_entry)
jbd2_stats_proc_exit(journal);
@@ -2466,7 +2477,7 @@ int jbd2_journal_flush(journal_t *journal, unsigned int flags)
* the magic code for a fully-recovered superblock. Any future
* commits of data to the journal will restore the current
* s_start value. */
- jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
+ jbd2_mark_journal_empty(journal, REQ_FUA);
if (flags)
err = __jbd2_journal_erase(journal, flags);
@@ -2512,7 +2523,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
if (write) {
/* Lock to make assertions happy... */
mutex_lock_io(&journal->j_checkpoint_mutex);
- jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
+ jbd2_mark_journal_empty(journal, REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
}
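The jbd2 shrinker moves from a struct shrinker embedded in journal_t to a pointer obtained from shrinker_alloc(), with the journal hung off private_data instead of being recovered via container_of(). The same allocate/register/free pattern appears again in the mbcache hunk further down. A condensed sketch of the lifecycle:

/* setup (journal_init_common) */
journal->j_shrinker = shrinker_alloc(0, "jbd2-journal:(%u:%u)",
                                     MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
if (!journal->j_shrinker)
        return -ENOMEM;         /* sketch: the real code jumps to err_cleanup */
journal->j_shrinker->scan_objects  = jbd2_journal_shrink_scan;
journal->j_shrinker->count_objects = jbd2_journal_shrink_count;
journal->j_shrinker->batch         = journal->j_max_transaction_buffers;
journal->j_shrinker->private_data  = journal;   /* replaces container_of() */
shrinker_register(journal->j_shrinker);

/* teardown (jbd2_journal_destroy) */
if (journal->j_shrinker)
        shrinker_free(journal->j_shrinker);     /* unregisters and frees */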
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index c269a7d29a46..01f744cb97a4 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -289,6 +289,8 @@ int jbd2_journal_recover(journal_t *journal)
journal_superblock_t * sb;
struct recovery_info info;
+ errseq_t wb_err;
+ struct address_space *mapping;
memset(&info, 0, sizeof(info));
sb = journal->j_superblock;
@@ -306,6 +308,9 @@ int jbd2_journal_recover(journal_t *journal)
return 0;
}
+ wb_err = 0;
+ mapping = journal->j_fs_dev->bd_inode->i_mapping;
+ errseq_check_and_advance(&mapping->wb_err, &wb_err);
err = do_one_pass(journal, &info, PASS_SCAN);
if (!err)
err = do_one_pass(journal, &info, PASS_REVOKE);
@@ -329,6 +334,9 @@ int jbd2_journal_recover(journal_t *journal)
err2 = sync_blockdev(journal->j_fs_dev);
if (!err)
err = err2;
+ err2 = errseq_check_and_advance(&mapping->wb_err, &wb_err);
+ if (!err)
+ err = err2;
/* Make sure all replayed data is on permanent storage */
if (journal->j_flags & JBD2_BARRIER) {
err2 = blkdev_issue_flush(journal->j_fs_dev);
@@ -632,7 +640,7 @@ static int do_one_pass(journal_t *journal,
success = err;
printk(KERN_ERR
"JBD2: IO error %d recovering "
- "block %ld in log\n",
+ "block %lu in log\n",
err, io_block);
} else {
unsigned long long blocknr;
@@ -661,7 +669,8 @@ static int do_one_pass(journal_t *journal,
printk(KERN_ERR "JBD2: Invalid "
"checksum recovering "
"data block %llu in "
- "log\n", blocknr);
+ "journal block %lu\n",
+ blocknr, io_block);
block_error = 1;
goto skip_write;
}
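Recovery now snapshots the writeback error cursor of the journal device's mapping before replay and checks it again after sync_blockdev(), so a write error hit while replaying blocks is reported instead of being lost. The underlying errseq pattern, reduced to a sketch (replay_blocks() is a hypothetical stand-in for the recovery passes):

errseq_t since = 0;
struct address_space *mapping = journal->j_fs_dev->bd_inode->i_mapping;

/* advance the private cursor to the device's current error state */
errseq_check_and_advance(&mapping->wb_err, &since);

replay_blocks();                        /* hypothetical: PASS_SCAN/REVOKE/REPLAY */
err = sync_blockdev(journal->j_fs_dev);

/* report any error recorded since the snapshot */
err2 = errseq_check_and_advance(&mapping->wb_err, &since);
if (!err)
        err = err2;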
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 091ab0eaabbe..2b2938970da3 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -204,8 +204,8 @@ static int jffs2_create(struct mnt_idmap *idmap, struct inode *dir_i,
if (ret)
goto fail;
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
- ITIME(je32_to_cpu(ri->ctime)));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(ri->ctime))));
jffs2_free_raw_inode(ri);
@@ -238,7 +238,8 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry)
if (dead_f->inocache)
set_nlink(d_inode(dentry), dead_f->inocache->pino_nlink);
if (!ret)
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(now)));
return ret;
}
/***********************************************************************/
@@ -272,7 +273,8 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
set_nlink(d_inode(old_dentry), ++f->inocache->pino_nlink);
mutex_unlock(&f->sem);
d_instantiate(dentry, d_inode(old_dentry));
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(now)));
ihold(d_inode(old_dentry));
}
return ret;
@@ -423,8 +425,8 @@ static int jffs2_symlink (struct mnt_idmap *idmap, struct inode *dir_i,
goto fail;
}
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
- ITIME(je32_to_cpu(rd->mctime)));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(rd->mctime))));
jffs2_free_raw_dirent(rd);
@@ -568,8 +570,8 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
goto fail;
}
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
- ITIME(je32_to_cpu(rd->mctime)));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(rd->mctime))));
inc_nlink(dir_i);
jffs2_free_raw_dirent(rd);
@@ -610,7 +612,8 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name,
dentry->d_name.len, f, now);
if (!ret) {
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(now)));
clear_nlink(d_inode(dentry));
drop_nlink(dir_i);
}
@@ -746,8 +749,8 @@ static int jffs2_mknod (struct mnt_idmap *idmap, struct inode *dir_i,
goto fail;
}
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
- ITIME(je32_to_cpu(rd->mctime)));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(rd->mctime))));
jffs2_free_raw_dirent(rd);
@@ -868,16 +871,18 @@ static int jffs2_rename (struct mnt_idmap *idmap,
* caller won't do it on its own since we are returning an error.
*/
d_invalidate(new_dentry);
- new_dir_i->i_mtime = inode_set_ctime_to_ts(new_dir_i,
- ITIME(now));
+ inode_set_mtime_to_ts(new_dir_i,
+ inode_set_ctime_to_ts(new_dir_i, ITIME(now)));
return ret;
}
if (d_is_dir(old_dentry))
drop_nlink(old_dir_i);
- old_dir_i->i_mtime = inode_set_ctime_to_ts(old_dir_i, ITIME(now));
- new_dir_i->i_mtime = inode_set_ctime_to_ts(new_dir_i, ITIME(now));
+ inode_set_mtime_to_ts(old_dir_i,
+ inode_set_ctime_to_ts(old_dir_i, ITIME(now)));
+ inode_set_mtime_to_ts(new_dir_i,
+ inode_set_ctime_to_ts(new_dir_i, ITIME(now)));
return 0;
}
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 11c66793960e..62ea76da7fdf 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -317,8 +317,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
inode->i_size = pos + writtenlen;
inode->i_blocks = (inode->i_size + 511) >> 9;
- inode->i_mtime = inode_set_ctime_to_ts(inode,
- ITIME(je32_to_cpu(ri->ctime)));
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(ri->ctime))));
}
}
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 0403efab4089..d175cccb7c55 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -113,8 +113,8 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
ri->isize = cpu_to_je32((ivalid & ATTR_SIZE)?iattr->ia_size:inode->i_size);
- ri->atime = cpu_to_je32(I_SEC((ivalid & ATTR_ATIME)?iattr->ia_atime:inode->i_atime));
- ri->mtime = cpu_to_je32(I_SEC((ivalid & ATTR_MTIME)?iattr->ia_mtime:inode->i_mtime));
+ ri->atime = cpu_to_je32(I_SEC((ivalid & ATTR_ATIME)?iattr->ia_atime:inode_get_atime(inode)));
+ ri->mtime = cpu_to_je32(I_SEC((ivalid & ATTR_MTIME)?iattr->ia_mtime:inode_get_mtime(inode)));
ri->ctime = cpu_to_je32(I_SEC((ivalid & ATTR_CTIME)?iattr->ia_ctime:inode_get_ctime(inode)));
ri->offset = cpu_to_je32(0);
@@ -147,9 +147,9 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
return PTR_ERR(new_metadata);
}
/* It worked. Update the inode */
- inode->i_atime = ITIME(je32_to_cpu(ri->atime));
+ inode_set_atime_to_ts(inode, ITIME(je32_to_cpu(ri->atime)));
inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(ri->ctime)));
- inode->i_mtime = ITIME(je32_to_cpu(ri->mtime));
+ inode_set_mtime_to_ts(inode, ITIME(je32_to_cpu(ri->mtime)));
inode->i_mode = jemode_to_cpu(ri->mode);
i_uid_write(inode, je16_to_cpu(ri->uid));
i_gid_write(inode, je16_to_cpu(ri->gid));
@@ -282,8 +282,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
i_uid_write(inode, je16_to_cpu(latest_node.uid));
i_gid_write(inode, je16_to_cpu(latest_node.gid));
inode->i_size = je32_to_cpu(latest_node.isize);
- inode->i_atime = ITIME(je32_to_cpu(latest_node.atime));
- inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime));
+ inode_set_atime_to_ts(inode, ITIME(je32_to_cpu(latest_node.atime)));
+ inode_set_mtime_to_ts(inode, ITIME(je32_to_cpu(latest_node.mtime)));
inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(latest_node.ctime)));
set_nlink(inode, f->inocache->pino_nlink);
@@ -386,8 +386,8 @@ void jffs2_dirty_inode(struct inode *inode, int flags)
iattr.ia_mode = inode->i_mode;
iattr.ia_uid = inode->i_uid;
iattr.ia_gid = inode->i_gid;
- iattr.ia_atime = inode->i_atime;
- iattr.ia_mtime = inode->i_mtime;
+ iattr.ia_atime = inode_get_atime(inode);
+ iattr.ia_mtime = inode_get_mtime(inode);
iattr.ia_ctime = inode_get_ctime(inode);
jffs2_do_setattr(inode, &iattr);
@@ -475,8 +475,8 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
inode->i_mode = jemode_to_cpu(ri->mode);
i_gid_write(inode, je16_to_cpu(ri->gid));
i_uid_write(inode, je16_to_cpu(ri->uid));
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
- ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime));
+ simple_inode_init_ts(inode);
+ ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode_get_mtime(inode)));
inode->i_blocks = 0;
inode->i_size = 0;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 50727a1ff931..86ab014a349c 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -36,8 +36,8 @@ struct kvec;
#define JFFS2_NOW() JFFS2_CLAMP_TIME(ktime_get_real_seconds())
#define I_SEC(tv) JFFS2_CLAMP_TIME((tv).tv_sec)
#define JFFS2_F_I_CTIME(f) I_SEC(inode_get_ctime(OFNI_EDONI_2SFFJ(f)))
-#define JFFS2_F_I_MTIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_mtime)
-#define JFFS2_F_I_ATIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_atime)
+#define JFFS2_F_I_MTIME(f) I_SEC(inode_get_mtime(OFNI_EDONI_2SFFJ(f)))
+#define JFFS2_F_I_ATIME(f) I_SEC(inode_get_atime(OFNI_EDONI_2SFFJ(f)))
#define sleep_on_spinunlock(wq, s) \
do { \
DECLARE_WAITQUEUE(__wait, current); \
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 7ea37f49f1e1..f99591a634b4 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -150,6 +150,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child)
}
static const struct export_operations jffs2_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.get_parent = jffs2_get_parent,
.fh_to_dentry = jffs2_fh_to_dentry,
.fh_to_parent = jffs2_fh_to_parent,
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 3b6bdc9a49e1..00224f3a8d6e 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -920,7 +920,7 @@ struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
* do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags)
* is an implementation of setxattr handler on jffs2.
* -------------------------------------------------- */
-const struct xattr_handler *jffs2_xattr_handlers[] = {
+const struct xattr_handler * const jffs2_xattr_handlers[] = {
&jffs2_user_xattr_handler,
#ifdef CONFIG_JFFS2_FS_SECURITY
&jffs2_security_xattr_handler,
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 1b5030a3349d..7e7de093ec0a 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -94,7 +94,7 @@ extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname
extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
const char *buffer, size_t size, int flags);
-extern const struct xattr_handler *jffs2_xattr_handlers[];
+extern const struct xattr_handler * const jffs2_xattr_handlers[];
extern const struct xattr_handler jffs2_user_xattr_handler;
extern const struct xattr_handler jffs2_trusted_xattr_handler;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 920d58a1566b..1a6b5921d17a 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -393,7 +393,7 @@ void jfs_truncate_nolock(struct inode *ip, loff_t length)
break;
}
- ip->i_mtime = inode_set_ctime_current(ip);
+ inode_set_mtime_to_ts(ip, inode_set_ctime_current(ip));
mark_inode_dirty(ip);
txCommit(tid, 1, &ip, 0);
diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h
index 6b231d0d0071..603aae17a693 100644
--- a/fs/jfs/jfs_dinode.h
+++ b/fs/jfs/jfs_dinode.h
@@ -96,7 +96,7 @@ struct dinode {
#define di_gengen u._file._u1._imap._gengen
union {
- xtpage_t _xtroot;
+ xtroot_t _xtroot;
struct {
u8 unused[16]; /* 16: */
dxd_t _dxd; /* 16: */
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 88afd108c2dd..11c77757ead9 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -87,7 +87,7 @@ static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno,
static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks);
static int dbFindBits(u32 word, int l2nb);
static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno);
-static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx);
+static int dbFindLeaf(dmtree_t *tp, int l2nb, int *leafidx, bool is_ctl);
static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
int nblocks);
static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
@@ -180,7 +180,8 @@ int dbMount(struct inode *ipbmap)
bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
- if (bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE) {
+ if (bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE ||
+ bmp->db_l2nbperpage < 0) {
err = -EINVAL;
goto err_release_metapage;
}
@@ -194,6 +195,12 @@ int dbMount(struct inode *ipbmap)
bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
+ if (bmp->db_maxag >= MAXAG || bmp->db_maxag < 0 ||
+ bmp->db_agpref >= MAXAG || bmp->db_agpref < 0) {
+ err = -EINVAL;
+ goto err_release_metapage;
+ }
+
bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight);
bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
@@ -1710,7 +1717,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
* dbFindLeaf() returns the index of the leaf at which
* free space was found.
*/
- rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx);
+ rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx, true);
/* release the buffer.
*/
@@ -1957,7 +1964,7 @@ dbAllocDmapLev(struct bmap * bmp,
* free space. if sufficient free space is found, dbFindLeaf()
* returns the index of the leaf at which free space was found.
*/
- if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx))
+ if (dbFindLeaf((dmtree_t *) &dp->tree, l2nb, &leafidx, false))
return -ENOSPC;
if (leafidx < 0)
@@ -2921,14 +2928,18 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
* leafidx - return pointer to be set to the index of the leaf
* describing at least l2nb free blocks if sufficient
* free blocks are found.
+ * is_ctl - true if the tree is a dmapctl (control page) tree
*
* RETURN VALUES:
* 0 - success
* -ENOSPC - insufficient free blocks.
*/
-static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
+static int dbFindLeaf(dmtree_t *tp, int l2nb, int *leafidx, bool is_ctl)
{
int ti, n = 0, k, x = 0;
+ int max_size;
+
+ max_size = is_ctl ? CTLTREESIZE : TREESIZE;
/* first check the root of the tree to see if there is
* sufficient free space.
@@ -2949,6 +2960,8 @@ static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
/* sufficient free space found. move to the next
* level (or quit if this is the last level).
*/
+ if (x + n > max_size)
+ return -ENOSPC;
if (l2nb <= tp->dmt_stree[x + n])
break;
}
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 923a58422c46..a037ee59e398 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -670,7 +670,7 @@ int diWrite(tid_t tid, struct inode *ip)
* This is the special xtree inside the directory for storing
* the directory table
*/
- xtpage_t *p, *xp;
+ xtroot_t *p, *xp;
xad_t *xad;
jfs_ip->xtlid = 0;
@@ -684,7 +684,7 @@ int diWrite(tid_t tid, struct inode *ip)
* copy xtree root from inode to dinode:
*/
p = &jfs_ip->i_xtroot;
- xp = (xtpage_t *) &dp->di_dirtable;
+ xp = (xtroot_t *) &dp->di_dirtable;
lv = ilinelock->lv;
for (n = 0; n < ilinelock->index; n++, lv++) {
memcpy(&xp->xad[lv->offset], &p->xad[lv->offset],
@@ -713,7 +713,7 @@ int diWrite(tid_t tid, struct inode *ip)
* regular file: 16 byte (XAD slot) granularity
*/
if (type & tlckXTREE) {
- xtpage_t *p, *xp;
+ xtroot_t *p, *xp;
xad_t *xad;
/*
@@ -1320,7 +1320,7 @@ diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp)
int diAlloc(struct inode *pip, bool dir, struct inode *ip)
{
int rc, ino, iagno, addext, extno, bitno, sword;
- int nwords, rem, i, agno;
+ int nwords, rem, i, agno, dn_numag;
u32 mask, inosmap, extsmap;
struct inode *ipimap;
struct metapage *mp;
@@ -1356,6 +1356,9 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
/* get the ag number of this iag */
agno = BLKTOAG(JFS_IP(pip)->agstart, JFS_SBI(pip->i_sb));
+ dn_numag = JFS_SBI(pip->i_sb)->bmap->db_numag;
+ if (agno < 0 || agno > dn_numag)
+ return -EIO;
if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) {
/*
@@ -3061,10 +3064,10 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip)
}
ip->i_size = le64_to_cpu(dip->di_size);
- ip->i_atime.tv_sec = le32_to_cpu(dip->di_atime.tv_sec);
- ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec);
- ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec);
- ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec);
+ inode_set_atime(ip, le32_to_cpu(dip->di_atime.tv_sec),
+ le32_to_cpu(dip->di_atime.tv_nsec));
+ inode_set_mtime(ip, le32_to_cpu(dip->di_mtime.tv_sec),
+ le32_to_cpu(dip->di_mtime.tv_nsec));
inode_set_ctime(ip, le32_to_cpu(dip->di_ctime.tv_sec),
le32_to_cpu(dip->di_ctime.tv_nsec));
ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks));
@@ -3138,12 +3141,12 @@ static void copy_to_dinode(struct dinode * dip, struct inode *ip)
else /* Leave the original permissions alone */
dip->di_mode = cpu_to_le32(jfs_ip->mode2);
- dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec);
- dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec);
- dip->di_ctime.tv_sec = cpu_to_le32(inode_get_ctime(ip).tv_sec);
- dip->di_ctime.tv_nsec = cpu_to_le32(inode_get_ctime(ip).tv_nsec);
- dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec);
- dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec);
+ dip->di_atime.tv_sec = cpu_to_le32(inode_get_atime_sec(ip));
+ dip->di_atime.tv_nsec = cpu_to_le32(inode_get_atime_nsec(ip));
+ dip->di_ctime.tv_sec = cpu_to_le32(inode_get_ctime_sec(ip));
+ dip->di_ctime.tv_nsec = cpu_to_le32(inode_get_ctime_nsec(ip));
+ dip->di_mtime.tv_sec = cpu_to_le32(inode_get_mtime_sec(ip));
+ dip->di_mtime.tv_nsec = cpu_to_le32(inode_get_mtime_nsec(ip));
dip->di_ixpxd = jfs_ip->ixpxd; /* in-memory pxd's are little-endian */
dip->di_acl = jfs_ip->acl; /* as are dxd's */
dip->di_ea = jfs_ip->ea;
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index 721def69e732..dd4264aa9bed 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -66,7 +66,7 @@ struct jfs_inode_info {
lid_t xtlid; /* lid of xtree lock on directory */
union {
struct {
- xtpage_t _xtroot; /* 288: xtree root */
+ xtroot_t _xtroot; /* 288: xtree root */
struct inomap *_imap; /* 4: inode map header */
} file;
struct {
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 87594efa7f7c..f10f295d1502 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -97,8 +97,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
jfs_inode->mode2 |= inode->i_mode;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
- jfs_inode->otime = inode_get_ctime(inode).tv_sec;
+ simple_inode_init_ts(inode);
+ jfs_inode->otime = inode_get_ctime_sec(inode);
inode->i_generation = JFS_SBI(sb)->gengen++;
jfs_inode->cflag = 0;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index e855b8fde76c..cb6d1fda66a7 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1058,7 +1058,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync)
int lmLogOpen(struct super_block *sb)
{
int rc;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
struct jfs_log *log;
struct jfs_sb_info *sbi = JFS_SBI(sb);
@@ -1070,7 +1070,7 @@ int lmLogOpen(struct super_block *sb)
mutex_lock(&jfs_log_mutex);
list_for_each_entry(log, &jfs_external_logs, journal_list) {
- if (log->bdev->bd_dev == sbi->logdev) {
+ if (log->bdev_handle->bdev->bd_dev == sbi->logdev) {
if (!uuid_equal(&log->uuid, &sbi->loguuid)) {
jfs_warn("wrong uuid on JFS journal");
mutex_unlock(&jfs_log_mutex);
@@ -1100,14 +1100,14 @@ int lmLogOpen(struct super_block *sb)
* file systems to log may have n-to-1 relationship;
*/
- bdev = blkdev_get_by_dev(sbi->logdev, BLK_OPEN_READ | BLK_OPEN_WRITE,
- log, NULL);
- if (IS_ERR(bdev)) {
- rc = PTR_ERR(bdev);
+ bdev_handle = bdev_open_by_dev(sbi->logdev,
+ BLK_OPEN_READ | BLK_OPEN_WRITE, log, NULL);
+ if (IS_ERR(bdev_handle)) {
+ rc = PTR_ERR(bdev_handle);
goto free;
}
- log->bdev = bdev;
+ log->bdev_handle = bdev_handle;
uuid_copy(&log->uuid, &sbi->loguuid);
/*
@@ -1141,7 +1141,7 @@ journal_found:
lbmLogShutdown(log);
close: /* close external log device */
- blkdev_put(bdev, log);
+ bdev_release(bdev_handle);
free: /* free log descriptor */
mutex_unlock(&jfs_log_mutex);
@@ -1162,7 +1162,7 @@ static int open_inline_log(struct super_block *sb)
init_waitqueue_head(&log->syncwait);
set_bit(log_INLINELOG, &log->flag);
- log->bdev = sb->s_bdev;
+ log->bdev_handle = sb->s_bdev_handle;
log->base = addressPXD(&JFS_SBI(sb)->logpxd);
log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >>
(L2LOGPSIZE - sb->s_blocksize_bits);
@@ -1436,7 +1436,7 @@ int lmLogClose(struct super_block *sb)
{
struct jfs_sb_info *sbi = JFS_SBI(sb);
struct jfs_log *log = sbi->log;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
int rc = 0;
jfs_info("lmLogClose: log:0x%p", log);
@@ -1482,10 +1482,10 @@ int lmLogClose(struct super_block *sb)
* external log as separate logical volume
*/
list_del(&log->journal_list);
- bdev = log->bdev;
+ bdev_handle = log->bdev_handle;
rc = lmLogShutdown(log);
- blkdev_put(bdev, log);
+ bdev_release(bdev_handle);
kfree(log);
@@ -1972,7 +1972,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
bp->l_flag |= lbmREAD;
- bio = bio_alloc(log->bdev, 1, REQ_OP_READ, GFP_NOFS);
+ bio = bio_alloc(log->bdev_handle->bdev, 1, REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
__bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
@@ -2110,10 +2110,15 @@ static void lbmStartIO(struct lbuf * bp)
{
struct bio *bio;
struct jfs_log *log = bp->l_log;
+ struct block_device *bdev = NULL;
jfs_info("lbmStartIO");
- bio = bio_alloc(log->bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS);
+ if (!log->no_integrity)
+ bdev = log->bdev_handle->bdev;
+
+ bio = bio_alloc(bdev, 1, REQ_OP_WRITE | REQ_SYNC,
+ GFP_NOFS);
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
__bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h
index 805877ce5020..84aa2d253907 100644
--- a/fs/jfs/jfs_logmgr.h
+++ b/fs/jfs/jfs_logmgr.h
@@ -356,7 +356,7 @@ struct jfs_log {
* before writing syncpt.
*/
struct list_head journal_list; /* Global list */
- struct block_device *bdev; /* 4: log lv pointer */
+ struct bdev_handle *bdev_handle; /* 4: log lv pointer */
int serial; /* 4: log mount serial number */
s64 base; /* @8: log extent address (inline log ) */
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index b83aae56a1f2..415eb65a36ff 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -430,7 +430,8 @@ int updateSuper(struct super_block *sb, uint state)
if (state == FM_MOUNT) {
/* record log's dev_t and mount serial number */
- j_sb->s_logdev = cpu_to_le32(new_encode_dev(sbi->log->bdev->bd_dev));
+ j_sb->s_logdev = cpu_to_le32(
+ new_encode_dev(sbi->log->bdev_handle->bdev->bd_dev));
j_sb->s_logserial = cpu_to_le32(sbi->log->serial);
} else if (state == FM_CLEAN) {
/*
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index ce4b4760fcb1..dccc8b3f1045 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -783,7 +783,7 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
if (mp->xflag & COMMIT_PAGE)
p = (xtpage_t *) mp->data;
else
- p = &jfs_ip->i_xtroot;
+ p = (xtpage_t *) &jfs_ip->i_xtroot;
xtlck->lwm.offset =
le16_to_cpu(p->header.nextindex);
}
@@ -1676,7 +1676,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
if (tlck->type & tlckBTROOT) {
lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);
- p = &JFS_IP(ip)->i_xtroot;
+ p = (xtpage_t *) &JFS_IP(ip)->i_xtroot;
if (S_ISDIR(ip->i_mode))
lrd->log.redopage.type |=
cpu_to_le16(LOG_DIR_XTREE);
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index 0d33816d251d..ec67d8554d2c 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -46,7 +46,7 @@ extern int __jfs_setxattr(tid_t, struct inode *, const char *, const void *,
extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t);
extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
-extern const struct xattr_handler *jfs_xattr_handlers[];
+extern const struct xattr_handler * const jfs_xattr_handlers[];
#ifdef CONFIG_JFS_SECURITY
extern int jfs_init_security(tid_t, struct inode *, struct inode *,
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 2d304cee884c..5ee618d17e77 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -1213,7 +1213,7 @@ xtSplitRoot(tid_t tid,
struct xtlock *xtlck;
int rc;
- sp = &JFS_IP(ip)->i_xtroot;
+ sp = (xtpage_t *) &JFS_IP(ip)->i_xtroot;
INCREMENT(xtStat.split);
@@ -2098,7 +2098,7 @@ int xtAppend(tid_t tid, /* transaction id */
*/
void xtInitRoot(tid_t tid, struct inode *ip)
{
- xtpage_t *p;
+ xtroot_t *p;
/*
* acquire a transaction lock on the root
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index ad7592191d76..0f6cf5a1ce75 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -65,24 +65,33 @@ struct xadlist {
#define XTPAGEMAXSLOT 256
#define XTENTRYSTART 2
-/*
- * xtree page:
- */
-typedef union {
- struct xtheader {
- __le64 next; /* 8: */
- __le64 prev; /* 8: */
+struct xtheader {
+ __le64 next; /* 8: */
+ __le64 prev; /* 8: */
- u8 flag; /* 1: */
- u8 rsrvd1; /* 1: */
- __le16 nextindex; /* 2: next index = number of entries */
- __le16 maxentry; /* 2: max number of entries */
- __le16 rsrvd2; /* 2: */
+ u8 flag; /* 1: */
+ u8 rsrvd1; /* 1: */
+ __le16 nextindex; /* 2: next index = number of entries */
+ __le16 maxentry; /* 2: max number of entries */
+ __le16 rsrvd2; /* 2: */
- pxd_t self; /* 8: self */
- } header; /* (32) */
+ pxd_t self; /* 8: self */
+};
+/*
+ * xtree root (in inode):
+ */
+typedef union {
+ struct xtheader header;
xad_t xad[XTROOTMAXSLOT]; /* 16 * maxentry: xad array */
+} xtroot_t;
+
+/*
+ * xtree page:
+ */
+typedef union {
+ struct xtheader header;
+ xad_t xad[XTPAGEMAXSLOT]; /* 16 * maxentry: xad array */
} xtpage_t;
/*
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 57d7a4300210..d68a4e6ac345 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -149,7 +149,7 @@ static int jfs_create(struct mnt_idmap *idmap, struct inode *dip,
mark_inode_dirty(ip);
- dip->i_mtime = inode_set_ctime_current(dip);
+ inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip));
mark_inode_dirty(dip);
@@ -284,7 +284,7 @@ static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip,
/* update parent directory inode */
inc_nlink(dip); /* for '..' from child directory */
- dip->i_mtime = inode_set_ctime_current(dip);
+ inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip));
mark_inode_dirty(dip);
rc = txCommit(tid, 2, &iplist[0], 0);
@@ -390,7 +390,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
/* update parent directory's link count corresponding
* to ".." entry of the target directory deleted
*/
- dip->i_mtime = inode_set_ctime_current(dip);
+ inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip));
inode_dec_link_count(dip);
/*
@@ -512,7 +512,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
ASSERT(ip->i_nlink);
- dip->i_mtime = inode_set_ctime_to_ts(dip, inode_set_ctime_current(ip));
+ inode_set_mtime_to_ts(dip,
+ inode_set_ctime_to_ts(dip, inode_set_ctime_current(ip)));
mark_inode_dirty(dip);
/* update target's inode */
@@ -828,7 +829,7 @@ static int jfs_link(struct dentry *old_dentry,
/* update object inode */
inc_nlink(ip); /* for new link */
inode_set_ctime_current(ip);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
ihold(ip);
@@ -1028,7 +1029,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip,
mark_inode_dirty(ip);
- dip->i_mtime = inode_set_ctime_current(dip);
+ inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip));
mark_inode_dirty(dip);
/*
* commit update of parent directory and link object
@@ -1271,7 +1272,7 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
inode_set_ctime_current(old_ip);
mark_inode_dirty(old_ip);
- new_dir->i_mtime = inode_set_ctime_current(new_dir);
+ inode_set_mtime_to_ts(new_dir, inode_set_ctime_current(new_dir));
mark_inode_dirty(new_dir);
/* Build list of inodes modified by this transaction */
@@ -1283,7 +1284,8 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (old_dir != new_dir) {
iplist[ipcount++] = new_dir;
- old_dir->i_mtime = inode_set_ctime_current(old_dir);
+ inode_set_mtime_to_ts(old_dir,
+ inode_set_ctime_current(old_dir));
mark_inode_dirty(old_dir);
}
@@ -1416,7 +1418,7 @@ static int jfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
mark_inode_dirty(ip);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 2e2f7f6d36a0..8d8e556bd610 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -818,7 +818,7 @@ out:
}
if (inode->i_size < off+len-towrite)
i_size_write(inode, off+len-towrite);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
inode_unlock(inode);
return len - towrite;
@@ -896,6 +896,7 @@ static const struct super_operations jfs_super_operations = {
};
static const struct export_operations jfs_export_operations = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = jfs_fh_to_dentry,
.fh_to_parent = jfs_fh_to_parent,
.get_parent = jfs_get_parent,
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 8577ad494e05..0fb7afac298e 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -985,7 +985,7 @@ static const struct xattr_handler jfs_trusted_xattr_handler = {
.set = jfs_xattr_set,
};
-const struct xattr_handler *jfs_xattr_handlers[] = {
+const struct xattr_handler * const jfs_xattr_handlers[] = {
&jfs_os2_xattr_handler,
&jfs_user_xattr_handler,
&jfs_security_xattr_handler,
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 180906c36f51..f0cb729e9a97 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -429,60 +429,11 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
return ret;
}
-#ifdef CONFIG_NUMA
-static int kernfs_vma_set_policy(struct vm_area_struct *vma,
- struct mempolicy *new)
-{
- struct file *file = vma->vm_file;
- struct kernfs_open_file *of = kernfs_of(file);
- int ret;
-
- if (!of->vm_ops)
- return 0;
-
- if (!kernfs_get_active(of->kn))
- return -EINVAL;
-
- ret = 0;
- if (of->vm_ops->set_policy)
- ret = of->vm_ops->set_policy(vma, new);
-
- kernfs_put_active(of->kn);
- return ret;
-}
-
-static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma,
- unsigned long addr)
-{
- struct file *file = vma->vm_file;
- struct kernfs_open_file *of = kernfs_of(file);
- struct mempolicy *pol;
-
- if (!of->vm_ops)
- return vma->vm_policy;
-
- if (!kernfs_get_active(of->kn))
- return vma->vm_policy;
-
- pol = vma->vm_policy;
- if (of->vm_ops->get_policy)
- pol = of->vm_ops->get_policy(vma, addr);
-
- kernfs_put_active(of->kn);
- return pol;
-}
-
-#endif
-
static const struct vm_operations_struct kernfs_vm_ops = {
.open = kernfs_vma_open,
.fault = kernfs_vma_fault,
.page_mkwrite = kernfs_vma_page_mkwrite,
.access = kernfs_vma_access,
-#ifdef CONFIG_NUMA
- .set_policy = kernfs_vma_set_policy,
- .get_policy = kernfs_vma_get_policy,
-#endif
};
static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
@@ -903,6 +854,33 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
return ret;
}
+static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct kernfs_open_file *of = kernfs_of(file);
+ const struct kernfs_ops *ops;
+ loff_t ret;
+
+ /*
+ * @of->mutex nests outside active ref and is primarily to ensure that
+ * the ops aren't called concurrently for the same open file.
+ */
+ mutex_lock(&of->mutex);
+ if (!kernfs_get_active(of->kn)) {
+ mutex_unlock(&of->mutex);
+ return -ENODEV;
+ }
+
+ ops = kernfs_ops(of->kn);
+ if (ops->llseek)
+ ret = ops->llseek(of, offset, whence);
+ else
+ ret = generic_file_llseek(file, offset, whence);
+
+ kernfs_put_active(of->kn);
+ mutex_unlock(&of->mutex);
+ return ret;
+}
+
static void kernfs_notify_workfn(struct work_struct *work)
{
struct kernfs_node *kn;
@@ -1005,7 +983,7 @@ EXPORT_SYMBOL_GPL(kernfs_notify);
const struct file_operations kernfs_file_fops = {
.read_iter = kernfs_fop_read_iter,
.write_iter = kernfs_fop_write_iter,
- .llseek = generic_file_llseek,
+ .llseek = kernfs_fop_llseek,
.mmap = kernfs_fop_mmap,
.open = kernfs_fop_open,
.release = kernfs_fop_release,
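kernfs_fop_llseek() gives individual attributes a chance to implement their own seek semantics, while everything else keeps the old generic_file_llseek() behaviour. A hypothetical kernfs_ops wiring an llseek handler; the handler's signature is taken from the ops->llseek(of, offset, whence) call above, and the names and policy are illustrative only:

static int example_seq_show(struct seq_file *sf, void *v)
{
        seq_puts(sf, "example\n");
        return 0;
}

static loff_t example_llseek(struct kernfs_open_file *of, loff_t offset,
                             int whence)
{
        /* example policy: absolute seeks only, within the first page */
        if (whence != SEEK_SET || offset < 0 || offset > PAGE_SIZE)
                return -EINVAL;
        return vfs_setpos(of->file, offset, PAGE_SIZE);
}

static const struct kernfs_ops example_ops = {
        .seq_show = example_seq_show,
        .llseek   = example_llseek,
};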
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 922719a343a7..b83054da68b3 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -151,7 +151,7 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
{
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
static inline void set_inode_attr(struct inode *inode,
@@ -159,8 +159,8 @@ static inline void set_inode_attr(struct inode *inode,
{
inode->i_uid = attrs->ia_uid;
inode->i_gid = attrs->ia_gid;
- inode->i_atime = attrs->ia_atime;
- inode->i_mtime = attrs->ia_mtime;
+ inode_set_atime_to_ts(inode, attrs->ia_atime);
+ inode_set_mtime_to_ts(inode, attrs->ia_mtime);
inode_set_ctime_to_ts(inode, attrs->ia_ctime);
}
@@ -445,7 +445,7 @@ static const struct xattr_handler kernfs_user_xattr_handler = {
.set = kernfs_vfs_user_xattr_set,
};
-const struct xattr_handler *kernfs_xattr_handlers[] = {
+const struct xattr_handler * const kernfs_xattr_handlers[] = {
&kernfs_trusted_xattr_handler,
&kernfs_security_xattr_handler,
&kernfs_user_xattr_handler,
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index a9b854cdfdb5..237f2764b941 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -127,7 +127,7 @@ extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
/*
* inode.c
*/
-extern const struct xattr_handler *kernfs_xattr_handlers[];
+extern const struct xattr_handler * const kernfs_xattr_handlers[];
void kernfs_evict_inode(struct inode *inode);
int kernfs_iop_permission(struct mnt_idmap *idmap,
struct inode *inode, int mask);
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index c4bf26142eec..4628edde2e7e 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -21,8 +21,9 @@
#include "kernfs-internal.h"
-struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
-struct kernfs_global_locks *kernfs_locks;
+struct kmem_cache *kernfs_node_cache __ro_after_init;
+struct kmem_cache *kernfs_iattrs_cache __ro_after_init;
+struct kernfs_global_locks *kernfs_locks __ro_after_init;
static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
{
@@ -265,7 +266,7 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k
sb->s_time_gran = 1;
/* sysfs dentries and inodes don't require IO to create */
- sb->s_shrink.seeks = 0;
+ sb->s_shrink->seeks = 0;
/* get root inode, initialize and unlock it */
down_read(&kf_root->kernfs_rwsem);
diff --git a/fs/libfs.c b/fs/libfs.c
index 37f2d34ee090..c2aa6fd4795c 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -41,6 +41,9 @@ EXPORT_SYMBOL(simple_getattr);
int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
{
+ u64 id = huge_encode_dev(dentry->d_sb->s_dev);
+
+ buf->f_fsid = u64_to_fsid(id);
buf->f_type = dentry->d_sb->s_magic;
buf->f_bsize = PAGE_SIZE;
buf->f_namelen = NAME_MAX;
@@ -396,6 +399,8 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
return -EINVAL;
}
+ /* In this case, ->private_data is protected by f_pos_lock */
+ file->private_data = NULL;
return vfs_setpos(file, offset, U32_MAX);
}
@@ -425,7 +430,7 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
inode->i_ino, fs_umode_to_dtype(inode->i_mode));
}
-static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
+static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
{
struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode);
XA_STATE(xas, &so_ctx->xa, ctx->pos);
@@ -434,7 +439,7 @@ static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
while (true) {
dentry = offset_find_next(&xas);
if (!dentry)
- break;
+ return ERR_PTR(-ENOENT);
if (!offset_dir_emit(ctx, dentry)) {
dput(dentry);
@@ -444,6 +449,7 @@ static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
dput(dentry);
ctx->pos = xas.xa_index + 1;
}
+ return NULL;
}
/**
@@ -476,7 +482,12 @@ static int offset_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit_dots(file, ctx))
return 0;
- offset_iterate_dir(d_inode(dir), ctx);
+ /* In this case, ->private_data is protected by f_pos_lock */
+ if (ctx->pos == 2)
+ file->private_data = NULL;
+ else if (file->private_data == ERR_PTR(-ENOENT))
+ return 0;
+ file->private_data = offset_iterate_dir(d_inode(dir), ctx);
return 0;
}
@@ -541,7 +552,8 @@ void simple_recursive_removal(struct dentry *dentry,
dput(victim); // unpin it
}
if (victim == dentry) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_current(inode));
if (d_is_dir(dentry))
drop_nlink(inode);
inode_unlock(inode);
@@ -582,7 +594,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
*/
root->i_ino = 1;
root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
- root->i_atime = root->i_mtime = inode_set_ctime_current(root);
+ simple_inode_init_ts(root);
s->s_root = d_make_root(root);
if (!s->s_root)
return -ENOMEM;
@@ -638,8 +650,8 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
{
struct inode *inode = d_inode(old_dentry);
- dir->i_mtime = inode_set_ctime_to_ts(dir,
- inode_set_ctime_current(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
inc_nlink(inode);
ihold(inode);
dget(dentry);
@@ -673,8 +685,8 @@ int simple_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- dir->i_mtime = inode_set_ctime_to_ts(dir,
- inode_set_ctime_current(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
drop_nlink(inode);
dput(dentry);
return 0;
@@ -709,9 +721,10 @@ void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
{
struct inode *newino = d_inode(new_dentry);
- old_dir->i_mtime = inode_set_ctime_current(old_dir);
+ inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
if (new_dir != old_dir)
- new_dir->i_mtime = inode_set_ctime_current(new_dir);
+ inode_set_mtime_to_ts(new_dir,
+ inode_set_ctime_current(new_dir));
inode_set_ctime_current(d_inode(old_dentry));
if (newino)
inode_set_ctime_current(newino);
@@ -926,7 +939,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
*/
inode->i_ino = 1;
inode->i_mode = S_IFDIR | 0755;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
set_nlink(inode, 2);
@@ -952,7 +965,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
goto out;
}
inode->i_mode = S_IFREG | files->mode;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_fop = files->ops;
inode->i_ino = i;
d_add(dentry, inode);
@@ -1308,6 +1321,47 @@ ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
EXPORT_SYMBOL_GPL(simple_attr_write_signed);
/**
+ * generic_encode_ino32_fh - generic export_operations->encode_fh function
+ * @inode: the object to encode
+ * @fh: where to store the file handle fragment
+ * @max_len: maximum length to store there (in 4 byte units)
+ * @parent: parent directory inode, if wanted
+ *
+ * This generic encode_fh function assumes that the 32-bit inode number
+ * is suitable for locating an inode, and that the generation number
+ * can be used to check that it is still valid. It places them in the
+ * filehandle fragment where export_decode_fh expects to find them.
+ */
+int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len,
+ struct inode *parent)
+{
+ struct fid *fid = (void *)fh;
+ int len = *max_len;
+ int type = FILEID_INO32_GEN;
+
+ if (parent && (len < 4)) {
+ *max_len = 4;
+ return FILEID_INVALID;
+ } else if (len < 2) {
+ *max_len = 2;
+ return FILEID_INVALID;
+ }
+
+ len = 2;
+ fid->i32.ino = inode->i_ino;
+ fid->i32.gen = inode->i_generation;
+ if (parent) {
+ fid->i32.parent_ino = parent->i_ino;
+ fid->i32.parent_gen = parent->i_generation;
+ len = 4;
+ type = FILEID_INO32_GEN_PARENT;
+ }
+ *max_len = len;
+ return type;
+}
+EXPORT_SYMBOL_GPL(generic_encode_ino32_fh);
+
+/**
* generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
* @sb: filesystem to do the file handle conversion on
* @fid: file handle to convert
@@ -1520,7 +1574,7 @@ struct inode *alloc_anon_inode(struct super_block *s)
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_flags |= S_PRIVATE;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
return inode;
}
EXPORT_SYMBOL(alloc_anon_inode);
@@ -1912,3 +1966,20 @@ ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
return direct_written + buffered_written;
}
EXPORT_SYMBOL_GPL(direct_write_fallback);
+
+/**
+ * simple_inode_init_ts - initialize the timestamps for a new inode
+ * @inode: inode to be initialized
+ *
+ * When a new inode is created, most filesystems set the timestamps to the
+ * current time. This helper sets atime, mtime and ctime to the same
+ * current-time value and returns that value.
+ */
+struct timespec64 simple_inode_init_ts(struct inode *inode)
+{
+ struct timespec64 ts = inode_set_ctime_current(inode);
+
+ inode_set_atime_to_ts(inode, ts);
+ inode_set_mtime_to_ts(inode, ts);
+ return ts;
+}
+EXPORT_SYMBOL(simple_inode_init_ts);
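generic_encode_ino32_fh() is the helper that the jffs2 and jfs export_operations hunks above plug in as .encode_fh; any filesystem whose inodes are fully identified by a 32-bit inode number plus generation can reuse it. A minimal wiring sketch (the examplefs_* names are placeholders for the filesystem's own decoders):

static const struct export_operations examplefs_export_ops = {
        .encode_fh    = generic_encode_ino32_fh,
        .fh_to_dentry = examplefs_fh_to_dentry,  /* placeholder decoder */
        .fh_to_parent = examplefs_fh_to_parent,  /* placeholder decoder */
        .get_parent   = examplefs_get_parent,    /* placeholder */
};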
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 6579948070a4..81be07c1d3d1 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -24,7 +24,6 @@
#include <linux/uio.h>
#include <linux/smp.h>
#include <linux/mutex.h>
-#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/inetdevice.h>
@@ -135,11 +134,11 @@ lockd(void *vrqstp)
* The main request loop. We don't terminate until the last
* NFS mount or NFS daemon has gone away.
*/
- while (!kthread_should_stop()) {
+ while (!svc_thread_should_stop(rqstp)) {
/* update sv_maxconn if it has changed */
rqstp->rq_server->sv_maxconn = nlm_max_connections;
- nlmsvc_retry_blocked();
+ nlmsvc_retry_blocked(rqstp);
svc_recv(rqstp);
}
if (nlmsvc_ops)
@@ -373,7 +372,9 @@ static void lockd_put(void)
unregister_inet6addr_notifier(&lockd_inet6addr_notifier);
#endif
+ svc_get(nlmsvc_serv);
svc_set_num_threads(nlmsvc_serv, NULL, 0);
+ svc_put(nlmsvc_serv);
timer_delete_sync(&nlmsvc_retry);
nlmsvc_serv = NULL;
dprintk("lockd_down: service destroyed\n");
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 43aeba9de55c..2dc10900ad1c 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -30,7 +30,6 @@
#include <linux/sunrpc/svc_xprt.h>
#include <linux/lockd/nlm.h>
#include <linux/lockd/lockd.h>
-#include <linux/kthread.h>
#include <linux/exportfs.h>
#define NLMDBG_FACILITY NLMDBG_SVCLOCK
@@ -481,9 +480,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
struct nlm_host *host, struct nlm_lock *lock, int wait,
struct nlm_cookie *cookie, int reclaim)
{
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
struct inode *inode = nlmsvc_file_inode(file);
-#endif
struct nlm_block *block = NULL;
int error;
int mode;
@@ -497,7 +494,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
(long long)lock->fl.fl_end,
wait);
- if (nlmsvc_file_file(file)->f_op->lock) {
+ if (!exportfs_lock_op_is_async(inode->i_sb->s_export_op)) {
async_block = wait;
wait = 0;
}
@@ -543,6 +540,25 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
goto out;
}
+ spin_lock(&nlm_blocked_lock);
+ /*
+ * If this is a lock request for an already pending
+ * lock request we return nlm_lck_blocked without calling
+ * vfs_lock_file() again. Otherwise we would have two pending
+ * requests on the underlying ->lock() implementation but
+ * only one nlm_block to be granted by lm_grant().
+ */
+ if (exportfs_lock_op_is_async(inode->i_sb->s_export_op) &&
+ !list_empty(&block->b_list)) {
+ spin_unlock(&nlm_blocked_lock);
+ ret = nlm_lck_blocked;
+ goto out;
+ }
+
+ /* Append to list of blocked */
+ nlmsvc_insert_block_locked(block, NLM_NEVER);
+ spin_unlock(&nlm_blocked_lock);
+
if (!wait)
lock->fl.fl_flags &= ~FL_SLEEP;
mode = lock_to_openmode(&lock->fl);
@@ -552,16 +568,12 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
dprintk("lockd: vfs_lock_file returned %d\n", error);
switch (error) {
case 0:
+ nlmsvc_remove_block(block);
ret = nlm_granted;
goto out;
case -EAGAIN:
- /*
- * If this is a blocking request for an
- * already pending lock request then we need
- * to put it back on lockd's block list
- */
- if (wait)
- break;
+ if (!wait)
+ nlmsvc_remove_block(block);
ret = async_block ? nlm_lck_blocked : nlm_lck_denied;
goto out;
case FILE_LOCK_DEFERRED:
@@ -572,17 +584,16 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
ret = nlmsvc_defer_lock_rqst(rqstp, block);
goto out;
case -EDEADLK:
+ nlmsvc_remove_block(block);
ret = nlm_deadlock;
goto out;
default: /* includes ENOLCK */
+ nlmsvc_remove_block(block);
ret = nlm_lck_denied_nolocks;
goto out;
}
ret = nlm_lck_blocked;
-
- /* Append to list of blocked */
- nlmsvc_insert_block(block, NLM_NEVER);
out:
mutex_unlock(&file->f_mutex);
nlmsvc_release_block(block);
@@ -1020,13 +1031,13 @@ retry_deferred_block(struct nlm_block *block)
* be retransmitted.
*/
void
-nlmsvc_retry_blocked(void)
+nlmsvc_retry_blocked(struct svc_rqst *rqstp)
{
unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
struct nlm_block *block;
spin_lock(&nlm_blocked_lock);
- while (!list_empty(&nlm_blocked) && !kthread_should_stop()) {
+ while (!list_empty(&nlm_blocked) && !svc_thread_should_stop(rqstp)) {
block = list_entry(nlm_blocked.next, struct nlm_block, b_list);
if (block->b_when == NLM_NEVER)
diff --git a/fs/locks.c b/fs/locks.c
index 76ad05f8070a..46d88b9e222c 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -167,8 +167,8 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);
*/
static DEFINE_SPINLOCK(blocked_lock_lock);
-static struct kmem_cache *flctx_cache __read_mostly;
-static struct kmem_cache *filelock_cache __read_mostly;
+static struct kmem_cache *flctx_cache __ro_after_init;
+static struct kmem_cache *filelock_cache __ro_after_init;
static struct file_lock_context *
locks_get_lock_context(struct inode *inode, int type)
@@ -2264,11 +2264,13 @@ out:
* To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
* locks, the ->lock() interface may return asynchronously, before the lock has
* been granted or denied by the underlying filesystem, if (and only if)
- * lm_grant is set. Callers expecting ->lock() to return asynchronously
- * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if)
- * the request is for a blocking lock. When ->lock() does return asynchronously,
- * it must return FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock
- * request completes.
+ * lm_grant is set. Additionally, the EXPORT_OP_ASYNC_LOCK flag must be set
+ * in the filesystem's export_operations.
+ *
+ * Callers expecting ->lock() to return asynchronously will only use F_SETLK,
+ * not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a
+ * blocking lock. When ->lock() does return asynchronously, it must return
+ * FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock request completes.
* If the request is for non-blocking lock the file system should return
* FILE_LOCK_DEFERRED then try to get the lock and call the callback routine
* with the result. If the request timed out the callback routine will return a
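As the rewritten comment states, asynchronous ->lock() now requires an explicit opt-in from the exporting filesystem. A hedged sketch of what that looks like on the filesystem side (all example_* names are hypothetical):

static const struct export_operations example_export_ops = {
	.fh_to_dentry	= example_fh_to_dentry,
	.fh_to_parent	= example_fh_to_parent,
	/* advertise that ->lock() may return FILE_LOCK_DEFERRED and call lm_grant() */
	.flags		= EXPORT_OP_ASYNC_LOCK,
};

Without the flag, lockd treats the lock as synchronous even when the file operations provide ->lock().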
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 2a4b8b549e93..82aa7a35db26 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -37,7 +37,7 @@ struct mb_cache {
struct list_head c_list;
/* Number of entries in cache */
unsigned long c_entry_count;
- struct shrinker c_shrink;
+ struct shrinker *c_shrink;
/* Work for shrinking when the cache has too many entries */
struct work_struct c_shrink_work;
};
@@ -293,8 +293,7 @@ EXPORT_SYMBOL(mb_cache_entry_touch);
static unsigned long mb_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct mb_cache *cache = container_of(shrink, struct mb_cache,
- c_shrink);
+ struct mb_cache *cache = shrink->private_data;
return cache->c_entry_count;
}
@@ -333,8 +332,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
static unsigned long mb_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct mb_cache *cache = container_of(shrink, struct mb_cache,
- c_shrink);
+ struct mb_cache *cache = shrink->private_data;
return mb_cache_shrink(cache, sc->nr_to_scan);
}
@@ -377,15 +375,19 @@ struct mb_cache *mb_cache_create(int bucket_bits)
for (i = 0; i < bucket_count; i++)
INIT_HLIST_BL_HEAD(&cache->c_hash[i]);
- cache->c_shrink.count_objects = mb_cache_count;
- cache->c_shrink.scan_objects = mb_cache_scan;
- cache->c_shrink.seeks = DEFAULT_SEEKS;
- if (register_shrinker(&cache->c_shrink, "mbcache-shrinker")) {
+ cache->c_shrink = shrinker_alloc(0, "mbcache-shrinker");
+ if (!cache->c_shrink) {
kfree(cache->c_hash);
kfree(cache);
goto err_out;
}
+ cache->c_shrink->count_objects = mb_cache_count;
+ cache->c_shrink->scan_objects = mb_cache_scan;
+ cache->c_shrink->private_data = cache;
+
+ shrinker_register(cache->c_shrink);
+
INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker);
return cache;
@@ -406,7 +408,7 @@ void mb_cache_destroy(struct mb_cache *cache)
{
struct mb_cache_entry *entry, *next;
- unregister_shrinker(&cache->c_shrink);
+ shrinker_free(cache->c_shrink);
/*
* We don't bother with any locking. Cache must not be used at this
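The mbcache conversion above is one instance of the dynamically allocated shrinker API used throughout this series: shrinker_alloc() replaces an embedded struct shrinker, private_data replaces container_of(), and shrinker_free() covers both unregister and free. A minimal sketch of the pattern with hypothetical names:

struct my_cache {
	struct shrinker *shrink;
	/* ... cache state ... */
};

static int my_cache_shrinker_setup(struct my_cache *cache)
{
	cache->shrink = shrinker_alloc(0, "my-cache");
	if (!cache->shrink)
		return -ENOMEM;

	cache->shrink->count_objects = my_cache_count;	/* hypothetical callbacks */
	cache->shrink->scan_objects  = my_cache_scan;
	cache->shrink->private_data  = cache;		/* fetched back via shrink->private_data */

	shrinker_register(cache->shrink);		/* shrinker becomes visible to reclaim */
	return 0;
}

/* Teardown: shrinker_free(cache->shrink) unregisters and frees in one call. */

The nfs42xattr, nfs/super.c and nfsd/filecache hunks later in this diff follow the same alloc/register/free shape.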
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 25c08fbfcb9d..7da66ca184f4 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -251,7 +251,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode)
}
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_ino = j;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_blocks = 0;
memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u));
insert_inode_hash(inode);
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 20f23e6e58ad..62c313fc9a49 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -281,7 +281,7 @@ got_it:
de->inode = inode->i_ino;
}
dir_commit_chunk(page, pos, sbi->s_dirsize);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
err = minix_handle_dirsync(dir);
out_put:
@@ -313,7 +313,7 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
else
de->inode = 0;
dir_commit_chunk(page, pos, len);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
return minix_handle_dirsync(inode);
}
@@ -436,7 +436,7 @@ int minix_set_link(struct minix_dir_entry *de, struct page *page,
else
de->inode = inode->i_ino;
dir_commit_chunk(page, pos, sbi->s_dirsize);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
return minix_handle_dirsync(dir);
}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index df575473c1cc..f8af6c3ae336 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -501,7 +501,8 @@ static struct inode *V1_minix_iget(struct inode *inode)
i_gid_write(inode, raw_inode->i_gid);
set_nlink(inode, raw_inode->i_nlinks);
inode->i_size = raw_inode->i_size;
- inode->i_mtime = inode->i_atime = inode_set_ctime(inode, raw_inode->i_time, 0);
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime(inode, raw_inode->i_time, 0)));
inode->i_blocks = 0;
for (i = 0; i < 9; i++)
minix_inode->u.i1_data[i] = raw_inode->i_zone[i];
@@ -538,11 +539,9 @@ static struct inode *V2_minix_iget(struct inode *inode)
i_gid_write(inode, raw_inode->i_gid);
set_nlink(inode, raw_inode->i_nlinks);
inode->i_size = raw_inode->i_size;
- inode->i_mtime.tv_sec = raw_inode->i_mtime;
- inode->i_atime.tv_sec = raw_inode->i_atime;
+ inode_set_mtime(inode, raw_inode->i_mtime, 0);
+ inode_set_atime(inode, raw_inode->i_atime, 0);
inode_set_ctime(inode, raw_inode->i_ctime, 0);
- inode->i_mtime.tv_nsec = 0;
- inode->i_atime.tv_nsec = 0;
inode->i_blocks = 0;
for (i = 0; i < 10; i++)
minix_inode->u.i2_data[i] = raw_inode->i_zone[i];
@@ -589,7 +588,7 @@ static struct buffer_head * V1_minix_update_inode(struct inode * inode)
raw_inode->i_gid = fs_high2lowgid(i_gid_read(inode));
raw_inode->i_nlinks = inode->i_nlink;
raw_inode->i_size = inode->i_size;
- raw_inode->i_time = inode->i_mtime.tv_sec;
+ raw_inode->i_time = inode_get_mtime_sec(inode);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev);
else for (i = 0; i < 9; i++)
@@ -616,9 +615,9 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
raw_inode->i_gid = fs_high2lowgid(i_gid_read(inode));
raw_inode->i_nlinks = inode->i_nlink;
raw_inode->i_size = inode->i_size;
- raw_inode->i_mtime = inode->i_mtime.tv_sec;
- raw_inode->i_atime = inode->i_atime.tv_sec;
- raw_inode->i_ctime = inode_get_ctime(inode).tv_sec;
+ raw_inode->i_mtime = inode_get_mtime_sec(inode);
+ raw_inode->i_atime = inode_get_atime_sec(inode);
+ raw_inode->i_ctime = inode_get_ctime_sec(inode);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev);
else for (i = 0; i < 10; i++)
diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c
index ce18ae37c29d..dad131e30c05 100644
--- a/fs/minix/itree_common.c
+++ b/fs/minix/itree_common.c
@@ -350,7 +350,7 @@ do_indirects:
}
first_whole++;
}
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
}
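The minix hunks above are part of the tree-wide move to timestamp accessor helpers instead of touching i_mtime/i_atime directly. A short sketch of the idioms they rely on (raw_secs is a placeholder for an on-disk field):

static void example_stamp_inode(struct inode *inode, time64_t raw_secs)
{
	/* loading from disk: seconds only, nanoseconds forced to zero */
	inode_set_mtime(inode, raw_secs, 0);
	inode_set_atime(inode, raw_secs, 0);
	inode_set_ctime(inode, raw_secs, 0);

	/* directory updates: mtime follows the freshly set ctime, as in dir.c */
	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));

	/* writing back: read just the seconds part */
	pr_debug("mtime=%lld\n", (long long)inode_get_mtime_sec(inode));
}

Brand-new inodes use simple_inode_init_ts(inode), which sets atime, mtime and ctime to the current time in one call, as in minix_new_inode() above.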
diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 4905665c47d0..57d1dedf3f8f 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -256,6 +256,7 @@ struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap)
return idmap;
}
+EXPORT_SYMBOL_GPL(mnt_idmap_get);
/**
* mnt_idmap_put - put a reference to an idmapping
@@ -271,3 +272,4 @@ void mnt_idmap_put(struct mnt_idmap *idmap)
kfree(idmap);
}
}
+EXPORT_SYMBOL_GPL(mnt_idmap_put);
diff --git a/fs/mpage.c b/fs/mpage.c
index 242e213ee064..ffb064ed9d04 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -119,8 +119,7 @@ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh,
folio_mark_uptodate(folio);
return;
}
- create_empty_buffers(&folio->page, i_blocksize(inode), 0);
- head = folio_buffers(folio);
+ head = create_empty_buffers(folio, i_blocksize(inode), 0);
}
page_bh = head;
diff --git a/fs/namei.c b/fs/namei.c
index 94565bd7e73f..71c13b2990b4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3105,25 +3105,6 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
EXPORT_SYMBOL(unlock_rename);
/**
- * mode_strip_umask - handle vfs umask stripping
- * @dir: parent directory of the new inode
- * @mode: mode of the new inode to be created in @dir
- *
- * Umask stripping depends on whether or not the filesystem supports POSIX
- * ACLs. If the filesystem doesn't support it umask stripping is done directly
- * in here. If the filesystem does support POSIX ACLs umask stripping is
- * deferred until the filesystem calls posix_acl_create().
- *
- * Returns: mode
- */
-static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode)
-{
- if (!IS_POSIXACL(dir))
- mode &= ~current_umask();
- return mode;
-}
-
-/**
* vfs_prepare_mode - prepare the mode to be used for a new inode
* @idmap: idmap of the mount the inode was found from
* @dir: parent directory of the new inode
@@ -3536,7 +3517,8 @@ static const char *open_last_lookups(struct nameidata *nd,
if (likely(dentry))
goto finish_lookup;
- BUG_ON(nd->flags & LOOKUP_RCU);
+ if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
+ return ERR_PTR(-ECHILD);
} else {
/* create side of things */
if (nd->flags & LOOKUP_RCU) {
@@ -3803,7 +3785,10 @@ static struct file *path_openat(struct nameidata *nd,
WARN_ON(1);
error = -EINVAL;
}
- fput(file);
+ if (unlikely(file->f_mode & FMODE_OPENED))
+ fput(file);
+ else
+ release_empty_file(file);
if (error == -EOPENSTALE) {
if (flags & LOOKUP_RCU)
error = -ECHILD;
@@ -4387,11 +4372,9 @@ retry_deleg:
if (!IS_ERR(dentry)) {
/* Why not before? Because we want correct error value */
- if (last.name[last.len])
+ if (last.name[last.len] || d_is_negative(dentry))
goto slashes;
inode = dentry->d_inode;
- if (d_is_negative(dentry))
- goto slashes;
ihold(inode);
error = security_path_unlink(&path, dentry);
if (error)
diff --git a/fs/namespace.c b/fs/namespace.c
index e157efc54023..fbf0e596fcd3 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -39,10 +39,10 @@
/* Maximum number of mounts in a mount namespace */
static unsigned int sysctl_mount_max __read_mostly = 100000;
-static unsigned int m_hash_mask __read_mostly;
-static unsigned int m_hash_shift __read_mostly;
-static unsigned int mp_hash_mask __read_mostly;
-static unsigned int mp_hash_shift __read_mostly;
+static unsigned int m_hash_mask __ro_after_init;
+static unsigned int m_hash_shift __ro_after_init;
+static unsigned int mp_hash_mask __ro_after_init;
+static unsigned int mp_hash_shift __ro_after_init;
static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
@@ -68,9 +68,9 @@ static u64 event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
-static struct hlist_head *mount_hashtable __read_mostly;
-static struct hlist_head *mountpoint_hashtable __read_mostly;
-static struct kmem_cache *mnt_cache __read_mostly;
+static struct hlist_head *mount_hashtable __ro_after_init;
+static struct hlist_head *mountpoint_hashtable __ro_after_init;
+static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
@@ -86,7 +86,7 @@ struct mount_kattr {
};
/* /sys/fs */
-struct kobject *fs_kobj;
+struct kobject *fs_kobj __ro_after_init;
EXPORT_SYMBOL_GPL(fs_kobj);
/*
@@ -330,16 +330,16 @@ static int mnt_is_readonly(struct vfsmount *mnt)
* can determine when writes are able to occur to a filesystem.
*/
/**
- * __mnt_want_write - get write access to a mount without freeze protection
+ * mnt_get_write_access - get write access to a mount without freeze protection
* @m: the mount on which to take a write
*
* This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mnt is read-write) before
* returning success. This operation does not protect against filesystem being
- * frozen. When the write operation is finished, __mnt_drop_write() must be
+ * frozen. When the write operation is finished, mnt_put_write_access() must be
* called. This is effectively a refcount.
*/
-int __mnt_want_write(struct vfsmount *m)
+int mnt_get_write_access(struct vfsmount *m)
{
struct mount *mnt = real_mount(m);
int ret = 0;
@@ -386,6 +386,7 @@ int __mnt_want_write(struct vfsmount *m)
return ret;
}
+EXPORT_SYMBOL_GPL(mnt_get_write_access);
/**
* mnt_want_write - get write access to a mount
@@ -401,7 +402,7 @@ int mnt_want_write(struct vfsmount *m)
int ret;
sb_start_write(m->mnt_sb);
- ret = __mnt_want_write(m);
+ ret = mnt_get_write_access(m);
if (ret)
sb_end_write(m->mnt_sb);
return ret;
@@ -409,15 +410,15 @@ int mnt_want_write(struct vfsmount *m)
EXPORT_SYMBOL_GPL(mnt_want_write);
/**
- * __mnt_want_write_file - get write access to a file's mount
+ * mnt_get_write_access_file - get write access to a file's mount
 * @file: the file on whose mount to take a write
*
- * This is like __mnt_want_write, but if the file is already open for writing it
+ * This is like mnt_get_write_access, but if @file is already open for write it
* skips incrementing mnt_writers (since the open file already has a reference)
* and instead only does the check for emergency r/o remounts. This must be
- * paired with __mnt_drop_write_file.
+ * paired with mnt_put_write_access_file.
*/
-int __mnt_want_write_file(struct file *file)
+int mnt_get_write_access_file(struct file *file)
{
if (file->f_mode & FMODE_WRITER) {
/*
@@ -428,7 +429,7 @@ int __mnt_want_write_file(struct file *file)
return -EROFS;
return 0;
}
- return __mnt_want_write(file->f_path.mnt);
+ return mnt_get_write_access(file->f_path.mnt);
}
/**
@@ -445,7 +446,7 @@ int mnt_want_write_file(struct file *file)
int ret;
sb_start_write(file_inode(file)->i_sb);
- ret = __mnt_want_write_file(file);
+ ret = mnt_get_write_access_file(file);
if (ret)
sb_end_write(file_inode(file)->i_sb);
return ret;
@@ -453,19 +454,20 @@ int mnt_want_write_file(struct file *file)
EXPORT_SYMBOL_GPL(mnt_want_write_file);
/**
- * __mnt_drop_write - give up write access to a mount
+ * mnt_put_write_access - give up write access to a mount
* @mnt: the mount on which to give up write access
*
* Tells the low-level filesystem that we are done
* performing writes to it. Must be matched with
- * __mnt_want_write() call above.
+ * mnt_get_write_access() call above.
*/
-void __mnt_drop_write(struct vfsmount *mnt)
+void mnt_put_write_access(struct vfsmount *mnt)
{
preempt_disable();
mnt_dec_writers(real_mount(mnt));
preempt_enable();
}
+EXPORT_SYMBOL_GPL(mnt_put_write_access);
/**
* mnt_drop_write - give up write access to a mount
@@ -477,20 +479,20 @@ void __mnt_drop_write(struct vfsmount *mnt)
*/
void mnt_drop_write(struct vfsmount *mnt)
{
- __mnt_drop_write(mnt);
+ mnt_put_write_access(mnt);
sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
-void __mnt_drop_write_file(struct file *file)
+void mnt_put_write_access_file(struct file *file)
{
if (!(file->f_mode & FMODE_WRITER))
- __mnt_drop_write(file->f_path.mnt);
+ mnt_put_write_access(file->f_path.mnt);
}
void mnt_drop_write_file(struct file *file)
{
- __mnt_drop_write_file(file);
+ mnt_put_write_access_file(file);
sb_end_write(file_inode(file)->i_sb);
}
EXPORT_SYMBOL(mnt_drop_write_file);
@@ -1344,9 +1346,9 @@ void mntput(struct vfsmount *mnt)
{
if (mnt) {
struct mount *m = real_mount(mnt);
- /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
+ /* avoid cacheline pingpong */
if (unlikely(m->mnt_expiry_mark))
- m->mnt_expiry_mark = 0;
+ WRITE_ONCE(m->mnt_expiry_mark, 0);
mntput_no_expire(m);
}
}
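The __mnt_want_write()/__mnt_drop_write() renames above keep the old contract: the mnt_get_write_access()/mnt_put_write_access() pair is freeze-unaware and must always be balanced, while mnt_want_write()/mnt_drop_write() additionally wrap sb_start_write()/sb_end_write(). A small usage sketch with a hypothetical caller:

static int example_write_to_mount(struct vfsmount *mnt)
{
	int err;

	err = mnt_get_write_access(mnt);	/* no freeze protection */
	if (err)
		return err;

	/* ... perform the modification ... */

	mnt_put_write_access(mnt);		/* must balance the call above */
	return 0;
}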
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 7df2503cef6c..01ac733a6320 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -125,7 +125,7 @@ config PNFS_BLOCK
config PNFS_FLEXFILE_LAYOUT
tristate
- depends on NFS_V4_1 && NFS_V3
+ depends on NFS_V4_1
default NFS_V4
config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 716bc75e9ed2..b4294a8aa2d4 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -108,7 +108,7 @@ struct pnfs_block_dev {
struct pnfs_block_dev *children;
u64 chunk_size;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
u64 disk_offset;
u64 pr_key;
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index 65cbb5607a5f..f318a05a80e1 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -25,17 +25,17 @@ bl_free_device(struct pnfs_block_dev *dev)
} else {
if (dev->pr_registered) {
const struct pr_ops *ops =
- dev->bdev->bd_disk->fops->pr_ops;
+ dev->bdev_handle->bdev->bd_disk->fops->pr_ops;
int error;
- error = ops->pr_register(dev->bdev, dev->pr_key, 0,
- false);
+ error = ops->pr_register(dev->bdev_handle->bdev,
+ dev->pr_key, 0, false);
if (error)
pr_err("failed to unregister PR key.\n");
}
- if (dev->bdev)
- blkdev_put(dev->bdev, NULL);
+ if (dev->bdev_handle)
+ bdev_release(dev->bdev_handle);
}
}
@@ -169,7 +169,7 @@ static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
map->start = dev->start;
map->len = dev->len;
map->disk_offset = dev->disk_offset;
- map->bdev = dev->bdev;
+ map->bdev = dev->bdev_handle->bdev;
return true;
}
@@ -236,28 +236,26 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
dev_t dev;
dev = bl_resolve_deviceid(server, v, gfp_mask);
if (!dev)
return -EIO;
- bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL,
- NULL);
- if (IS_ERR(bdev)) {
+ bdev_handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ NULL, NULL);
+ if (IS_ERR(bdev_handle)) {
printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
- MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
- return PTR_ERR(bdev);
+ MAJOR(dev), MINOR(dev), PTR_ERR(bdev_handle));
+ return PTR_ERR(bdev_handle);
}
- d->bdev = bdev;
-
-
- d->len = bdev_nr_bytes(d->bdev);
+ d->bdev_handle = bdev_handle;
+ d->len = bdev_nr_bytes(bdev_handle->bdev);
d->map = bl_map_simple;
printk(KERN_INFO "pNFS: using block device %s\n",
- d->bdev->bd_disk->disk_name);
+ bdev_handle->bdev->bd_disk->disk_name);
return 0;
}
@@ -302,10 +300,10 @@ bl_validate_designator(struct pnfs_block_volume *v)
}
}
-static struct block_device *
+static struct bdev_handle *
bl_open_path(struct pnfs_block_volume *v, const char *prefix)
{
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
const char *devname;
devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN",
@@ -313,15 +311,15 @@ bl_open_path(struct pnfs_block_volume *v, const char *prefix)
if (!devname)
return ERR_PTR(-ENOMEM);
- bdev = blkdev_get_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL,
- NULL);
- if (IS_ERR(bdev)) {
+ bdev_handle = bdev_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ NULL, NULL);
+ if (IS_ERR(bdev_handle)) {
pr_warn("pNFS: failed to open device %s (%ld)\n",
- devname, PTR_ERR(bdev));
+ devname, PTR_ERR(bdev_handle));
}
kfree(devname);
- return bdev;
+ return bdev_handle;
}
static int
@@ -329,7 +327,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
const struct pr_ops *ops;
int error;
@@ -342,32 +340,32 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
* On other distributions like Debian, the default SCSI by-id path will
* point to the dm-multipath device if one exists.
*/
- bdev = bl_open_path(v, "dm-uuid-mpath-0x");
- if (IS_ERR(bdev))
- bdev = bl_open_path(v, "wwn-0x");
- if (IS_ERR(bdev))
- return PTR_ERR(bdev);
- d->bdev = bdev;
-
- d->len = bdev_nr_bytes(d->bdev);
+ bdev_handle = bl_open_path(v, "dm-uuid-mpath-0x");
+ if (IS_ERR(bdev_handle))
+ bdev_handle = bl_open_path(v, "wwn-0x");
+ if (IS_ERR(bdev_handle))
+ return PTR_ERR(bdev_handle);
+ d->bdev_handle = bdev_handle;
+
+ d->len = bdev_nr_bytes(d->bdev_handle->bdev);
d->map = bl_map_simple;
d->pr_key = v->scsi.pr_key;
pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
- d->bdev->bd_disk->disk_name, d->pr_key);
+ d->bdev_handle->bdev->bd_disk->disk_name, d->pr_key);
- ops = d->bdev->bd_disk->fops->pr_ops;
+ ops = d->bdev_handle->bdev->bd_disk->fops->pr_ops;
if (!ops) {
pr_err("pNFS: block device %s does not support reservations.",
- d->bdev->bd_disk->disk_name);
+ d->bdev_handle->bdev->bd_disk->disk_name);
error = -EINVAL;
goto out_blkdev_put;
}
- error = ops->pr_register(d->bdev, 0, d->pr_key, true);
+ error = ops->pr_register(d->bdev_handle->bdev, 0, d->pr_key, true);
if (error) {
pr_err("pNFS: failed to register key for block device %s.",
- d->bdev->bd_disk->disk_name);
+ d->bdev_handle->bdev->bd_disk->disk_name);
goto out_blkdev_put;
}
@@ -375,7 +373,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
return 0;
out_blkdev_put:
- blkdev_put(d->bdev, NULL);
+ bdev_release(d->bdev_handle);
return error;
}
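The blocklayout changes above move from blkdev_get_by_dev()/blkdev_put() to the handle-based block device API. A hedged sketch of the open/use/release pattern (example_open_bdev is hypothetical):

static int example_open_bdev(dev_t dev, struct bdev_handle **out)
{
	struct bdev_handle *handle;

	handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
				  NULL, NULL);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* the underlying block_device is reached through handle->bdev */
	pr_info("opened %s\n", handle->bdev->bd_disk->disk_name);

	*out = handle;
	return 0;
}

/* Teardown becomes bdev_release(handle) rather than blkdev_put(bdev, holder). */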
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 466ebf1d41b2..4ffa1f469e90 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -78,7 +78,7 @@ nfs4_callback_svc(void *vrqstp)
set_freezable();
- while (!kthread_freezable_should_stop(NULL))
+ while (!svc_thread_should_stop(rqstp))
svc_recv(rqstp);
svc_exit_thread(rqstp);
@@ -86,45 +86,6 @@ nfs4_callback_svc(void *vrqstp)
}
#if defined(CONFIG_NFS_V4_1)
-/*
- * The callback service for NFSv4.1 callbacks
- */
-static int
-nfs41_callback_svc(void *vrqstp)
-{
- struct svc_rqst *rqstp = vrqstp;
- struct svc_serv *serv = rqstp->rq_server;
- struct rpc_rqst *req;
- int error;
- DEFINE_WAIT(wq);
-
- set_freezable();
-
- while (!kthread_freezable_should_stop(NULL)) {
- prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_IDLE);
- spin_lock_bh(&serv->sv_cb_lock);
- if (!list_empty(&serv->sv_cb_list)) {
- req = list_first_entry(&serv->sv_cb_list,
- struct rpc_rqst, rq_bc_list);
- list_del(&req->rq_bc_list);
- spin_unlock_bh(&serv->sv_cb_lock);
- finish_wait(&serv->sv_cb_waitq, &wq);
- dprintk("Invoking bc_svc_process()\n");
- error = bc_svc_process(serv, req, rqstp);
- dprintk("bc_svc_process() returned w/ error code= %d\n",
- error);
- } else {
- spin_unlock_bh(&serv->sv_cb_lock);
- if (!kthread_should_stop())
- schedule();
- finish_wait(&serv->sv_cb_waitq, &wq);
- }
- }
-
- svc_exit_thread(rqstp);
- return 0;
-}
-
static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
struct svc_serv *serv)
{
@@ -237,10 +198,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
cb_info->users);
threadfn = nfs4_callback_svc;
-#if defined(CONFIG_NFS_V4_1)
- if (minorversion)
- threadfn = nfs41_callback_svc;
-#else
+#if !defined(CONFIG_NFS_V4_1)
if (minorversion)
return ERR_PTR(-ENOTSUPP);
#endif
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 6bed1394d748..96a4923080ae 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -60,7 +60,7 @@ __be32 nfs4_callback_getattr(void *argp, void *resp,
if (nfs_have_writebacks(inode))
res->change_attr++;
res->ctime = inode_get_ctime(inode);
- res->mtime = inode->i_mtime;
+ res->mtime = inode_get_mtime(inode);
res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) &
args->bitmap[0];
res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) &
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index cf7365581031..fa1a14def45c 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -448,6 +448,7 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
delegation->cred = get_cred(cred);
delegation->inode = inode;
delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
+ delegation->test_gen = 0;
spin_lock_init(&delegation->lock);
spin_lock(&clp->cl_lock);
@@ -1294,6 +1295,8 @@ static int nfs_server_reap_expired_delegations(struct nfs_server *server,
struct inode *inode;
const struct cred *cred;
nfs4_stateid stateid;
+ unsigned long gen = ++server->delegation_gen;
+
restart:
rcu_read_lock();
restart_locked:
@@ -1303,7 +1306,8 @@ restart_locked:
test_bit(NFS_DELEGATION_RETURNING,
&delegation->flags) ||
test_bit(NFS_DELEGATION_TEST_EXPIRED,
- &delegation->flags) == 0)
+ &delegation->flags) == 0 ||
+ delegation->test_gen == gen)
continue;
inode = nfs_delegation_grab_inode(delegation);
if (inode == NULL)
@@ -1312,6 +1316,7 @@ restart_locked:
cred = get_cred_rcu(delegation->cred);
nfs4_stateid_copy(&stateid, &delegation->stateid);
spin_unlock(&delegation->lock);
+ delegation->test_gen = gen;
clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
rcu_read_unlock();
nfs_delegation_test_free_expired(inode, &stateid, cred);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 1c378992b7c0..a6f495d012cf 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -21,6 +21,7 @@ struct nfs_delegation {
fmode_t type;
unsigned long pagemod_limit;
__u64 change_attr;
+ unsigned long test_gen;
unsigned long flags;
refcount_t refcount;
spinlock_t lock;
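The test_gen field added above gives each reaping pass a generation number: nfs_server_reap_expired_delegations() bumps server->delegation_gen once per pass and skips any delegation already stamped with that value, so restarts of the walk cannot re-test the same delegation. A generic sketch of the pattern (all example_* names are hypothetical):

struct example_item {
	struct list_head list;
	unsigned long test_gen;		/* generation of the pass that last visited it */
};

static void example_reap_pass(struct list_head *items, unsigned long *pass_gen)
{
	unsigned long gen = ++(*pass_gen);
	struct example_item *it;

	list_for_each_entry(it, items, list) {
		if (it->test_gen == gen)
			continue;	/* already handled during this pass */
		it->test_gen = gen;
		/* ... expensive per-item test or recovery work ... */
	}
}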
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e6a51fd94fea..13dffe4201e6 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2532,7 +2532,7 @@ EXPORT_SYMBOL_GPL(nfs_unlink);
int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *symname)
{
- struct page *page;
+ struct folio *folio;
char *kaddr;
struct iattr attr;
unsigned int pathlen = strlen(symname);
@@ -2547,24 +2547,24 @@ int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
attr.ia_mode = S_IFLNK | S_IRWXUGO;
attr.ia_valid = ATTR_MODE;
- page = alloc_page(GFP_USER);
- if (!page)
+ folio = folio_alloc(GFP_USER, 0);
+ if (!folio)
return -ENOMEM;
- kaddr = page_address(page);
+ kaddr = folio_address(folio);
memcpy(kaddr, symname, pathlen);
if (pathlen < PAGE_SIZE)
memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
trace_nfs_symlink_enter(dir, dentry);
- error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
+ error = NFS_PROTO(dir)->symlink(dir, dentry, folio, pathlen, &attr);
trace_nfs_symlink_exit(dir, dentry, error);
if (error != 0) {
dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n",
dir->i_sb->s_id, dir->i_ino,
dentry, symname, error);
d_drop(dentry);
- __free_page(page);
+ folio_put(folio);
return error;
}
@@ -2574,18 +2574,13 @@ int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
* No big deal if we can't add this page to the page cache here.
* READLINK will get the missing page from the server if needed.
*/
- if (!add_to_page_cache_lru(page, d_inode(dentry)->i_mapping, 0,
- GFP_KERNEL)) {
- SetPageUptodate(page);
- unlock_page(page);
- /*
- * add_to_page_cache_lru() grabs an extra page refcount.
- * Drop it here to avoid leaking this page later.
- */
- put_page(page);
- } else
- __free_page(page);
+ if (filemap_add_folio(d_inode(dentry)->i_mapping, folio, 0,
+ GFP_KERNEL) == 0) {
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ }
+ folio_put(folio);
return 0;
}
EXPORT_SYMBOL_GPL(nfs_symlink);
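nfs_symlink() above now handles the symlink body as a folio from allocation to page-cache insertion, and always drops its own reference at the end instead of special-casing the add_to_page_cache_lru() refcount. A condensed sketch of that idiom (example_cache_symlink is hypothetical and skips the zero-padding done in the real code):

static int example_cache_symlink(struct address_space *mapping, const char *symname)
{
	struct folio *folio = folio_alloc(GFP_USER, 0);		/* order-0 folio */

	if (!folio)
		return -ENOMEM;

	memcpy(folio_address(folio), symname, strlen(symname) + 1);

	/* best effort: on failure READLINK simply refetches from the server */
	if (filemap_add_folio(mapping, folio, 0, GFP_KERNEL) == 0) {
		folio_mark_uptodate(folio);
		folio_unlock(folio);
	}
	folio_put(folio);	/* drop the allocation reference either way */
	return 0;
}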
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index aed0748fd6ec..c7bb5da93307 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -51,7 +51,7 @@ struct nfs4_file_layout_dsaddr {
u32 stripe_count;
u8 *stripe_indices;
u32 ds_num;
- struct nfs4_pnfs_ds *ds_list[];
+ struct nfs4_pnfs_ds *ds_list[] __counted_by(ds_num);
};
struct nfs4_filelayout_segment {
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 354a031c69b1..f84b3fb0dddd 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -99,7 +99,7 @@ struct nfs4_ff_layout_segment {
u64 stripe_unit;
u32 flags;
u32 mirror_array_cnt;
- struct nfs4_ff_layout_mirror *mirror_array[];
+ struct nfs4_ff_layout_mirror *mirror_array[] __counted_by(mirror_array_cnt);
};
struct nfs4_flexfile_layout {
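The __counted_by() annotations above tell the compiler, and the kernel's array bounds checking, which member holds the element count of the flexible array. A minimal sketch of declaring and allocating such a structure (hypothetical names):

struct example_mirror;

struct example_mirrors {
	u32 mirror_count;
	struct example_mirror *mirrors[] __counted_by(mirror_count);
};

static struct example_mirrors *example_alloc_mirrors(u32 n)
{
	struct example_mirrors *m;

	m = kzalloc(struct_size(m, mirrors, n), GFP_KERNEL);
	if (m)
		m->mirror_count = n;	/* set the counter before indexing mirrors[] */
	return m;
}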
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 2dc64454492b..5407ab8c8783 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -114,8 +114,8 @@ static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *
struct inode *inode)
{
memset(auxdata, 0, sizeof(*auxdata));
- auxdata->mtime_sec = inode->i_mtime.tv_sec;
- auxdata->mtime_nsec = inode->i_mtime.tv_nsec;
+ auxdata->mtime_sec = inode_get_mtime(inode).tv_sec;
+ auxdata->mtime_nsec = inode_get_mtime(inode).tv_nsec;
auxdata->ctime_sec = inode_get_ctime(inode).tv_sec;
auxdata->ctime_nsec = inode_get_ctime(inode).tv_nsec;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e21c073158e5..ebb8d60e1152 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -512,8 +512,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
} else
init_special_inode(inode, inode->i_mode, fattr->rdev);
- memset(&inode->i_atime, 0, sizeof(inode->i_atime));
- memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
+ inode_set_atime(inode, 0, 0);
+ inode_set_mtime(inode, 0, 0);
inode_set_ctime(inode, 0, 0);
inode_set_iversion_raw(inode, 0);
inode->i_size = 0;
@@ -527,11 +527,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
nfsi->read_cache_jiffies = fattr->time_start;
nfsi->attr_gencount = fattr->gencount;
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
- inode->i_atime = fattr->atime;
+ inode_set_atime_to_ts(inode, fattr->atime);
else if (fattr_supported & NFS_ATTR_FATTR_ATIME)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
if (fattr->valid & NFS_ATTR_FATTR_MTIME)
- inode->i_mtime = fattr->mtime;
+ inode_set_mtime_to_ts(inode, fattr->mtime);
else if (fattr_supported & NFS_ATTR_FATTR_MTIME)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
if (fattr->valid & NFS_ATTR_FATTR_CTIME)
@@ -742,9 +742,9 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_ATIME
| NFS_INO_INVALID_CTIME);
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
- inode->i_atime = fattr->atime;
+ inode_set_atime_to_ts(inode, fattr->atime);
else if (attr->ia_valid & ATTR_ATIME_SET)
- inode->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(inode, attr->ia_atime);
else
nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
@@ -758,9 +758,9 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_MTIME
| NFS_INO_INVALID_CTIME);
if (fattr->valid & NFS_ATTR_FATTR_MTIME)
- inode->i_mtime = fattr->mtime;
+ inode_set_mtime_to_ts(inode, fattr->mtime);
else if (attr->ia_valid & ATTR_MTIME_SET)
- inode->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
else
nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
@@ -1451,11 +1451,11 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode_set_ctime_to_ts(inode, fattr->ctime);
}
- ts = inode->i_mtime;
+ ts = inode_get_mtime(inode);
if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
&& (fattr->valid & NFS_ATTR_FATTR_MTIME)
&& timespec64_equal(&ts, &fattr->pre_mtime)) {
- inode->i_mtime = fattr->mtime;
+ inode_set_mtime_to_ts(inode, fattr->mtime);
}
if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
&& (fattr->valid & NFS_ATTR_FATTR_SIZE)
@@ -1506,7 +1506,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr))
invalid |= NFS_INO_INVALID_CHANGE;
- ts = inode->i_mtime;
+ ts = inode_get_mtime(inode);
if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime))
invalid |= NFS_INO_INVALID_MTIME;
@@ -1534,7 +1534,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
invalid |= NFS_INO_INVALID_NLINK;
- ts = inode->i_atime;
+ ts = inode_get_atime(inode);
if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec64_equal(&ts, &fattr->atime))
invalid |= NFS_INO_INVALID_ATIME;
@@ -2002,7 +2002,7 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa
}
if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 &&
(fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) {
- fattr->pre_mtime = inode->i_mtime;
+ fattr->pre_mtime = inode_get_mtime(inode);
fattr->valid |= NFS_ATTR_FATTR_PREMTIME;
}
if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 &&
@@ -2184,7 +2184,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
}
if (fattr->valid & NFS_ATTR_FATTR_MTIME)
- inode->i_mtime = fattr->mtime;
+ inode_set_mtime_to_ts(inode, fattr->mtime);
else if (fattr_supported & NFS_ATTR_FATTR_MTIME)
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_MTIME;
@@ -2220,7 +2220,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
save_cache_validity & NFS_INO_INVALID_SIZE;
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
- inode->i_atime = fattr->atime;
+ inode_set_atime_to_ts(inode, fattr->atime);
else if (fattr_supported & NFS_ATTR_FATTR_ATIME)
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_ATIME;
diff --git a/fs/nfs/nfs.h b/fs/nfs/nfs.h
index 5ba00610aede..0d3ce0460e35 100644
--- a/fs/nfs/nfs.h
+++ b/fs/nfs/nfs.h
@@ -18,7 +18,7 @@ struct nfs_subversion {
const struct rpc_version *rpc_vers; /* NFS version information */
const struct nfs_rpc_ops *rpc_ops; /* NFS operations */
const struct super_operations *sops; /* NFS Super operations */
- const struct xattr_handler **xattr; /* NFS xattr handlers */
+ const struct xattr_handler * const *xattr; /* NFS xattr handlers */
struct list_head list; /* List of NFS versions */
};
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 4bf208a0a8e9..2de66e4e8280 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -543,9 +543,10 @@ out:
}
static int
-nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
+nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct folio *folio,
unsigned int len, struct iattr *sattr)
{
+ struct page *page = &folio->page;
struct nfs3_createdata *data;
struct dentry *d_alias;
int status = -ENOMEM;
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index 911f634ba3da..2ad66a8922f4 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -796,28 +796,9 @@ static unsigned long nfs4_xattr_cache_scan(struct shrinker *shrink,
static unsigned long nfs4_xattr_entry_scan(struct shrinker *shrink,
struct shrink_control *sc);
-static struct shrinker nfs4_xattr_cache_shrinker = {
- .count_objects = nfs4_xattr_cache_count,
- .scan_objects = nfs4_xattr_cache_scan,
- .seeks = DEFAULT_SEEKS,
- .flags = SHRINKER_MEMCG_AWARE,
-};
-
-static struct shrinker nfs4_xattr_entry_shrinker = {
- .count_objects = nfs4_xattr_entry_count,
- .scan_objects = nfs4_xattr_entry_scan,
- .seeks = DEFAULT_SEEKS,
- .batch = 512,
- .flags = SHRINKER_MEMCG_AWARE,
-};
-
-static struct shrinker nfs4_xattr_large_entry_shrinker = {
- .count_objects = nfs4_xattr_entry_count,
- .scan_objects = nfs4_xattr_entry_scan,
- .seeks = 1,
- .batch = 512,
- .flags = SHRINKER_MEMCG_AWARE,
-};
+static struct shrinker *nfs4_xattr_cache_shrinker;
+static struct shrinker *nfs4_xattr_entry_shrinker;
+static struct shrinker *nfs4_xattr_large_entry_shrinker;
static enum lru_status
cache_lru_isolate(struct list_head *item,
@@ -943,7 +924,7 @@ nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc)
struct nfs4_xattr_entry *entry;
struct list_lru *lru;
- lru = (shrink == &nfs4_xattr_large_entry_shrinker) ?
+ lru = (shrink == nfs4_xattr_large_entry_shrinker) ?
&nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
freed = list_lru_shrink_walk(lru, sc, entry_lru_isolate, &dispose);
@@ -971,7 +952,7 @@ nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc)
unsigned long count;
struct list_lru *lru;
- lru = (shrink == &nfs4_xattr_large_entry_shrinker) ?
+ lru = (shrink == nfs4_xattr_large_entry_shrinker) ?
&nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
count = list_lru_shrink_count(lru, sc);
@@ -991,18 +972,34 @@ static void nfs4_xattr_cache_init_once(void *p)
INIT_LIST_HEAD(&cache->dispose);
}
-static int nfs4_xattr_shrinker_init(struct shrinker *shrinker,
- struct list_lru *lru, const char *name)
+typedef unsigned long (*count_objects_cb)(struct shrinker *s,
+ struct shrink_control *sc);
+typedef unsigned long (*scan_objects_cb)(struct shrinker *s,
+ struct shrink_control *sc);
+
+static int __init nfs4_xattr_shrinker_init(struct shrinker **shrinker,
+ struct list_lru *lru, const char *name,
+ count_objects_cb count,
+ scan_objects_cb scan, long batch, int seeks)
{
- int ret = 0;
+ int ret;
- ret = register_shrinker(shrinker, name);
- if (ret)
+ *shrinker = shrinker_alloc(SHRINKER_MEMCG_AWARE, name);
+ if (!*shrinker)
+ return -ENOMEM;
+
+ ret = list_lru_init_memcg(lru, *shrinker);
+ if (ret) {
+ shrinker_free(*shrinker);
return ret;
+ }
- ret = list_lru_init_memcg(lru, shrinker);
- if (ret)
- unregister_shrinker(shrinker);
+ (*shrinker)->count_objects = count;
+ (*shrinker)->scan_objects = scan;
+ (*shrinker)->batch = batch;
+ (*shrinker)->seeks = seeks;
+
+ shrinker_register(*shrinker);
return ret;
}
@@ -1010,7 +1007,7 @@ static int nfs4_xattr_shrinker_init(struct shrinker *shrinker,
static void nfs4_xattr_shrinker_destroy(struct shrinker *shrinker,
struct list_lru *lru)
{
- unregister_shrinker(shrinker);
+ shrinker_free(shrinker);
list_lru_destroy(lru);
}
@@ -1026,27 +1023,31 @@ int __init nfs4_xattr_cache_init(void)
return -ENOMEM;
ret = nfs4_xattr_shrinker_init(&nfs4_xattr_cache_shrinker,
- &nfs4_xattr_cache_lru,
- "nfs-xattr_cache");
+ &nfs4_xattr_cache_lru, "nfs-xattr_cache",
+ nfs4_xattr_cache_count,
+ nfs4_xattr_cache_scan, 0, DEFAULT_SEEKS);
if (ret)
goto out1;
ret = nfs4_xattr_shrinker_init(&nfs4_xattr_entry_shrinker,
- &nfs4_xattr_entry_lru,
- "nfs-xattr_entry");
+ &nfs4_xattr_entry_lru, "nfs-xattr_entry",
+ nfs4_xattr_entry_count,
+ nfs4_xattr_entry_scan, 512, DEFAULT_SEEKS);
if (ret)
goto out2;
ret = nfs4_xattr_shrinker_init(&nfs4_xattr_large_entry_shrinker,
&nfs4_xattr_large_entry_lru,
- "nfs-xattr_large_entry");
+ "nfs-xattr_large_entry",
+ nfs4_xattr_entry_count,
+ nfs4_xattr_entry_scan, 512, 1);
if (!ret)
return 0;
- nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker,
+ nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker,
&nfs4_xattr_entry_lru);
out2:
- nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker,
+ nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker,
&nfs4_xattr_cache_lru);
out1:
kmem_cache_destroy(nfs4_xattr_cache_cachep);
@@ -1056,11 +1057,11 @@ out1:
void nfs4_xattr_cache_exit(void)
{
- nfs4_xattr_shrinker_destroy(&nfs4_xattr_large_entry_shrinker,
+ nfs4_xattr_shrinker_destroy(nfs4_xattr_large_entry_shrinker,
&nfs4_xattr_large_entry_lru);
- nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker,
+ nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker,
&nfs4_xattr_entry_lru);
- nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker,
+ nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker,
&nfs4_xattr_cache_lru);
kmem_cache_destroy(nfs4_xattr_cache_cachep);
}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 47c5c1f86d66..581698f1b7b2 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -209,6 +209,7 @@ struct nfs4_exception {
struct inode *inode;
nfs4_stateid *stateid;
long timeout;
+ unsigned short retrans;
unsigned char task_is_privileged : 1;
unsigned char delay : 1,
recovering : 1,
@@ -315,7 +316,7 @@ extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *,
struct nfs_fh *,
struct nfs_fattr *);
extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
-extern const struct xattr_handler *nfs4_xattr_handlers[];
+extern const struct xattr_handler * const nfs4_xattr_handlers[];
extern int nfs4_set_rw_stateid(nfs4_stateid *stateid,
const struct nfs_open_context *ctx,
const struct nfs_lock_context *l_ctx,
@@ -546,6 +547,7 @@ extern unsigned short max_session_slots;
extern unsigned short max_session_cb_slots;
extern unsigned short send_implementation_id;
extern bool recover_lost_locks;
+extern short nfs_delay_retrans;
#define NFS4_CLIENT_ID_UNIQ_LEN (64)
extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN];
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5ee283eb9660..8a943fffaad5 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -585,6 +585,21 @@ wait_on_recovery:
return 0;
}
+/*
+ * Track the number of NFS4ERR_DELAY related retransmissions and return
+ * EAGAIN if the 'softerr' mount option is set, and we've exceeded the limit
+ * set by 'nfs_delay_retrans'.
+ */
+static int nfs4_exception_should_retrans(const struct nfs_server *server,
+ struct nfs4_exception *exception)
+{
+ if (server->flags & NFS_MOUNT_SOFTERR && nfs_delay_retrans >= 0) {
+ if (exception->retrans++ >= (unsigned short)nfs_delay_retrans)
+ return -EAGAIN;
+ }
+ return 0;
+}
+
/* This is the error handling routine for processes that are allowed
* to sleep.
*/
@@ -595,6 +610,11 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
ret = nfs4_do_handle_exception(server, errorcode, exception);
if (exception->delay) {
+ int ret2 = nfs4_exception_should_retrans(server, exception);
+ if (ret2 < 0) {
+ exception->retry = 0;
+ return ret2;
+ }
ret = nfs4_delay(&exception->timeout,
exception->interruptible);
goto out_retry;
@@ -623,6 +643,11 @@ nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server,
ret = nfs4_do_handle_exception(server, errorcode, exception);
if (exception->delay) {
+ int ret2 = nfs4_exception_should_retrans(server, exception);
+ if (ret2 < 0) {
+ exception->retry = 0;
+ return ret2;
+ }
rpc_delay(task, nfs4_update_delay(&exception->timeout));
goto out_retry;
}
@@ -5011,9 +5036,10 @@ static void nfs4_free_createdata(struct nfs4_createdata *data)
}
static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
- struct page *page, unsigned int len, struct iattr *sattr,
+ struct folio *folio, unsigned int len, struct iattr *sattr,
struct nfs4_label *label)
{
+ struct page *page = &folio->page;
struct nfs4_createdata *data;
int status = -ENAMETOOLONG;
@@ -5038,7 +5064,7 @@ out:
}
static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
- struct page *page, unsigned int len, struct iattr *sattr)
+ struct folio *folio, unsigned int len, struct iattr *sattr)
{
struct nfs4_exception exception = {
.interruptible = true,
@@ -5049,7 +5075,7 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
label = nfs4_label_init_security(dir, dentry, sattr, &l);
do {
- err = _nfs4_proc_symlink(dir, dentry, page, len, sattr, label);
+ err = _nfs4_proc_symlink(dir, dentry, folio, len, sattr, label);
trace_nfs4_symlink(dir, &dentry->d_name, err);
err = nfs4_handle_exception(NFS_SERVER(dir), err,
&exception);
@@ -5622,7 +5648,7 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
- nfs4_state_protect_write(server->nfs_client, clnt, msg, hdr);
+ nfs4_state_protect_write(hdr->ds_clp ? hdr->ds_clp : server->nfs_client, clnt, msg, hdr);
}
static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
@@ -5663,7 +5689,8 @@ static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess
data->res.server = server;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 0);
- nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_COMMIT, clnt, msg);
+ nfs4_state_protect(data->ds_clp ? data->ds_clp : server->nfs_client,
+ NFS_SP4_MACH_CRED_COMMIT, clnt, msg);
}
static int _nfs4_proc_commit(struct file *dst, struct nfs_commitargs *args,
@@ -8934,6 +8961,7 @@ void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED);
+try_again:
/* Test connection for session trunking. Async exchange_id call */
task = nfs4_run_exchange_id(adata->clp, adata->cred, sp4_how, xprt);
if (IS_ERR(task))
@@ -8946,11 +8974,15 @@ void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
if (status == 0)
rpc_clnt_xprt_switch_add_xprt(clnt, xprt);
- else if (rpc_clnt_xprt_switch_has_addr(clnt,
+ else if (status != -NFS4ERR_DELAY && rpc_clnt_xprt_switch_has_addr(clnt,
(struct sockaddr *)&xprt->addr))
rpc_clnt_xprt_switch_remove_xprt(clnt, xprt);
rpc_put_task(task);
+ if (status == -NFS4ERR_DELAY) {
+ ssleep(1);
+ goto try_again;
+ }
}
EXPORT_SYMBOL_GPL(nfs4_test_session_trunk);
@@ -9621,6 +9653,9 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
nfs4_sequence_free_slot(&lgp->res.seq_res);
+ exception->state = NULL;
+ exception->stateid = NULL;
+
switch (nfs4err) {
case 0:
goto out;
@@ -9716,7 +9751,8 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = {
};
struct pnfs_layout_segment *
-nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
+nfs4_proc_layoutget(struct nfs4_layoutget *lgp,
+ struct nfs4_exception *exception)
{
struct inode *inode = lgp->args.inode;
struct nfs_server *server = NFS_SERVER(inode);
@@ -9736,13 +9772,10 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
RPC_TASK_MOVEABLE,
};
struct pnfs_layout_segment *lseg = NULL;
- struct nfs4_exception exception = {
- .inode = inode,
- .timeout = *timeout,
- };
int status = 0;
nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0);
+ exception->retry = 0;
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
@@ -9753,11 +9786,12 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
goto out;
if (task->tk_status < 0) {
- status = nfs4_layoutget_handle_exception(task, lgp, &exception);
- *timeout = exception.timeout;
+ exception->retry = 1;
+ status = nfs4_layoutget_handle_exception(task, lgp, exception);
} else if (lgp->res.layoutp->len == 0) {
+ exception->retry = 1;
status = -EAGAIN;
- *timeout = nfs4_update_delay(&exception.timeout);
+ nfs4_update_delay(&exception->timeout);
} else
lseg = pnfs_layout_process(lgp);
out:
@@ -10737,7 +10771,7 @@ static const struct xattr_handler nfs4_xattr_nfs4_user_handler = {
};
#endif
-const struct xattr_handler *nfs4_xattr_handlers[] = {
+const struct xattr_handler * const nfs4_xattr_handlers[] = {
&nfs4_xattr_nfs4_acl_handler,
#if defined(CONFIG_NFS_V4_1)
&nfs4_xattr_nfs4_dacl_handler,
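The nfs4proc.c hunks above bound NFS4ERR_DELAY retries when the 'softerr' mount option is active: each delayed retry increments exception->retrans, and once it reaches the nfs_delay_retrans module parameter (negative means no limit; the default is -1) the exception handler returns -EAGAIN instead of sleeping again. A generic sketch of that bounded-retry shape, not the NFS client code itself:

static int example_call_with_bounded_delay(int (*op)(void *), void *arg,
					   int max_delay_retries)
{
	int retrans = 0;
	int err;

	for (;;) {
		err = op(arg);
		if (err != -EAGAIN)	/* stand-in for NFS4ERR_DELAY */
			return err;
		if (max_delay_retries >= 0 && retrans++ >= max_delay_retries)
			return -EAGAIN;	/* give up: the caller sees a soft error */
		msleep(100);		/* stand-in for the nfs4_delay() backoff */
	}
}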
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 84343aefbbd6..21a365357629 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1980,7 +1980,9 @@ pnfs_update_layout(struct inode *ino,
struct pnfs_layout_segment *lseg = NULL;
struct nfs4_layoutget *lgp;
nfs4_stateid stateid;
- long timeout = 0;
+ struct nfs4_exception exception = {
+ .inode = ino,
+ };
unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
bool first;
@@ -2144,7 +2146,7 @@ lookup_again:
lgp->lo = lo;
pnfs_get_layout_hdr(lo);
- lseg = nfs4_proc_layoutget(lgp, &timeout);
+ lseg = nfs4_proc_layoutget(lgp, &exception);
trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
nfs_layoutget_end(lo);
@@ -2171,6 +2173,8 @@ lookup_again:
goto out_put_layout_hdr;
}
if (lseg) {
+ if (!exception.retry)
+ goto out_put_layout_hdr;
if (first)
pnfs_clear_first_layoutget(lo);
trace_pnfs_update_layout(ino, pos, count,
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index d886c8226d8f..db57a85500ee 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -35,6 +35,7 @@
#include <linux/nfs_page.h>
#include <linux/workqueue.h>
+struct nfs4_exception;
struct nfs4_opendata;
enum {
@@ -245,7 +246,9 @@ extern size_t max_response_pages(struct nfs_server *server);
extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *dev,
const struct cred *cred);
-extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout);
+extern struct pnfs_layout_segment *
+nfs4_proc_layoutget(struct nfs4_layoutget *lgp,
+ struct nfs4_exception *exception);
extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
/* pnfs.c */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index e3570c656b0f..ad3a321ae997 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -396,9 +396,10 @@ nfs_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name)
}
static int
-nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
+nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct folio *folio,
unsigned int len, struct iattr *sattr)
{
+ struct page *page = &folio->page;
struct nfs_fh *fh;
struct nfs_fattr *fattr;
struct nfs_symlinkargs arg = {
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0d6473cb00cb..075b31c93f87 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -129,11 +129,7 @@ static void nfs_ssc_unregister_ops(void)
}
#endif /* CONFIG_NFS_V4_2 */
-static struct shrinker acl_shrinker = {
- .count_objects = nfs_access_cache_count,
- .scan_objects = nfs_access_cache_scan,
- .seeks = DEFAULT_SEEKS,
-};
+static struct shrinker *acl_shrinker;
/*
* Register the NFS filesystems
@@ -153,9 +149,18 @@ int __init register_nfs_fs(void)
ret = nfs_register_sysctl();
if (ret < 0)
goto error_2;
- ret = register_shrinker(&acl_shrinker, "nfs-acl");
- if (ret < 0)
+
+ acl_shrinker = shrinker_alloc(0, "nfs-acl");
+ if (!acl_shrinker) {
+ ret = -ENOMEM;
goto error_3;
+ }
+
+ acl_shrinker->count_objects = nfs_access_cache_count;
+ acl_shrinker->scan_objects = nfs_access_cache_scan;
+
+ shrinker_register(acl_shrinker);
+
#ifdef CONFIG_NFS_V4_2
nfs_ssc_register_ops();
#endif
@@ -175,7 +180,7 @@ error_0:
*/
void __exit unregister_nfs_fs(void)
{
- unregister_shrinker(&acl_shrinker);
+ shrinker_free(acl_shrinker);
nfs_unregister_sysctl();
unregister_nfs4_fs();
#ifdef CONFIG_NFS_V4_2
@@ -1071,7 +1076,7 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx)
sb->s_export_op = &nfs_export_ops;
break;
case 4:
- sb->s_flags |= SB_POSIXACL;
+ sb->s_iflags |= SB_I_NOUMASK;
sb->s_time_gran = 1;
sb->s_time_min = S64_MIN;
sb->s_time_max = S64_MAX;
@@ -1366,6 +1371,7 @@ unsigned short max_session_cb_slots = NFS4_DEF_CB_SLOT_TABLE_SIZE;
unsigned short send_implementation_id = 1;
char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = "";
bool recover_lost_locks = false;
+short nfs_delay_retrans = -1;
EXPORT_SYMBOL_GPL(nfs_callback_nr_threads);
EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport);
@@ -1376,6 +1382,7 @@ EXPORT_SYMBOL_GPL(max_session_cb_slots);
EXPORT_SYMBOL_GPL(send_implementation_id);
EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier);
EXPORT_SYMBOL_GPL(recover_lost_locks);
+EXPORT_SYMBOL_GPL(nfs_delay_retrans);
#define NFS_CALLBACK_MAXPORTNR (65535U)
@@ -1424,5 +1431,9 @@ MODULE_PARM_DESC(recover_lost_locks,
"If the server reports that a lock might be lost, "
"try to recover it risking data corruption.");
-
+module_param_named(delay_retrans, nfs_delay_retrans, short, 0644);
+MODULE_PARM_DESC(delay_retrans,
+ "Unless negative, specifies the number of times the NFSv4 "
+ "client retries a request before returning an EAGAIN error, "
+ "after a reply of NFS4ERR_DELAY from the server.");
#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9d82d50ce0b1..b664caea8b4e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -739,6 +739,8 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
&pgio);
pgio.pg_error = 0;
nfs_pageio_complete(&pgio);
+ if (err == -EAGAIN && mntflags & NFS_MOUNT_SOFTERR)
+ break;
} while (err < 0 && !nfs_error_is_fatal(err));
nfs_io_completion_put(ioc);
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 6fffc8f03f74..b8736a82e57c 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -12,7 +12,8 @@ nfsd-y += trace.o
nfsd-y += nfssvc.o nfsctl.o nfsfh.o vfs.o \
export.o auth.o lockd.o nfscache.o \
- stats.o filecache.o nfs3proc.o nfs3xdr.o
+ stats.o filecache.o nfs3proc.o nfs3xdr.o \
+ netlink.o
nfsd-$(CONFIG_NFSD_V2) += nfsproc.o nfsxdr.o
nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index fdf2aad73470..e6beaaf4f170 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -26,8 +26,6 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
int i;
int flags = nfsexp_flags(rqstp, exp);
- validate_process_creds();
-
/* discard any old override before preparing the new set */
revert_creds(get_cred(current_real_cred()));
new = prepare_creds();
@@ -81,10 +79,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
else
new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
new->cap_permitted);
- validate_process_creds();
put_cred(override_creds(new));
put_cred(new);
- validate_process_creds();
return 0;
oom:
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 01d7fd108cf3..46fd74d91ea9 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -117,12 +117,13 @@ static __be32
nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp,
struct iomap *iomaps, int nr_iomaps)
{
+ struct timespec64 mtime = inode_get_mtime(inode);
loff_t new_size = lcp->lc_last_wr + 1;
struct iattr iattr = { .ia_valid = 0 };
int error;
if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
- timespec64_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
+ timespec64_compare(&lcp->lc_mtime, &mtime) < 0)
lcp->lc_mtime = current_time(inode);
iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME;
iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime;
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 1ed2f691ebb9..ce78f74715ee 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -16,9 +16,9 @@
__be32
nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
- struct nfsd4_layoutget *lgp)
+ const struct nfsd4_layoutget *lgp)
{
- struct pnfs_block_extent *b = lgp->lg_content;
+ const struct pnfs_block_extent *b = lgp->lg_content;
int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
__be32 *p;
@@ -77,7 +77,7 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
__be32
nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
- struct nfsd4_getdeviceinfo *gdp)
+ const struct nfsd4_getdeviceinfo *gdp)
{
struct pnfs_block_deviceaddr *dev = gdp->gd_device;
int len = sizeof(__be32), ret, i;
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index bc5166bfe46b..b0361e8aa9a7 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -51,9 +51,9 @@ struct pnfs_block_deviceaddr {
};
__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
- struct nfsd4_getdeviceinfo *gdp);
+ const struct nfsd4_getdeviceinfo *gdp);
__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
- struct nfsd4_layoutget *lgp);
+ const struct nfsd4_layoutget *lgp);
int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
u32 block_size);
int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 929248c6ca84..4cbe0434cbb8 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -84,8 +84,8 @@ int nfsd_net_reply_cache_init(struct nfsd_net *nn);
void nfsd_net_reply_cache_destroy(struct nfsd_net *nn);
int nfsd_reply_cache_init(struct nfsd_net *);
void nfsd_reply_cache_shutdown(struct nfsd_net *);
-int nfsd_cache_lookup(struct svc_rqst *rqstp,
- struct nfsd_cacherep **cacherep);
+int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
+ unsigned int len, struct nfsd_cacherep **cacherep);
void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp,
int cachetype, __be32 *statp);
int nfsd_reply_cache_stats_show(struct seq_file *m, void *v);
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 11a0eaa2f914..7b641095a665 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -339,12 +339,16 @@ static int export_stats_init(struct export_stats *stats)
static void export_stats_reset(struct export_stats *stats)
{
- nfsd_percpu_counters_reset(stats->counter, EXP_STATS_COUNTERS_NUM);
+ if (stats)
+ nfsd_percpu_counters_reset(stats->counter,
+ EXP_STATS_COUNTERS_NUM);
}
static void export_stats_destroy(struct export_stats *stats)
{
- nfsd_percpu_counters_destroy(stats->counter, EXP_STATS_COUNTERS_NUM);
+ if (stats)
+ nfsd_percpu_counters_destroy(stats->counter,
+ EXP_STATS_COUNTERS_NUM);
}
static void svc_export_put(struct kref *ref)
@@ -353,7 +357,8 @@ static void svc_export_put(struct kref *ref)
path_put(&exp->ex_path);
auth_domain_put(exp->ex_client);
nfsd4_fslocs_free(&exp->ex_fslocs);
- export_stats_destroy(&exp->ex_stats);
+ export_stats_destroy(exp->ex_stats);
+ kfree(exp->ex_stats);
kfree(exp->ex_uuid);
kfree_rcu(exp, ex_rcu);
}
@@ -421,8 +426,7 @@ static int check_export(struct path *path, int *flags, unsigned char *uuid)
return -EINVAL;
}
- if (!inode->i_sb->s_export_op ||
- !inode->i_sb->s_export_op->fh_to_dentry) {
+ if (!exportfs_can_decode_fh(inode->i_sb->s_export_op)) {
dprintk("exp_export: export of invalid fs type.\n");
return -EINVAL;
}
@@ -767,13 +771,15 @@ static int svc_export_show(struct seq_file *m,
seq_putc(m, '\t');
seq_escape(m, exp->ex_client->name, " \t\n\\");
if (export_stats) {
- seq_printf(m, "\t%lld\n", exp->ex_stats.start_time);
+ struct percpu_counter *counter = exp->ex_stats->counter;
+
+ seq_printf(m, "\t%lld\n", exp->ex_stats->start_time);
seq_printf(m, "\tfh_stale: %lld\n",
- percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_FH_STALE]));
+ percpu_counter_sum_positive(&counter[EXP_STATS_FH_STALE]));
seq_printf(m, "\tio_read: %lld\n",
- percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_READ]));
+ percpu_counter_sum_positive(&counter[EXP_STATS_IO_READ]));
seq_printf(m, "\tio_write: %lld\n",
- percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_WRITE]));
+ percpu_counter_sum_positive(&counter[EXP_STATS_IO_WRITE]));
seq_putc(m, '\n');
return 0;
}
@@ -819,7 +825,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
new->ex_layout_types = 0;
new->ex_uuid = NULL;
new->cd = item->cd;
- export_stats_reset(&new->ex_stats);
+ export_stats_reset(new->ex_stats);
}
static void export_update(struct cache_head *cnew, struct cache_head *citem)
@@ -856,7 +862,14 @@ static struct cache_head *svc_export_alloc(void)
if (!i)
return NULL;
- if (export_stats_init(&i->ex_stats)) {
+ i->ex_stats = kmalloc(sizeof(*(i->ex_stats)), GFP_KERNEL);
+ if (!i->ex_stats) {
+ kfree(i);
+ return NULL;
+ }
+
+ if (export_stats_init(i->ex_stats)) {
+ kfree(i->ex_stats);
kfree(i);
return NULL;
}
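
The export.c hunks above turn the per-export stats into a separately allocated object, make the reset/destroy helpers tolerate NULL, and pair the kmalloc() in svc_export_alloc() with a kfree() in svc_export_put(). A generic sketch of that allocate/init/teardown pairing, assuming only the standard percpu_counter and slab APIs; all demo_* names are illustrative:

#include <linux/percpu_counter.h>
#include <linux/slab.h>

struct demo_stats {
	struct percpu_counter counter;
};

static struct demo_stats *demo_stats_alloc(void)
{
	struct demo_stats *stats = kmalloc(sizeof(*stats), GFP_KERNEL);

	if (!stats)
		return NULL;
	if (percpu_counter_init(&stats->counter, 0, GFP_KERNEL)) {
		kfree(stats);
		return NULL;
	}
	return stats;
}

static void demo_stats_free(struct demo_stats *stats)
{
	if (!stats)		/* mirrors the NULL checks added above */
		return;
	percpu_counter_destroy(&stats->counter);
	kfree(stats);
}
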
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 2df8ae25aad3..ca9dc230ae3d 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -64,10 +64,10 @@ struct svc_export {
struct cache_head h;
struct auth_domain * ex_client;
int ex_flags;
+ int ex_fsid;
struct path ex_path;
kuid_t ex_anon_uid;
kgid_t ex_anon_gid;
- int ex_fsid;
unsigned char * ex_uuid; /* 16 byte fsid */
struct nfsd4_fs_locations ex_fslocs;
uint32_t ex_nflavors;
@@ -76,8 +76,8 @@ struct svc_export {
struct nfsd4_deviceid_map *ex_devid_map;
struct cache_detail *cd;
struct rcu_head ex_rcu;
- struct export_stats ex_stats;
unsigned long ex_xprtsec_modes;
+ struct export_stats *ex_stats;
};
/* an "export key" (expkey) maps a filehandlefragement to an
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index ee9c923192e0..ef063f93fde9 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -521,11 +521,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
return ret;
}
-static struct shrinker nfsd_file_shrinker = {
- .scan_objects = nfsd_file_lru_scan,
- .count_objects = nfsd_file_lru_count,
- .seeks = 1,
-};
+static struct shrinker *nfsd_file_shrinker;
/**
* nfsd_file_cond_queue - conditionally unhash and queue a nfsd_file
@@ -746,12 +742,19 @@ nfsd_file_cache_init(void)
goto out_err;
}
- ret = register_shrinker(&nfsd_file_shrinker, "nfsd-filecache");
- if (ret) {
- pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret);
+ nfsd_file_shrinker = shrinker_alloc(0, "nfsd-filecache");
+ if (!nfsd_file_shrinker) {
+ ret = -ENOMEM;
+ pr_err("nfsd: failed to allocate nfsd_file_shrinker\n");
goto out_lru;
}
+ nfsd_file_shrinker->count_objects = nfsd_file_lru_count;
+ nfsd_file_shrinker->scan_objects = nfsd_file_lru_scan;
+ nfsd_file_shrinker->seeks = 1;
+
+ shrinker_register(nfsd_file_shrinker);
+
ret = lease_register_notifier(&nfsd_file_lease_notifier);
if (ret) {
pr_err("nfsd: unable to register lease notifier: %d\n", ret);
@@ -774,7 +777,7 @@ out:
out_notifier:
lease_unregister_notifier(&nfsd_file_lease_notifier);
out_shrinker:
- unregister_shrinker(&nfsd_file_shrinker);
+ shrinker_free(nfsd_file_shrinker);
out_lru:
list_lru_destroy(&nfsd_file_lru);
out_err:
@@ -891,7 +894,7 @@ nfsd_file_cache_shutdown(void)
return;
lease_unregister_notifier(&nfsd_file_lease_notifier);
- unregister_shrinker(&nfsd_file_shrinker);
+ shrinker_free(nfsd_file_shrinker);
/*
* make sure all callers of nfsd_file_lru_cb are done before
* calling nfsd_file_cache_purge
@@ -989,22 +992,21 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
struct net *net = SVC_NET(rqstp);
struct nfsd_file *new, *nf;
- const struct cred *cred;
+ bool stale_retry = true;
bool open_retry = true;
struct inode *inode;
__be32 status;
int ret;
+retry:
status = fh_verify(rqstp, fhp, S_IFREG,
may_flags|NFSD_MAY_OWNER_OVERRIDE);
if (status != nfs_ok)
return status;
inode = d_inode(fhp->fh_dentry);
- cred = get_current_cred();
-retry:
rcu_read_lock();
- nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc);
+ nf = nfsd_file_lookup_locked(net, current_cred(), inode, need, want_gc);
rcu_read_unlock();
if (nf) {
@@ -1026,7 +1028,7 @@ retry:
rcu_read_lock();
spin_lock(&inode->i_lock);
- nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc);
+ nf = nfsd_file_lookup_locked(net, current_cred(), inode, need, want_gc);
if (unlikely(nf)) {
spin_unlock(&inode->i_lock);
rcu_read_unlock();
@@ -1058,6 +1060,7 @@ wait_for_construction:
goto construction_err;
}
open_retry = false;
+ fh_put(fhp);
goto retry;
}
this_cpu_inc(nfsd_file_cache_hits);
@@ -1074,7 +1077,6 @@ out:
nfsd_file_check_write_error(nf);
*pnf = nf;
}
- put_cred(cred);
trace_nfsd_file_acquire(rqstp, inode, may_flags, nf, status);
return status;
@@ -1088,8 +1090,20 @@ open_file:
status = nfs_ok;
trace_nfsd_file_opened(nf, status);
} else {
- status = nfsd_open_verified(rqstp, fhp, may_flags,
- &nf->nf_file);
+ ret = nfsd_open_verified(rqstp, fhp, may_flags,
+ &nf->nf_file);
+ if (ret == -EOPENSTALE && stale_retry) {
+ stale_retry = false;
+ nfsd_file_unhash(nf);
+ clear_and_wake_up_bit(NFSD_FILE_PENDING,
+ &nf->nf_flags);
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
+ nf = NULL;
+ fh_put(fhp);
+ goto retry;
+ }
+ status = nfserrno(ret);
trace_nfsd_file_open(nf, status);
}
} else
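
The filecache.c hunks above (and the nfs4state.c / netns.h hunks later in this diff) move from a statically defined struct shrinker registered with register_shrinker() to the allocated shrinker API: shrinker_alloc(), field assignment, shrinker_register(), and shrinker_free() on teardown, with private_data standing in for the old container_of() lookups. A self-contained sketch of that lifecycle, assuming only those APIs; the demo_* names and the cache they "count" are illustrative only:

#include <linux/shrinker.h>
#include <linux/errno.h>

static struct shrinker *demo_shrinker;

static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc)
{
	/* report how many objects could be reclaimed */
	return *(unsigned long *)s->private_data;
}

static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc)
{
	/* a real implementation would free objects and return the count */
	return SHRINK_STOP;
}

static int demo_shrinker_init(unsigned long *nr_cached)
{
	demo_shrinker = shrinker_alloc(0, "demo-cache");
	if (!demo_shrinker)
		return -ENOMEM;

	demo_shrinker->count_objects = demo_count;
	demo_shrinker->scan_objects  = demo_scan;
	demo_shrinker->seeks         = DEFAULT_SEEKS;
	demo_shrinker->private_data  = nr_cached;

	shrinker_register(demo_shrinker);	/* live from this point on */
	return 0;
}

static void demo_shrinker_exit(void)
{
	shrinker_free(demo_shrinker);		/* unregisters and frees */
}
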
diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c
index bb205328e043..aeb71c10ff1b 100644
--- a/fs/nfsd/flexfilelayoutxdr.c
+++ b/fs/nfsd/flexfilelayoutxdr.c
@@ -17,9 +17,9 @@ struct ff_idmap {
__be32
nfsd4_ff_encode_layoutget(struct xdr_stream *xdr,
- struct nfsd4_layoutget *lgp)
+ const struct nfsd4_layoutget *lgp)
{
- struct pnfs_ff_layout *fl = lgp->lg_content;
+ const struct pnfs_ff_layout *fl = lgp->lg_content;
int len, mirror_len, ds_len, fh_len;
__be32 *p;
@@ -77,7 +77,7 @@ nfsd4_ff_encode_layoutget(struct xdr_stream *xdr,
__be32
nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr,
- struct nfsd4_getdeviceinfo *gdp)
+ const struct nfsd4_getdeviceinfo *gdp)
{
struct pnfs_ff_device_addr *da = gdp->gd_device;
int len;
diff --git a/fs/nfsd/flexfilelayoutxdr.h b/fs/nfsd/flexfilelayoutxdr.h
index 8e195aeca023..6d5a1066a903 100644
--- a/fs/nfsd/flexfilelayoutxdr.h
+++ b/fs/nfsd/flexfilelayoutxdr.h
@@ -43,8 +43,8 @@ struct pnfs_ff_layout {
};
__be32 nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr,
- struct nfsd4_getdeviceinfo *gdp);
+ const struct nfsd4_getdeviceinfo *gdp);
__be32 nfsd4_ff_encode_layoutget(struct xdr_stream *xdr,
- struct nfsd4_layoutget *lgp);
+ const struct nfsd4_layoutget *lgp);
#endif /* _NFSD_FLEXFILELAYOUTXDR_H */
diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c
new file mode 100644
index 000000000000..0e1d635ec5f9
--- /dev/null
+++ b/fs/nfsd/netlink.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/nfsd.yaml */
+/* YNL-GEN kernel source */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "netlink.h"
+
+#include <uapi/linux/nfsd_netlink.h>
+
+/* Ops table for nfsd */
+static const struct genl_split_ops nfsd_nl_ops[] = {
+ {
+ .cmd = NFSD_CMD_RPC_STATUS_GET,
+ .start = nfsd_nl_rpc_status_get_start,
+ .dumpit = nfsd_nl_rpc_status_get_dumpit,
+ .done = nfsd_nl_rpc_status_get_done,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+};
+
+struct genl_family nfsd_nl_family __ro_after_init = {
+ .name = NFSD_FAMILY_NAME,
+ .version = NFSD_FAMILY_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .split_ops = nfsd_nl_ops,
+ .n_split_ops = ARRAY_SIZE(nfsd_nl_ops),
+};
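
The generated netlink.c above only defines nfsd_nl_family and its split-ops table; registering the family happens elsewhere in this series (not shown in this hunk). A sketch of how such a generic netlink family is typically wired up at init/exit time; the demo_* wrappers are hypothetical, only the genetlink calls are standard API:

#include <linux/init.h>
#include <net/genetlink.h>

#include "netlink.h"

static int __init demo_nfsd_nl_init(void)
{
	return genl_register_family(&nfsd_nl_family);
}

static void __exit demo_nfsd_nl_exit(void)
{
	genl_unregister_family(&nfsd_nl_family);
}
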
diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h
new file mode 100644
index 000000000000..d83dd6bdee92
--- /dev/null
+++ b/fs/nfsd/netlink.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/nfsd.yaml */
+/* YNL-GEN kernel header */
+
+#ifndef _LINUX_NFSD_GEN_H
+#define _LINUX_NFSD_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/nfsd_netlink.h>
+
+int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb);
+int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb);
+
+int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+
+extern struct genl_family nfsd_nl_family;
+
+#endif /* _LINUX_NFSD_GEN_H */
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index ec49b200b797..ab303a8b77d5 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -177,7 +177,7 @@ struct nfsd_net {
/* size of cache when we saw the longest hash chain */
unsigned int longest_chain_cachesize;
- struct shrinker nfsd_reply_cache_shrinker;
+ struct shrinker *nfsd_reply_cache_shrinker;
/* tracking server-to-server copy mounts */
spinlock_t nfsd_ssc_lock;
@@ -195,7 +195,7 @@ struct nfsd_net {
int nfs4_max_clients;
atomic_t nfsd_courtesy_clients;
- struct shrinker nfsd_client_shrinker;
+ struct shrinker *nfsd_client_shrinker;
struct work_struct nfsd_shrinker_work;
};
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 268ef57751c4..b78eceebd945 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -171,7 +171,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
* + 1 (xdr opaque byte count) = 26
*/
resp->count = argp->count;
- svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
+ svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3) << 2) +
+ resp->count + 4);
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
@@ -194,7 +195,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
SVCFH_fmt(&argp->fh),
argp->len,
(unsigned long long) argp->offset,
- argp->stable? " stable" : "");
+ argp->stable ? " stable" : "");
resp->status = nfserr_fbig;
if (argp->offset > (u64)OFFSET_MAX ||
@@ -294,8 +295,8 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
status = nfserr_exist;
break;
case NFS3_CREATE_EXCLUSIVE:
- if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
- d_inode(child)->i_atime.tv_sec == v_atime &&
+ if (inode_get_mtime_sec(d_inode(child)) == v_mtime &&
+ inode_get_atime_sec(d_inode(child)) == v_atime &&
d_inode(child)->i_size == 0) {
break;
}
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index e8a80052cb1b..5e8096bc5eaa 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -515,11 +515,11 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
if (!list_empty(&ls->ls_layouts)) {
if (found)
nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid);
- lrp->lrs_present = 1;
+ lrp->lrs_present = true;
} else {
trace_nfsd_layoutstate_unhash(&ls->ls_stid.sc_stateid);
nfs4_unhash_stid(&ls->ls_stid);
- lrp->lrs_present = 0;
+ lrp->lrs_present = false;
}
spin_unlock(&ls->ls_lock);
@@ -539,7 +539,7 @@ nfsd4_return_client_layouts(struct svc_rqst *rqstp,
struct nfs4_layout *lp, *t;
LIST_HEAD(reaplist);
- lrp->lrs_present = 0;
+ lrp->lrs_present = false;
spin_lock(&clp->cl_lock);
list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 4199ede0583c..6f2d4aa4970d 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -322,8 +322,8 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
status = nfserr_exist;
break;
case NFS4_CREATE_EXCLUSIVE:
- if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
- d_inode(child)->i_atime.tv_sec == v_atime &&
+ if (inode_get_mtime_sec(d_inode(child)) == v_mtime &&
+ inode_get_atime_sec(d_inode(child)) == v_atime &&
d_inode(child)->i_size == 0) {
open->op_created = true;
break; /* subtle */
@@ -331,8 +331,8 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
status = nfserr_exist;
break;
case NFS4_CREATE_EXCLUSIVE4_1:
- if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
- d_inode(child)->i_atime.tv_sec == v_atime &&
+ if (inode_get_mtime_sec(d_inode(child)) == v_mtime &&
+ inode_get_atime_sec(d_inode(child)) == v_atime &&
d_inode(child)->i_size == 0) {
open->op_created = true;
goto set_attr; /* subtle */
@@ -1329,7 +1329,8 @@ extern void nfs_sb_deactive(struct super_block *sb);
* setup a work entry in the ssc delayed unmount list.
*/
static __be32 nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr,
- struct nfsd4_ssc_umount_item **nsui)
+ struct nfsd4_ssc_umount_item **nsui,
+ struct svc_rqst *rqstp)
{
struct nfsd4_ssc_umount_item *ni = NULL;
struct nfsd4_ssc_umount_item *work = NULL;
@@ -1351,7 +1352,7 @@ try_again:
spin_unlock(&nn->nfsd_ssc_lock);
/* allow 20secs for mount/unmount for now - revisit */
- if (kthread_should_stop() ||
+ if (svc_thread_should_stop(rqstp) ||
(schedule_timeout(20*HZ) == 0)) {
finish_wait(&nn->nfsd_ssc_waitq, &wait);
kfree(work);
@@ -1467,7 +1468,7 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
goto out_free_rawdata;
snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep);
- status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui);
+ status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui, rqstp);
if (status)
goto out_free_devname;
if ((*nsui)->nsui_vfsmount)
@@ -1642,6 +1643,7 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy,
if (bytes_total == 0)
bytes_total = ULLONG_MAX;
do {
+ /* Only async copies can be stopped here */
if (kthread_should_stop())
break;
bytes_copied = nfsd_copy_file_range(src, src_pos, dst, dst_pos,
@@ -1760,6 +1762,7 @@ static int nfsd4_do_async_copy(void *data)
struct nfsd4_copy *copy = (struct nfsd4_copy *)data;
__be32 nfserr;
+ trace_nfsd_copy_do_async(copy);
if (nfsd4_ssc_is_inter(copy)) {
struct file *filp;
@@ -1798,21 +1801,27 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
struct nfsd4_copy *async_copy = NULL;
+ copy->cp_clp = cstate->clp;
if (nfsd4_ssc_is_inter(copy)) {
+ trace_nfsd_copy_inter(copy);
if (!inter_copy_offload_enable || nfsd4_copy_is_sync(copy)) {
status = nfserr_notsupp;
goto out;
}
status = nfsd4_setup_inter_ssc(rqstp, cstate, copy);
- if (status)
+ if (status) {
+ trace_nfsd_copy_done(copy, status);
return nfserr_offload_denied;
+ }
} else {
+ trace_nfsd_copy_intra(copy);
status = nfsd4_setup_intra_ssc(rqstp, cstate, copy);
- if (status)
+ if (status) {
+ trace_nfsd_copy_done(copy, status);
return status;
+ }
}
- copy->cp_clp = cstate->clp;
memcpy(&copy->fh, &cstate->current_fh.fh_handle,
sizeof(struct knfsd_fh));
if (nfsd4_copy_is_async(copy)) {
@@ -1847,6 +1856,7 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
copy->nf_dst->nf_file, true);
}
out:
+ trace_nfsd_copy_done(copy, status);
release_copy_files(copy);
return status;
out_err:
@@ -1929,8 +1939,8 @@ nfsd4_copy_notify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
return status;
- cn->cpn_sec = nn->nfsd4_lease;
- cn->cpn_nsec = 0;
+ cn->cpn_lease_time.tv_sec = nn->nfsd4_lease;
+ cn->cpn_lease_time.tv_nsec = 0;
status = nfserrno(-ENOMEM);
cps = nfs4_alloc_init_cpntf_state(nn, stid);
@@ -2347,10 +2357,10 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp,
mutex_unlock(&ls->ls_mutex);
if (new_size > i_size_read(inode)) {
- lcp->lc_size_chg = 1;
+ lcp->lc_size_chg = true;
lcp->lc_newsize = new_size;
} else {
- lcp->lc_size_chg = 0;
+ lcp->lc_size_chg = false;
}
nfserr = ops->proc_layoutcommit(inode, lcp);
@@ -3200,6 +3210,7 @@ static const struct nfsd4_operation nfsd4_ops[] = {
},
[OP_LOCK] = {
.op_func = nfsd4_lock,
+ .op_release = nfsd4_lock_release,
.op_flags = OP_MODIFIES_SOMETHING |
OP_NONTRIVIAL_ERROR_ENCODE,
.op_name = "OP_LOCK",
@@ -3208,6 +3219,7 @@ static const struct nfsd4_operation nfsd4_ops[] = {
},
[OP_LOCKT] = {
.op_func = nfsd4_lockt,
+ .op_release = nfsd4_lockt_release,
.op_flags = OP_NONTRIVIAL_ERROR_ENCODE,
.op_name = "OP_LOCKT",
.op_rsize_bop = nfsd4_lock_rsize,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8534693eb6a4..3edbfa0233e6 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -59,7 +59,7 @@
#define NFSDDBG_FACILITY NFSDDBG_PROC
-#define all_ones {{~0,~0},~0}
+#define all_ones {{ ~0, ~0}, ~0}
static const stateid_t one_stateid = {
.si_generation = ~0,
.si_opaque = all_ones,
@@ -297,7 +297,7 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
nbl = find_blocked_lock(lo, fh, nn);
if (!nbl) {
- nbl= kmalloc(sizeof(*nbl), GFP_KERNEL);
+ nbl = kmalloc(sizeof(*nbl), GFP_KERNEL);
if (nbl) {
INIT_LIST_HEAD(&nbl->nbl_list);
INIT_LIST_HEAD(&nbl->nbl_lru);
@@ -1159,6 +1159,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
struct nfs4_clnt_odstate *odstate, u32 dl_type)
{
struct nfs4_delegation *dp;
+ struct nfs4_stid *stid;
long n;
dprintk("NFSD alloc_init_deleg\n");
@@ -1167,9 +1168,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
goto out_dec;
if (delegation_blocked(&fp->fi_fhandle))
goto out_dec;
- dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg));
- if (dp == NULL)
+ stid = nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg);
+ if (stid == NULL)
goto out_dec;
+ dp = delegstateid(stid);
/*
* delegation seqid's are never incremented. The 4.1 special
@@ -2797,7 +2799,7 @@ static int client_opens_release(struct inode *inode, struct file *file)
/* XXX: alternatively, we could get/drop in seq start/stop */
drop_client(clp);
- return 0;
+ return seq_release(inode, file);
}
static const struct file_operations client_states_fops = {
@@ -4400,8 +4402,7 @@ static unsigned long
nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
{
int count;
- struct nfsd_net *nn = container_of(shrink,
- struct nfsd_net, nfsd_client_shrinker);
+ struct nfsd_net *nn = shrink->private_data;
count = atomic_read(&nn->nfsd_courtesy_clients);
if (!count)
@@ -5636,11 +5637,11 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
int status = 0;
cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
- open->op_recall = 0;
+ open->op_recall = false;
switch (open->op_claim_type) {
case NFS4_OPEN_CLAIM_PREVIOUS:
if (!cb_up)
- open->op_recall = 1;
+ open->op_recall = true;
break;
case NFS4_OPEN_CLAIM_NULL:
parent = currentfh;
@@ -5682,7 +5683,7 @@ out_no_deleg:
if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) {
dprintk("NFSD: WARNING: refusing delegation reclaim\n");
- open->op_recall = 1;
+ open->op_recall = true;
}
/* 4.1 client asking for a delegation? */
@@ -7487,6 +7488,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_blocked_lock *nbl = NULL;
struct file_lock *file_lock = NULL;
struct file_lock *conflock = NULL;
+ struct super_block *sb;
__be32 status = 0;
int lkflg;
int err;
@@ -7508,6 +7510,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
dprintk("NFSD: nfsd4_lock: permission denied!\n");
return status;
}
+ sb = cstate->current_fh.fh_dentry->d_sb;
if (lock->lk_is_new) {
if (nfsd4_has_session(cstate))
@@ -7559,7 +7562,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fp = lock_stp->st_stid.sc_file;
switch (lock->lk_type) {
case NFS4_READW_LT:
- if (nfsd4_has_session(cstate))
+ if (nfsd4_has_session(cstate) ||
+ exportfs_lock_op_is_async(sb->s_export_op))
fl_flags |= FL_SLEEP;
fallthrough;
case NFS4_READ_LT:
@@ -7571,7 +7575,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fl_type = F_RDLCK;
break;
case NFS4_WRITEW_LT:
- if (nfsd4_has_session(cstate))
+ if (nfsd4_has_session(cstate) ||
+ exportfs_lock_op_is_async(sb->s_export_op))
fl_flags |= FL_SLEEP;
fallthrough;
case NFS4_WRITE_LT:
@@ -7599,7 +7604,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* for file locks), so don't attempt blocking lock notifications
* on those filesystems:
*/
- if (nf->nf_file->f_op->lock)
+ if (!exportfs_lock_op_is_async(sb->s_export_op))
fl_flags &= ~FL_SLEEP;
nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn);
@@ -7705,6 +7710,14 @@ out:
return status;
}
+void nfsd4_lock_release(union nfsd4_op_u *u)
+{
+ struct nfsd4_lock *lock = &u->lock;
+ struct nfsd4_lock_denied *deny = &lock->lk_denied;
+
+ kfree(deny->ld_owner.data);
+}
+
/*
* The NFSv4 spec allows a client to do a LOCKT without holding an OPEN,
* so we do a temporary open here just to get an open file to pass to
@@ -7810,6 +7823,14 @@ out:
return status;
}
+void nfsd4_lockt_release(union nfsd4_op_u *u)
+{
+ struct nfsd4_lockt *lockt = &u->lockt;
+ struct nfsd4_lock_denied *deny = &lockt->lt_denied;
+
+ kfree(deny->ld_owner.data);
+}
+
__be32
nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
@@ -8149,12 +8170,16 @@ static int nfs4_state_create_net(struct net *net)
INIT_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker);
get_net(net);
- nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan;
- nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count;
- nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS;
-
- if (register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client"))
+ nn->nfsd_client_shrinker = shrinker_alloc(0, "nfsd-client");
+ if (!nn->nfsd_client_shrinker)
goto err_shrinker;
+
+ nn->nfsd_client_shrinker->scan_objects = nfsd4_state_shrinker_scan;
+ nn->nfsd_client_shrinker->count_objects = nfsd4_state_shrinker_count;
+ nn->nfsd_client_shrinker->private_data = nn;
+
+ shrinker_register(nn->nfsd_client_shrinker);
+
return 0;
err_shrinker:
@@ -8252,7 +8277,7 @@ nfs4_state_shutdown_net(struct net *net)
struct list_head *pos, *next, reaplist;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- unregister_shrinker(&nn->nfsd_client_shrinker);
+ shrinker_free(nn->nfsd_client_shrinker);
cancel_work(&nn->nfsd_shrinker_work);
cancel_delayed_work_sync(&nn->laundromat_work);
locks_end_grace(&nn->nfsd4_manager);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 92c7dde148a4..b499fe9caa32 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2530,66 +2530,62 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
return true;
}
-static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
- struct svc_export *exp)
+static __be32 nfsd4_encode_nfs_fh4(struct xdr_stream *xdr,
+ struct knfsd_fh *fh_handle)
{
- if (exp->ex_flags & NFSEXP_V4ROOT) {
- *p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
- *p++ = 0;
- } else
- p = xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode));
- return p;
+ return nfsd4_encode_opaque(xdr, fh_handle->fh_raw, fh_handle->fh_size);
}
+/* This is a frequently-encoded type; open-coded for speed */
static __be32 nfsd4_encode_nfstime4(struct xdr_stream *xdr,
- struct timespec64 *tv)
+ const struct timespec64 *tv)
{
__be32 *p;
p = xdr_reserve_space(xdr, XDR_UNIT * 3);
if (!p)
return nfserr_resource;
-
- p = xdr_encode_hyper(p, (s64)tv->tv_sec);
+ p = xdr_encode_hyper(p, tv->tv_sec);
*p = cpu_to_be32(tv->tv_nsec);
return nfs_ok;
}
-/*
- * ctime (in NFSv4, time_metadata) is not writeable, and the client
- * doesn't really care what resolution could theoretically be stored by
- * the filesystem.
- *
- * The client cares how close together changes can be while still
- * guaranteeing ctime changes. For most filesystems (which have
- * timestamps with nanosecond fields) that is limited by the resolution
- * of the time returned from current_time() (which I'm assuming to be
- * 1/HZ).
- */
-static __be32 *encode_time_delta(__be32 *p, struct inode *inode)
+static __be32 nfsd4_encode_specdata4(struct xdr_stream *xdr,
+ unsigned int major, unsigned int minor)
{
- struct timespec64 ts;
- u32 ns;
+ __be32 status;
- ns = max_t(u32, NSEC_PER_SEC/HZ, inode->i_sb->s_time_gran);
- ts = ns_to_timespec64(ns);
+ status = nfsd4_encode_uint32_t(xdr, major);
+ if (status != nfs_ok)
+ return status;
+ return nfsd4_encode_uint32_t(xdr, minor);
+}
- p = xdr_encode_hyper(p, ts.tv_sec);
- *p++ = cpu_to_be32(ts.tv_nsec);
+static __be32
+nfsd4_encode_change_info4(struct xdr_stream *xdr, const struct nfsd4_change_info *c)
+{
+ __be32 status;
- return p;
+ status = nfsd4_encode_bool(xdr, c->atomic);
+ if (status != nfs_ok)
+ return status;
+ status = nfsd4_encode_changeid4(xdr, c->before_change);
+ if (status != nfs_ok)
+ return status;
+ return nfsd4_encode_changeid4(xdr, c->after_change);
}
-static __be32
-nfsd4_encode_change_info4(struct xdr_stream *xdr, struct nfsd4_change_info *c)
+static __be32 nfsd4_encode_netaddr4(struct xdr_stream *xdr,
+ const struct nfs42_netaddr *addr)
{
- if (xdr_stream_encode_bool(xdr, c->atomic) < 0)
- return nfserr_resource;
- if (xdr_stream_encode_u64(xdr, c->before_change) < 0)
- return nfserr_resource;
- if (xdr_stream_encode_u64(xdr, c->after_change) < 0)
- return nfserr_resource;
- return nfs_ok;
+ __be32 status;
+
+ /* na_r_netid */
+ status = nfsd4_encode_opaque(xdr, addr->netid, addr->netid_len);
+ if (status != nfs_ok)
+ return status;
+ /* na_r_addr */
+ return nfsd4_encode_opaque(xdr, addr->addr, addr->addr_len);
}
/* Encode as an array of strings the string given with components
@@ -2661,9 +2657,6 @@ static __be32 nfsd4_encode_components(struct xdr_stream *xdr, char sep,
return nfsd4_encode_components_esc(xdr, sep, components, 0, 0);
}
-/*
- * encode a location element of a fs_locations structure
- */
static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr,
struct nfsd4_fs_location *location)
{
@@ -2676,15 +2669,12 @@ static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr,
status = nfsd4_encode_components(xdr, '/', location->path);
if (status)
return status;
- return 0;
+ return nfs_ok;
}
-/*
- * Encode a path in RFC3530 'pathname4' format
- */
-static __be32 nfsd4_encode_path(struct xdr_stream *xdr,
- const struct path *root,
- const struct path *path)
+static __be32 nfsd4_encode_pathname4(struct xdr_stream *xdr,
+ const struct path *root,
+ const struct path *path)
{
struct path cur = *path;
__be32 *p;
@@ -2752,89 +2742,59 @@ out_free:
return err;
}
-static __be32 nfsd4_encode_fsloc_fsroot(struct xdr_stream *xdr,
- struct svc_rqst *rqstp, const struct path *path)
+static __be32 nfsd4_encode_fs_locations4(struct xdr_stream *xdr,
+ struct svc_rqst *rqstp,
+ struct svc_export *exp)
{
+ struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
struct svc_export *exp_ps;
- __be32 res;
+ unsigned int i;
+ __be32 status;
+ /* fs_root */
exp_ps = rqst_find_fsidzero_export(rqstp);
if (IS_ERR(exp_ps))
return nfserrno(PTR_ERR(exp_ps));
- res = nfsd4_encode_path(xdr, &exp_ps->ex_path, path);
+ status = nfsd4_encode_pathname4(xdr, &exp_ps->ex_path, &exp->ex_path);
exp_put(exp_ps);
- return res;
-}
-
-/*
- * encode a fs_locations structure
- */
-static __be32 nfsd4_encode_fs_locations(struct xdr_stream *xdr,
- struct svc_rqst *rqstp, struct svc_export *exp)
-{
- __be32 status;
- int i;
- __be32 *p;
- struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
-
- status = nfsd4_encode_fsloc_fsroot(xdr, rqstp, &exp->ex_path);
- if (status)
+ if (status != nfs_ok)
return status;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
+
+ /* locations<> */
+ if (xdr_stream_encode_u32(xdr, fslocs->locations_count) != XDR_UNIT)
return nfserr_resource;
- *p++ = cpu_to_be32(fslocs->locations_count);
- for (i=0; i<fslocs->locations_count; i++) {
+ for (i = 0; i < fslocs->locations_count; i++) {
status = nfsd4_encode_fs_location4(xdr, &fslocs->locations[i]);
- if (status)
+ if (status != nfs_ok)
return status;
}
- return 0;
-}
-static u32 nfs4_file_type(umode_t mode)
-{
- switch (mode & S_IFMT) {
- case S_IFIFO: return NF4FIFO;
- case S_IFCHR: return NF4CHR;
- case S_IFDIR: return NF4DIR;
- case S_IFBLK: return NF4BLK;
- case S_IFLNK: return NF4LNK;
- case S_IFREG: return NF4REG;
- case S_IFSOCK: return NF4SOCK;
- default: return NF4BAD;
- }
+ return nfs_ok;
}
-static inline __be32
-nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp,
- struct nfs4_ace *ace)
+static __be32 nfsd4_encode_nfsace4(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+ struct nfs4_ace *ace)
{
+ __be32 status;
+
+ /* type */
+ status = nfsd4_encode_acetype4(xdr, ace->type);
+ if (status != nfs_ok)
+ return nfserr_resource;
+ /* flag */
+ status = nfsd4_encode_aceflag4(xdr, ace->flag);
+ if (status != nfs_ok)
+ return nfserr_resource;
+ /* access mask */
+ status = nfsd4_encode_acemask4(xdr, ace->access_mask & NFS4_ACE_MASK_ALL);
+ if (status != nfs_ok)
+ return nfserr_resource;
+ /* who */
if (ace->whotype != NFS4_ACL_WHO_NAMED)
return nfs4_acl_write_who(xdr, ace->whotype);
- else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
+ if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
return nfsd4_encode_group(xdr, rqstp, ace->who_gid);
- else
- return nfsd4_encode_user(xdr, rqstp, ace->who_uid);
-}
-
-static inline __be32
-nfsd4_encode_layout_types(struct xdr_stream *xdr, u32 layout_types)
-{
- __be32 *p;
- unsigned long i = hweight_long(layout_types);
-
- p = xdr_reserve_space(xdr, 4 + 4 * i);
- if (!p)
- return nfserr_resource;
-
- *p++ = cpu_to_be32(i);
-
- for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i)
- if (layout_types & (1 << i))
- *p++ = cpu_to_be32(i);
-
- return 0;
+ return nfsd4_encode_user(xdr, rqstp, ace->who_uid);
}
#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \
@@ -2906,12 +2866,12 @@ static int nfsd4_get_mounted_on_ino(struct svc_export *exp, u64 *pino)
}
static __be32
-nfsd4_encode_bitmap(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2)
+nfsd4_encode_bitmap4(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2)
{
__be32 *p;
if (bmval2) {
- p = xdr_reserve_space(xdr, 16);
+ p = xdr_reserve_space(xdr, XDR_UNIT * 4);
if (!p)
goto out_resource;
*p++ = cpu_to_be32(3);
@@ -2919,94 +2879,684 @@ nfsd4_encode_bitmap(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2)
*p++ = cpu_to_be32(bmval1);
*p++ = cpu_to_be32(bmval2);
} else if (bmval1) {
- p = xdr_reserve_space(xdr, 12);
+ p = xdr_reserve_space(xdr, XDR_UNIT * 3);
if (!p)
goto out_resource;
*p++ = cpu_to_be32(2);
*p++ = cpu_to_be32(bmval0);
*p++ = cpu_to_be32(bmval1);
} else {
- p = xdr_reserve_space(xdr, 8);
+ p = xdr_reserve_space(xdr, XDR_UNIT * 2);
if (!p)
goto out_resource;
*p++ = cpu_to_be32(1);
*p++ = cpu_to_be32(bmval0);
}
- return 0;
+ return nfs_ok;
out_resource:
return nfserr_resource;
}
+struct nfsd4_fattr_args {
+ struct svc_rqst *rqstp;
+ struct svc_fh *fhp;
+ struct svc_export *exp;
+ struct dentry *dentry;
+ struct kstat stat;
+ struct kstatfs statfs;
+ struct nfs4_acl *acl;
+ u64 size;
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ void *context;
+ int contextlen;
+#endif
+ u32 rdattr_err;
+ bool contextsupport;
+ bool ignore_crossmnt;
+};
+
+typedef __be32(*nfsd4_enc_attr)(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args);
+
+static __be32 nfsd4_encode_fattr4__noop(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4__true(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_bool(xdr, true);
+}
+
+static __be32 nfsd4_encode_fattr4__false(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_bool(xdr, false);
+}
+
+static __be32 nfsd4_encode_fattr4_supported_attrs(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ struct nfsd4_compoundres *resp = args->rqstp->rq_resp;
+ u32 minorversion = resp->cstate.minorversion;
+ u32 supp[3];
+
+ memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
+ if (!IS_POSIXACL(d_inode(args->dentry)))
+ supp[0] &= ~FATTR4_WORD0_ACL;
+ if (!args->contextsupport)
+ supp[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+
+ return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]);
+}
+
+static __be32 nfsd4_encode_fattr4_type(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, XDR_UNIT);
+ if (!p)
+ return nfserr_resource;
+
+ switch (args->stat.mode & S_IFMT) {
+ case S_IFIFO:
+ *p = cpu_to_be32(NF4FIFO);
+ break;
+ case S_IFCHR:
+ *p = cpu_to_be32(NF4CHR);
+ break;
+ case S_IFDIR:
+ *p = cpu_to_be32(NF4DIR);
+ break;
+ case S_IFBLK:
+ *p = cpu_to_be32(NF4BLK);
+ break;
+ case S_IFLNK:
+ *p = cpu_to_be32(NF4LNK);
+ break;
+ case S_IFREG:
+ *p = cpu_to_be32(NF4REG);
+ break;
+ case S_IFSOCK:
+ *p = cpu_to_be32(NF4SOCK);
+ break;
+ default:
+ return nfserr_serverfault;
+ }
+
+ return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_fh_expire_type(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ u32 mask;
+
+ mask = NFS4_FH_PERSISTENT;
+ if (!(args->exp->ex_flags & NFSEXP_NOSUBTREECHECK))
+ mask |= NFS4_FH_VOL_RENAME;
+ return nfsd4_encode_uint32_t(xdr, mask);
+}
+
+static __be32 nfsd4_encode_fattr4_change(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ const struct svc_export *exp = args->exp;
+ u64 c;
+
+ if (unlikely(exp->ex_flags & NFSEXP_V4ROOT)) {
+ u32 flush_time = convert_to_wallclock(exp->cd->flush_time);
+
+ if (xdr_stream_encode_u32(xdr, flush_time) != XDR_UNIT)
+ return nfserr_resource;
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+ return nfserr_resource;
+ return nfs_ok;
+ }
+
+ c = nfsd4_change_attribute(&args->stat, d_inode(args->dentry));
+ return nfsd4_encode_changeid4(xdr, c);
+}
+
+static __be32 nfsd4_encode_fattr4_size(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, args->size);
+}
+
+static __be32 nfsd4_encode_fattr4_fsid(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, XDR_UNIT * 2 + XDR_UNIT * 2);
+ if (!p)
+ return nfserr_resource;
+
+ if (unlikely(args->exp->ex_fslocs.migrated)) {
+ p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR);
+ xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR);
+ return nfs_ok;
+ }
+ switch (fsid_source(args->fhp)) {
+ case FSIDSOURCE_FSID:
+ p = xdr_encode_hyper(p, (u64)args->exp->ex_fsid);
+ xdr_encode_hyper(p, (u64)0);
+ break;
+ case FSIDSOURCE_DEV:
+ *p++ = xdr_zero;
+ *p++ = cpu_to_be32(MAJOR(args->stat.dev));
+ *p++ = xdr_zero;
+ *p = cpu_to_be32(MINOR(args->stat.dev));
+ break;
+ case FSIDSOURCE_UUID:
+ xdr_encode_opaque_fixed(p, args->exp->ex_uuid, EX_UUID_LEN);
+ break;
+ }
+
+ return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_lease_time(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ struct nfsd_net *nn = net_generic(SVC_NET(args->rqstp), nfsd_net_id);
+
+ return nfsd4_encode_nfs_lease4(xdr, nn->nfsd4_lease);
+}
+
+static __be32 nfsd4_encode_fattr4_rdattr_error(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint32_t(xdr, args->rdattr_err);
+}
+
+static __be32 nfsd4_encode_fattr4_aclsupport(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ u32 mask;
+
+ mask = 0;
+ if (IS_POSIXACL(d_inode(args->dentry)))
+ mask = ACL4_SUPPORT_ALLOW_ACL | ACL4_SUPPORT_DENY_ACL;
+ return nfsd4_encode_uint32_t(xdr, mask);
+}
+
+static __be32 nfsd4_encode_fattr4_acl(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ struct nfs4_acl *acl = args->acl;
+ struct nfs4_ace *ace;
+ __be32 status;
+
+ /* nfsace4<> */
+ if (!acl) {
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+ return nfserr_resource;
+ } else {
+ if (xdr_stream_encode_u32(xdr, acl->naces) != XDR_UNIT)
+ return nfserr_resource;
+ for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) {
+ status = nfsd4_encode_nfsace4(xdr, args->rqstp, ace);
+ if (status != nfs_ok)
+ return status;
+ }
+ }
+ return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_filehandle(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_nfs_fh4(xdr, &args->fhp->fh_handle);
+}
+
+static __be32 nfsd4_encode_fattr4_fileid(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, args->stat.ino);
+}
+
+static __be32 nfsd4_encode_fattr4_files_avail(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, args->statfs.f_ffree);
+}
+
+static __be32 nfsd4_encode_fattr4_files_free(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, args->statfs.f_ffree);
+}
+
+static __be32 nfsd4_encode_fattr4_files_total(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, args->statfs.f_files);
+}
+
+static __be32 nfsd4_encode_fattr4_fs_locations(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_fs_locations4(xdr, args->rqstp, args->exp);
+}
+
+static __be32 nfsd4_encode_fattr4_maxfilesize(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ struct super_block *sb = args->exp->ex_path.mnt->mnt_sb;
+
+ return nfsd4_encode_uint64_t(xdr, sb->s_maxbytes);
+}
+
+static __be32 nfsd4_encode_fattr4_maxlink(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint32_t(xdr, 255);
+}
+
+static __be32 nfsd4_encode_fattr4_maxname(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint32_t(xdr, args->statfs.f_namelen);
+}
+
+static __be32 nfsd4_encode_fattr4_maxread(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, svc_max_payload(args->rqstp));
+}
+
+static __be32 nfsd4_encode_fattr4_maxwrite(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, svc_max_payload(args->rqstp));
+}
+
+static __be32 nfsd4_encode_fattr4_mode(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_mode4(xdr, args->stat.mode & S_IALLUGO);
+}
+
+static __be32 nfsd4_encode_fattr4_numlinks(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint32_t(xdr, args->stat.nlink);
+}
+
+static __be32 nfsd4_encode_fattr4_owner(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_user(xdr, args->rqstp, args->stat.uid);
+}
+
+static __be32 nfsd4_encode_fattr4_owner_group(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_group(xdr, args->rqstp, args->stat.gid);
+}
+
+static __be32 nfsd4_encode_fattr4_rawdev(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_specdata4(xdr, MAJOR(args->stat.rdev),
+ MINOR(args->stat.rdev));
+}
+
+static __be32 nfsd4_encode_fattr4_space_avail(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ u64 avail = (u64)args->statfs.f_bavail * (u64)args->statfs.f_bsize;
+
+ return nfsd4_encode_uint64_t(xdr, avail);
+}
+
+static __be32 nfsd4_encode_fattr4_space_free(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ u64 free = (u64)args->statfs.f_bfree * (u64)args->statfs.f_bsize;
+
+ return nfsd4_encode_uint64_t(xdr, free);
+}
+
+static __be32 nfsd4_encode_fattr4_space_total(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ u64 total = (u64)args->statfs.f_blocks * (u64)args->statfs.f_bsize;
+
+ return nfsd4_encode_uint64_t(xdr, total);
+}
+
+static __be32 nfsd4_encode_fattr4_space_used(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, (u64)args->stat.blocks << 9);
+}
+
+static __be32 nfsd4_encode_fattr4_time_access(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_nfstime4(xdr, &args->stat.atime);
+}
+
+static __be32 nfsd4_encode_fattr4_time_create(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_nfstime4(xdr, &args->stat.btime);
+}
+
+/*
+ * ctime (in NFSv4, time_metadata) is not writeable, and the client
+ * doesn't really care what resolution could theoretically be stored by
+ * the filesystem.
+ *
+ * The client cares how close together changes can be while still
+ * guaranteeing ctime changes. For most filesystems (which have
+ * timestamps with nanosecond fields) that is limited by the resolution
+ * of the time returned from current_time() (which I'm assuming to be
+ * 1/HZ).
+ */
+static __be32 nfsd4_encode_fattr4_time_delta(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ const struct inode *inode = d_inode(args->dentry);
+ u32 ns = max_t(u32, NSEC_PER_SEC/HZ, inode->i_sb->s_time_gran);
+ struct timespec64 ts = ns_to_timespec64(ns);
+
+ return nfsd4_encode_nfstime4(xdr, &ts);
+}
+
+static __be32 nfsd4_encode_fattr4_time_metadata(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_nfstime4(xdr, &args->stat.ctime);
+}
+
+static __be32 nfsd4_encode_fattr4_time_modify(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_nfstime4(xdr, &args->stat.mtime);
+}
+
+static __be32 nfsd4_encode_fattr4_mounted_on_fileid(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ u64 ino;
+ int err;
+
+ if (!args->ignore_crossmnt &&
+ args->dentry == args->exp->ex_path.mnt->mnt_root) {
+ err = nfsd4_get_mounted_on_ino(args->exp, &ino);
+ if (err)
+ return nfserrno(err);
+ } else
+ ino = args->stat.ino;
+
+ return nfsd4_encode_uint64_t(xdr, ino);
+}
+
+#ifdef CONFIG_NFSD_PNFS
+
+static __be32 nfsd4_encode_fattr4_fs_layout_types(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ unsigned long mask = args->exp->ex_layout_types;
+ int i;
+
+ /* Hamming weight of @mask is the number of layout types to return */
+ if (xdr_stream_encode_u32(xdr, hweight_long(mask)) != XDR_UNIT)
+ return nfserr_resource;
+ for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i)
+ if (mask & BIT(i)) {
+ /* layouttype4 */
+ if (xdr_stream_encode_u32(xdr, i) != XDR_UNIT)
+ return nfserr_resource;
+ }
+ return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_layout_types(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ unsigned long mask = args->exp->ex_layout_types;
+ int i;
+
+ /* Hamming weight of @mask is the number of layout types to return */
+ if (xdr_stream_encode_u32(xdr, hweight_long(mask)) != XDR_UNIT)
+ return nfserr_resource;
+ for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i)
+ if (mask & BIT(i)) {
+ /* layouttype4 */
+ if (xdr_stream_encode_u32(xdr, i) != XDR_UNIT)
+ return nfserr_resource;
+ }
+ return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_layout_blksize(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint32_t(xdr, args->stat.blksize);
+}
+
+#endif
+
+static __be32 nfsd4_encode_fattr4_suppattr_exclcreat(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ struct nfsd4_compoundres *resp = args->rqstp->rq_resp;
+ u32 supp[3];
+
+ memcpy(supp, nfsd_suppattrs[resp->cstate.minorversion], sizeof(supp));
+ supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0;
+ supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1;
+ supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2;
+
+ return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]);
+}
+
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+static __be32 nfsd4_encode_fattr4_sec_label(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_security_label(xdr, args->rqstp,
+ args->context, args->contextlen);
+}
+#endif
+
+static __be32 nfsd4_encode_fattr4_xattr_support(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ int err = xattr_supports_user_prefix(d_inode(args->dentry));
+
+ return nfsd4_encode_bool(xdr, err == 0);
+}
+
+static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = {
+ [FATTR4_SUPPORTED_ATTRS] = nfsd4_encode_fattr4_supported_attrs,
+ [FATTR4_TYPE] = nfsd4_encode_fattr4_type,
+ [FATTR4_FH_EXPIRE_TYPE] = nfsd4_encode_fattr4_fh_expire_type,
+ [FATTR4_CHANGE] = nfsd4_encode_fattr4_change,
+ [FATTR4_SIZE] = nfsd4_encode_fattr4_size,
+ [FATTR4_LINK_SUPPORT] = nfsd4_encode_fattr4__true,
+ [FATTR4_SYMLINK_SUPPORT] = nfsd4_encode_fattr4__true,
+ [FATTR4_NAMED_ATTR] = nfsd4_encode_fattr4__false,
+ [FATTR4_FSID] = nfsd4_encode_fattr4_fsid,
+ [FATTR4_UNIQUE_HANDLES] = nfsd4_encode_fattr4__true,
+ [FATTR4_LEASE_TIME] = nfsd4_encode_fattr4_lease_time,
+ [FATTR4_RDATTR_ERROR] = nfsd4_encode_fattr4_rdattr_error,
+ [FATTR4_ACL] = nfsd4_encode_fattr4_acl,
+ [FATTR4_ACLSUPPORT] = nfsd4_encode_fattr4_aclsupport,
+ [FATTR4_ARCHIVE] = nfsd4_encode_fattr4__noop,
+ [FATTR4_CANSETTIME] = nfsd4_encode_fattr4__true,
+ [FATTR4_CASE_INSENSITIVE] = nfsd4_encode_fattr4__false,
+ [FATTR4_CASE_PRESERVING] = nfsd4_encode_fattr4__true,
+ [FATTR4_CHOWN_RESTRICTED] = nfsd4_encode_fattr4__true,
+ [FATTR4_FILEHANDLE] = nfsd4_encode_fattr4_filehandle,
+ [FATTR4_FILEID] = nfsd4_encode_fattr4_fileid,
+ [FATTR4_FILES_AVAIL] = nfsd4_encode_fattr4_files_avail,
+ [FATTR4_FILES_FREE] = nfsd4_encode_fattr4_files_free,
+ [FATTR4_FILES_TOTAL] = nfsd4_encode_fattr4_files_total,
+ [FATTR4_FS_LOCATIONS] = nfsd4_encode_fattr4_fs_locations,
+ [FATTR4_HIDDEN] = nfsd4_encode_fattr4__noop,
+ [FATTR4_HOMOGENEOUS] = nfsd4_encode_fattr4__true,
+ [FATTR4_MAXFILESIZE] = nfsd4_encode_fattr4_maxfilesize,
+ [FATTR4_MAXLINK] = nfsd4_encode_fattr4_maxlink,
+ [FATTR4_MAXNAME] = nfsd4_encode_fattr4_maxname,
+ [FATTR4_MAXREAD] = nfsd4_encode_fattr4_maxread,
+ [FATTR4_MAXWRITE] = nfsd4_encode_fattr4_maxwrite,
+ [FATTR4_MIMETYPE] = nfsd4_encode_fattr4__noop,
+ [FATTR4_MODE] = nfsd4_encode_fattr4_mode,
+ [FATTR4_NO_TRUNC] = nfsd4_encode_fattr4__true,
+ [FATTR4_NUMLINKS] = nfsd4_encode_fattr4_numlinks,
+ [FATTR4_OWNER] = nfsd4_encode_fattr4_owner,
+ [FATTR4_OWNER_GROUP] = nfsd4_encode_fattr4_owner_group,
+ [FATTR4_QUOTA_AVAIL_HARD] = nfsd4_encode_fattr4__noop,
+ [FATTR4_QUOTA_AVAIL_SOFT] = nfsd4_encode_fattr4__noop,
+ [FATTR4_QUOTA_USED] = nfsd4_encode_fattr4__noop,
+ [FATTR4_RAWDEV] = nfsd4_encode_fattr4_rawdev,
+ [FATTR4_SPACE_AVAIL] = nfsd4_encode_fattr4_space_avail,
+ [FATTR4_SPACE_FREE] = nfsd4_encode_fattr4_space_free,
+ [FATTR4_SPACE_TOTAL] = nfsd4_encode_fattr4_space_total,
+ [FATTR4_SPACE_USED] = nfsd4_encode_fattr4_space_used,
+ [FATTR4_SYSTEM] = nfsd4_encode_fattr4__noop,
+ [FATTR4_TIME_ACCESS] = nfsd4_encode_fattr4_time_access,
+ [FATTR4_TIME_ACCESS_SET] = nfsd4_encode_fattr4__noop,
+ [FATTR4_TIME_BACKUP] = nfsd4_encode_fattr4__noop,
+ [FATTR4_TIME_CREATE] = nfsd4_encode_fattr4_time_create,
+ [FATTR4_TIME_DELTA] = nfsd4_encode_fattr4_time_delta,
+ [FATTR4_TIME_METADATA] = nfsd4_encode_fattr4_time_metadata,
+ [FATTR4_TIME_MODIFY] = nfsd4_encode_fattr4_time_modify,
+ [FATTR4_TIME_MODIFY_SET] = nfsd4_encode_fattr4__noop,
+ [FATTR4_MOUNTED_ON_FILEID] = nfsd4_encode_fattr4_mounted_on_fileid,
+ [FATTR4_DIR_NOTIF_DELAY] = nfsd4_encode_fattr4__noop,
+ [FATTR4_DIRENT_NOTIF_DELAY] = nfsd4_encode_fattr4__noop,
+ [FATTR4_DACL] = nfsd4_encode_fattr4__noop,
+ [FATTR4_SACL] = nfsd4_encode_fattr4__noop,
+ [FATTR4_CHANGE_POLICY] = nfsd4_encode_fattr4__noop,
+ [FATTR4_FS_STATUS] = nfsd4_encode_fattr4__noop,
+
+#ifdef CONFIG_NFSD_PNFS
+ [FATTR4_FS_LAYOUT_TYPES] = nfsd4_encode_fattr4_fs_layout_types,
+ [FATTR4_LAYOUT_HINT] = nfsd4_encode_fattr4__noop,
+ [FATTR4_LAYOUT_TYPES] = nfsd4_encode_fattr4_layout_types,
+ [FATTR4_LAYOUT_BLKSIZE] = nfsd4_encode_fattr4_layout_blksize,
+ [FATTR4_LAYOUT_ALIGNMENT] = nfsd4_encode_fattr4__noop,
+#else
+ [FATTR4_FS_LAYOUT_TYPES] = nfsd4_encode_fattr4__noop,
+ [FATTR4_LAYOUT_HINT] = nfsd4_encode_fattr4__noop,
+ [FATTR4_LAYOUT_TYPES] = nfsd4_encode_fattr4__noop,
+ [FATTR4_LAYOUT_BLKSIZE] = nfsd4_encode_fattr4__noop,
+ [FATTR4_LAYOUT_ALIGNMENT] = nfsd4_encode_fattr4__noop,
+#endif
+
+ [FATTR4_FS_LOCATIONS_INFO] = nfsd4_encode_fattr4__noop,
+ [FATTR4_MDSTHRESHOLD] = nfsd4_encode_fattr4__noop,
+ [FATTR4_RETENTION_GET] = nfsd4_encode_fattr4__noop,
+ [FATTR4_RETENTION_SET] = nfsd4_encode_fattr4__noop,
+ [FATTR4_RETENTEVT_GET] = nfsd4_encode_fattr4__noop,
+ [FATTR4_RETENTEVT_SET] = nfsd4_encode_fattr4__noop,
+ [FATTR4_RETENTION_HOLD] = nfsd4_encode_fattr4__noop,
+ [FATTR4_MODE_SET_MASKED] = nfsd4_encode_fattr4__noop,
+ [FATTR4_SUPPATTR_EXCLCREAT] = nfsd4_encode_fattr4_suppattr_exclcreat,
+ [FATTR4_FS_CHARSET_CAP] = nfsd4_encode_fattr4__noop,
+ [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4__noop,
+ [FATTR4_SPACE_FREED] = nfsd4_encode_fattr4__noop,
+ [FATTR4_CHANGE_ATTR_TYPE] = nfsd4_encode_fattr4__noop,
+
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ [FATTR4_SEC_LABEL] = nfsd4_encode_fattr4_sec_label,
+#else
+ [FATTR4_SEC_LABEL] = nfsd4_encode_fattr4__noop,
+#endif
+
+ [FATTR4_MODE_UMASK] = nfsd4_encode_fattr4__noop,
+ [FATTR4_XATTR_SUPPORT] = nfsd4_encode_fattr4_xattr_support,
+};
+
/*
* Note: @fhp can be NULL; in this case, we might have to compose the filehandle
* ourselves.
*/
static __be32
-nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
- struct svc_export *exp,
- struct dentry *dentry, u32 *bmval,
- struct svc_rqst *rqstp, int ignore_crossmnt)
+nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
+ struct svc_fh *fhp, struct svc_export *exp,
+ struct dentry *dentry, const u32 *bmval,
+ int ignore_crossmnt)
{
- u32 bmval0 = bmval[0];
- u32 bmval1 = bmval[1];
- u32 bmval2 = bmval[2];
- struct kstat stat;
+ struct nfsd4_fattr_args args;
struct svc_fh *tempfh = NULL;
- struct kstatfs statfs;
- __be32 *p, *attrlen_p;
int starting_len = xdr->buf->len;
+ __be32 *attrlen_p, status;
int attrlen_offset;
- u32 dummy;
- u64 dummy64;
- u32 rdattr_err = 0;
- __be32 status;
int err;
- struct nfs4_acl *acl = NULL;
-#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
- void *context = NULL;
- int contextlen;
-#endif
- bool contextsupport = false;
struct nfsd4_compoundres *resp = rqstp->rq_resp;
u32 minorversion = resp->cstate.minorversion;
struct path path = {
.mnt = exp->ex_path.mnt,
.dentry = dentry,
};
- struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ union {
+ u32 attrmask[3];
+ unsigned long mask[2];
+ } u;
+ unsigned long bit;
+
+ WARN_ON_ONCE(bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1);
+ WARN_ON_ONCE(!nfsd_attrs_supported(minorversion, bmval));
- BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
- BUG_ON(!nfsd_attrs_supported(minorversion, bmval));
+ args.rqstp = rqstp;
+ args.exp = exp;
+ args.dentry = dentry;
+ args.ignore_crossmnt = (ignore_crossmnt != 0);
+ /*
+ * Make a local copy of the attribute bitmap that can be modified.
+ */
+ memset(&u, 0, sizeof(u));
+ u.attrmask[0] = bmval[0];
+ u.attrmask[1] = bmval[1];
+ u.attrmask[2] = bmval[2];
+
+ args.rdattr_err = 0;
if (exp->ex_fslocs.migrated) {
- status = fattr_handle_absent_fs(&bmval0, &bmval1, &bmval2, &rdattr_err);
+ status = fattr_handle_absent_fs(&u.attrmask[0], &u.attrmask[1],
+ &u.attrmask[2], &args.rdattr_err);
if (status)
goto out;
}
- if (bmval0 & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
+ args.size = 0;
+ if (u.attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry));
if (status)
goto out;
}
- err = vfs_getattr(&path, &stat,
+ err = vfs_getattr(&path, &args.stat,
STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE,
AT_STATX_SYNC_AS_STAT);
if (err)
goto out_nfserr;
- if (!(stat.result_mask & STATX_BTIME))
+ args.size = args.stat.size;
+
+ if (!(args.stat.result_mask & STATX_BTIME))
/* underlying FS does not offer btime so we can't share it */
- bmval1 &= ~FATTR4_WORD1_TIME_CREATE;
- if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
+ u.attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE;
+ if ((u.attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) ||
- (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
+ (u.attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
FATTR4_WORD1_SPACE_TOTAL))) {
- err = vfs_statfs(&path, &statfs);
+ err = vfs_statfs(&path, &args.statfs);
if (err)
goto out_nfserr;
}
- if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) {
+ if ((u.attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) &&
+ !fhp) {
tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
status = nfserr_jukebox;
if (!tempfh)
@@ -3015,12 +3565,15 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
status = fh_compose(tempfh, exp, dentry, NULL);
if (status)
goto out;
- fhp = tempfh;
- }
- if (bmval0 & FATTR4_WORD0_ACL) {
- err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl);
+ args.fhp = tempfh;
+ } else
+ args.fhp = fhp;
+
+ args.acl = NULL;
+ if (u.attrmask[0] & FATTR4_WORD0_ACL) {
+ err = nfsd4_get_nfs4_acl(rqstp, dentry, &args.acl);
if (err == -EOPNOTSUPP)
- bmval0 &= ~FATTR4_WORD0_ACL;
+ u.attrmask[0] &= ~FATTR4_WORD0_ACL;
else if (err == -EINVAL) {
status = nfserr_attrnotsupp;
goto out;
@@ -3028,452 +3581,53 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
goto out_nfserr;
}
+ args.contextsupport = false;
+
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
- if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) ||
- bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
+ args.context = NULL;
+ if ((u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) ||
+ u.attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) {
if (exp->ex_flags & NFSEXP_SECURITY_LABEL)
err = security_inode_getsecctx(d_inode(dentry),
- &context, &contextlen);
+ &args.context, &args.contextlen);
else
err = -EOPNOTSUPP;
- contextsupport = (err == 0);
- if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
+ args.contextsupport = (err == 0);
+ if (u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) {
if (err == -EOPNOTSUPP)
- bmval2 &= ~FATTR4_WORD2_SECURITY_LABEL;
+ u.attrmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
else if (err)
goto out_nfserr;
}
}
#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
- status = nfsd4_encode_bitmap(xdr, bmval0, bmval1, bmval2);
+ /* attrmask */
+ status = nfsd4_encode_bitmap4(xdr, u.attrmask[0],
+ u.attrmask[1], u.attrmask[2]);
if (status)
goto out;
+ /* attr_vals */
attrlen_offset = xdr->buf->len;
attrlen_p = xdr_reserve_space(xdr, XDR_UNIT);
if (!attrlen_p)
goto out_resource;
-
- if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
- u32 supp[3];
-
- memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
-
- if (!IS_POSIXACL(dentry->d_inode))
- supp[0] &= ~FATTR4_WORD0_ACL;
- if (!contextsupport)
- supp[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
- if (!supp[2]) {
- p = xdr_reserve_space(xdr, 12);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(2);
- *p++ = cpu_to_be32(supp[0]);
- *p++ = cpu_to_be32(supp[1]);
- } else {
- p = xdr_reserve_space(xdr, 16);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(3);
- *p++ = cpu_to_be32(supp[0]);
- *p++ = cpu_to_be32(supp[1]);
- *p++ = cpu_to_be32(supp[2]);
- }
- }
- if (bmval0 & FATTR4_WORD0_TYPE) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- dummy = nfs4_file_type(stat.mode);
- if (dummy == NF4BAD) {
- status = nfserr_serverfault;
+ for_each_set_bit(bit, (const unsigned long *)&u.mask,
+ ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)) {
+ status = nfsd4_enc_fattr4_encode_ops[bit](xdr, &args);
+ if (status != nfs_ok)
goto out;
- }
- *p++ = cpu_to_be32(dummy);
- }
- if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- if (exp->ex_flags & NFSEXP_NOSUBTREECHECK)
- *p++ = cpu_to_be32(NFS4_FH_PERSISTENT);
- else
- *p++ = cpu_to_be32(NFS4_FH_PERSISTENT|
- NFS4_FH_VOL_RENAME);
- }
- if (bmval0 & FATTR4_WORD0_CHANGE) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = encode_change(p, &stat, d_inode(dentry), exp);
}
- if (bmval0 & FATTR4_WORD0_SIZE) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, stat.size);
- }
- if (bmval0 & FATTR4_WORD0_LINK_SUPPORT) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval0 & FATTR4_WORD0_SYMLINK_SUPPORT) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval0 & FATTR4_WORD0_NAMED_ATTR) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(0);
- }
- if (bmval0 & FATTR4_WORD0_FSID) {
- p = xdr_reserve_space(xdr, 16);
- if (!p)
- goto out_resource;
- if (exp->ex_fslocs.migrated) {
- p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR);
- p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR);
- } else switch(fsid_source(fhp)) {
- case FSIDSOURCE_FSID:
- p = xdr_encode_hyper(p, (u64)exp->ex_fsid);
- p = xdr_encode_hyper(p, (u64)0);
- break;
- case FSIDSOURCE_DEV:
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(MAJOR(stat.dev));
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(MINOR(stat.dev));
- break;
- case FSIDSOURCE_UUID:
- p = xdr_encode_opaque_fixed(p, exp->ex_uuid,
- EX_UUID_LEN);
- break;
- }
- }
- if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(0);
- }
- if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(nn->nfsd4_lease);
- }
- if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(rdattr_err);
- }
- if (bmval0 & FATTR4_WORD0_ACL) {
- struct nfs4_ace *ace;
-
- if (acl == NULL) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
-
- *p++ = cpu_to_be32(0);
- goto out_acl;
- }
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(acl->naces);
-
- for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) {
- p = xdr_reserve_space(xdr, 4*3);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(ace->type);
- *p++ = cpu_to_be32(ace->flag);
- *p++ = cpu_to_be32(ace->access_mask &
- NFS4_ACE_MASK_ALL);
- status = nfsd4_encode_aclname(xdr, rqstp, ace);
- if (status)
- goto out;
- }
- }
-out_acl:
- if (bmval0 & FATTR4_WORD0_ACLSUPPORT) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(IS_POSIXACL(dentry->d_inode) ?
- ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0);
- }
- if (bmval0 & FATTR4_WORD0_CANSETTIME) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(0);
- }
- if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval0 & FATTR4_WORD0_CHOWN_RESTRICTED) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval0 & FATTR4_WORD0_FILEHANDLE) {
- p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4);
- if (!p)
- goto out_resource;
- p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw,
- fhp->fh_handle.fh_size);
- }
- if (bmval0 & FATTR4_WORD0_FILEID) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, stat.ino);
- }
- if (bmval0 & FATTR4_WORD0_FILES_AVAIL) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, (u64) statfs.f_ffree);
- }
- if (bmval0 & FATTR4_WORD0_FILES_FREE) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, (u64) statfs.f_ffree);
- }
- if (bmval0 & FATTR4_WORD0_FILES_TOTAL) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, (u64) statfs.f_files);
- }
- if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
- status = nfsd4_encode_fs_locations(xdr, rqstp, exp);
- if (status)
- goto out;
- }
- if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval0 & FATTR4_WORD0_MAXFILESIZE) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, exp->ex_path.mnt->mnt_sb->s_maxbytes);
- }
- if (bmval0 & FATTR4_WORD0_MAXLINK) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(255);
- }
- if (bmval0 & FATTR4_WORD0_MAXNAME) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(statfs.f_namelen);
- }
- if (bmval0 & FATTR4_WORD0_MAXREAD) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp));
- }
- if (bmval0 & FATTR4_WORD0_MAXWRITE) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp));
- }
- if (bmval1 & FATTR4_WORD1_MODE) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(stat.mode & S_IALLUGO);
- }
- if (bmval1 & FATTR4_WORD1_NO_TRUNC) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval1 & FATTR4_WORD1_NUMLINKS) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(stat.nlink);
- }
- if (bmval1 & FATTR4_WORD1_OWNER) {
- status = nfsd4_encode_user(xdr, rqstp, stat.uid);
- if (status)
- goto out;
- }
- if (bmval1 & FATTR4_WORD1_OWNER_GROUP) {
- status = nfsd4_encode_group(xdr, rqstp, stat.gid);
- if (status)
- goto out;
- }
- if (bmval1 & FATTR4_WORD1_RAWDEV) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32((u32) MAJOR(stat.rdev));
- *p++ = cpu_to_be32((u32) MINOR(stat.rdev));
- }
- if (bmval1 & FATTR4_WORD1_SPACE_AVAIL) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- dummy64 = (u64)statfs.f_bavail * (u64)statfs.f_bsize;
- p = xdr_encode_hyper(p, dummy64);
- }
- if (bmval1 & FATTR4_WORD1_SPACE_FREE) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- dummy64 = (u64)statfs.f_bfree * (u64)statfs.f_bsize;
- p = xdr_encode_hyper(p, dummy64);
- }
- if (bmval1 & FATTR4_WORD1_SPACE_TOTAL) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- dummy64 = (u64)statfs.f_blocks * (u64)statfs.f_bsize;
- p = xdr_encode_hyper(p, dummy64);
- }
- if (bmval1 & FATTR4_WORD1_SPACE_USED) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- dummy64 = (u64)stat.blocks << 9;
- p = xdr_encode_hyper(p, dummy64);
- }
- if (bmval1 & FATTR4_WORD1_TIME_ACCESS) {
- status = nfsd4_encode_nfstime4(xdr, &stat.atime);
- if (status)
- goto out;
- }
- if (bmval1 & FATTR4_WORD1_TIME_CREATE) {
- status = nfsd4_encode_nfstime4(xdr, &stat.btime);
- if (status)
- goto out;
- }
- if (bmval1 & FATTR4_WORD1_TIME_DELTA) {
- p = xdr_reserve_space(xdr, 12);
- if (!p)
- goto out_resource;
- p = encode_time_delta(p, d_inode(dentry));
- }
- if (bmval1 & FATTR4_WORD1_TIME_METADATA) {
- status = nfsd4_encode_nfstime4(xdr, &stat.ctime);
- if (status)
- goto out;
- }
- if (bmval1 & FATTR4_WORD1_TIME_MODIFY) {
- status = nfsd4_encode_nfstime4(xdr, &stat.mtime);
- if (status)
- goto out;
- }
- if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
- u64 ino = stat.ino;
-
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- /*
- * Get ino of mountpoint in parent filesystem, if not ignoring
- * crossmount and this is the root of a cross-mounted
- * filesystem.
- */
- if (ignore_crossmnt == 0 &&
- dentry == exp->ex_path.mnt->mnt_root) {
- err = nfsd4_get_mounted_on_ino(exp, &ino);
- if (err)
- goto out_nfserr;
- }
- p = xdr_encode_hyper(p, ino);
- }
-#ifdef CONFIG_NFSD_PNFS
- if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) {
- status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types);
- if (status)
- goto out;
- }
-
- if (bmval2 & FATTR4_WORD2_LAYOUT_TYPES) {
- status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types);
- if (status)
- goto out;
- }
-
- if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(stat.blksize);
- }
-#endif /* CONFIG_NFSD_PNFS */
- if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
- u32 supp[3];
-
- memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
- supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0;
- supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1;
- supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2;
-
- status = nfsd4_encode_bitmap(xdr, supp[0], supp[1], supp[2]);
- if (status)
- goto out;
- }
-
-#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
- if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
- status = nfsd4_encode_security_label(xdr, rqstp, context,
- contextlen);
- if (status)
- goto out;
- }
-#endif
-
- if (bmval2 & FATTR4_WORD2_XATTR_SUPPORT) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- err = xattr_supports_user_prefix(d_inode(dentry));
- *p++ = cpu_to_be32(err == 0);
- }
-
*attrlen_p = cpu_to_be32(xdr->buf->len - attrlen_offset - XDR_UNIT);
status = nfs_ok;
out:
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
- if (context)
- security_release_secctx(context, contextlen);
+ if (args.context)
+ security_release_secctx(args.context, args.contextlen);
#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
- kfree(acl);
+ kfree(args.acl);
if (tempfh) {
fh_put(tempfh);
kfree(tempfh);
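
The rewritten nfsd4_encode_fattr4() above drops the long chain of per-attribute "if (bmval & ...)" tests in favour of the nfsd4_enc_fattr4_encode_ops[] table, walked with for_each_set_bit() over the combined attribute mask, so each attribute has its own small encoder. A minimal user-space sketch of the same dispatch pattern follows; the mask layout, bit numbers and encoder names are illustrative, not the kernel's.

	#include <stdint.h>
	#include <stdio.h>

	typedef int (*attr_encoder)(void *ctx);

	static int encode_type(void *ctx) { (void)ctx; puts("type"); return 0; }
	static int encode_size(void *ctx) { (void)ctx; puts("size"); return 0; }
	static int encode_fsid(void *ctx) { (void)ctx; puts("fsid"); return 0; }

	/* Index in this table == bit position in the attribute mask. */
	static const attr_encoder encoders[] = {
		[1] = encode_type,
		[4] = encode_size,
		[8] = encode_fsid,
	};

	static int encode_attrs(uint64_t mask, void *ctx)
	{
		for (unsigned int bit = 0; bit < sizeof(encoders) / sizeof(encoders[0]); bit++) {
			if (!(mask & (1ULL << bit)) || !encoders[bit])
				continue;
			int err = encoders[bit](ctx);
			if (err)
				return err;
		}
		return 0;
	}

	int main(void)
	{
		return encode_attrs((1ULL << 1) | (1ULL << 4) | (1ULL << 8), NULL);
	}
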
@@ -3514,12 +3668,28 @@ __be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words,
__be32 ret;
svcxdr_init_encode_from_buffer(&xdr, &dummy, *p, words << 2);
- ret = nfsd4_encode_fattr(&xdr, fhp, exp, dentry, bmval, rqstp,
- ignore_crossmnt);
+ ret = nfsd4_encode_fattr4(rqstp, &xdr, fhp, exp, dentry, bmval,
+ ignore_crossmnt);
*p = xdr.p;
return ret;
}
+/*
+ * The buffer space for this field was reserved during a previous
+ * call to nfsd4_encode_entry4().
+ */
+static void nfsd4_encode_entry4_nfs_cookie4(const struct nfsd4_readdir *readdir,
+ u64 offset)
+{
+ __be64 cookie = cpu_to_be64(offset);
+ struct xdr_stream *xdr = readdir->xdr;
+
+ if (!readdir->cookie_offset)
+ return;
+ write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset, &cookie,
+ sizeof(cookie));
+}
+
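
nfsd4_encode_entry4_nfs_cookie4() above relies on a reserve-then-backfill idiom: nfsd4_encode_entry4() reserves space for each entry's cookie while encoding it, records the byte offset in cookie_offset, and the real value is written only once the next entry (or the end of the listing) supplies the directory offset to advertise. A self-contained sketch of the idiom against a plain byte buffer; the kernel does the equivalent through xdr_reserve_space() and write_bytes_to_xdr_buf().

	#include <stdint.h>
	#include <string.h>
	#include <stdio.h>

	struct buf {
		unsigned char data[256];
		size_t len;
	};

	/* Reserve 8 bytes for a big-endian u64 and return the offset for later backfill. */
	static size_t reserve_cookie(struct buf *b)
	{
		size_t off = b->len;

		memset(b->data + off, 0, 8);
		b->len += 8;
		return off;
	}

	static void backfill_cookie(struct buf *b, size_t off, uint64_t cookie)
	{
		for (int i = 0; i < 8; i++)
			b->data[off + i] = (unsigned char)(cookie >> (56 - 8 * i));
	}

	int main(void)
	{
		struct buf b = { .len = 0 };
		size_t slot = reserve_cookie(&b);  /* entry N: cookie not known yet */

		b.len += 16;                       /* ...entry N's name and attributes... */
		backfill_cookie(&b, slot, 42);     /* next entry's offset becomes N's cookie */
		printf("buffer length: %zu\n", b.len);
		return 0;
	}
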
static inline int attributes_need_mount(u32 *bmval)
{
if (bmval[0] & ~(FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_LEASE_TIME))
@@ -3530,8 +3700,8 @@ static inline int attributes_need_mount(u32 *bmval)
}
static __be32
-nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
- const char *name, int namlen)
+nfsd4_encode_entry4_fattr(struct nfsd4_readdir *cd, const char *name,
+ int namlen)
{
struct svc_export *exp = cd->rd_fhp->fh_export;
struct dentry *dentry;
@@ -3574,33 +3744,34 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
}
out_encode:
- nfserr = nfsd4_encode_fattr(xdr, NULL, exp, dentry, cd->rd_bmval,
- cd->rd_rqstp, ignore_crossmnt);
+ nfserr = nfsd4_encode_fattr4(cd->rd_rqstp, cd->xdr, NULL, exp, dentry,
+ cd->rd_bmval, ignore_crossmnt);
out_put:
dput(dentry);
exp_put(exp);
return nfserr;
}
-static __be32 *
-nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr)
+static __be32
+nfsd4_encode_entry4_rdattr_error(struct xdr_stream *xdr, __be32 nfserr)
{
- __be32 *p;
-
- p = xdr_reserve_space(xdr, 20);
- if (!p)
- return NULL;
- *p++ = htonl(2);
- *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */
- *p++ = htonl(0); /* bmval1 */
+ __be32 status;
- *p++ = htonl(4); /* attribute length */
- *p++ = nfserr; /* no htonl */
- return p;
+ /* attrmask */
+ status = nfsd4_encode_bitmap4(xdr, FATTR4_WORD0_RDATTR_ERROR, 0, 0);
+ if (status != nfs_ok)
+ return status;
+ /* attr_vals */
+ if (xdr_stream_encode_u32(xdr, XDR_UNIT) != XDR_UNIT)
+ return nfserr_resource;
+ /* rdattr_error */
+ if (xdr_stream_encode_be32(xdr, nfserr) != XDR_UNIT)
+ return nfserr_resource;
+ return nfs_ok;
}
static int
-nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
+nfsd4_encode_entry4(void *ccdv, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct readdir_cd *ccd = ccdv;
@@ -3611,8 +3782,6 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
u32 name_and_cookie;
int entry_bytes;
__be32 nfserr = nfserr_toosmall;
- __be64 wire_offset;
- __be32 *p;
/* In nfsv4, "." and ".." never make it onto the wire.. */
if (name && isdotent(name, namlen)) {
@@ -3620,24 +3789,19 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
return 0;
}
- if (cd->cookie_offset) {
- wire_offset = cpu_to_be64(offset);
- write_bytes_to_xdr_buf(xdr->buf, cd->cookie_offset,
- &wire_offset, 8);
- }
+ /* Encode the previous entry's cookie value */
+ nfsd4_encode_entry4_nfs_cookie4(cd, offset);
- p = xdr_reserve_space(xdr, 4);
- if (!p)
+ if (xdr_stream_encode_item_present(xdr) != XDR_UNIT)
goto fail;
- *p++ = xdr_one; /* mark entry present */
+
+ /* Reserve send buffer space for this entry's cookie value. */
cookie_offset = xdr->buf->len;
- p = xdr_reserve_space(xdr, 3*4 + namlen);
- if (!p)
+ if (nfsd4_encode_nfs_cookie4(xdr, OFFSET_MAX) != nfs_ok)
goto fail;
- p = xdr_encode_hyper(p, OFFSET_MAX); /* offset of next entry */
- p = xdr_encode_array(p, name, namlen); /* name length & name */
-
- nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen);
+ if (nfsd4_encode_component4(xdr, name, namlen) != nfs_ok)
+ goto fail;
+ nfserr = nfsd4_encode_entry4_fattr(cd, name, namlen);
switch (nfserr) {
case nfs_ok:
break;
@@ -3668,8 +3832,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
*/
if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR))
goto fail;
- p = nfsd4_encode_rdattr_error(xdr, nfserr);
- if (p == NULL) {
+ if (nfsd4_encode_entry4_rdattr_error(xdr, nfserr)) {
nfserr = nfserr_toosmall;
goto fail;
}
@@ -3727,18 +3890,26 @@ nfsd4_encode_clientid4(struct xdr_stream *xdr, const clientid_t *clientid)
return nfs_ok;
}
+/* This is a frequently-encoded item; open-coded for speed */
static __be32
-nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
+nfsd4_encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid)
{
__be32 *p;
- p = xdr_reserve_space(xdr, sizeof(stateid_t));
+ p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE);
if (!p)
return nfserr_resource;
*p++ = cpu_to_be32(sid->si_generation);
- p = xdr_encode_opaque_fixed(p, &sid->si_opaque,
- sizeof(stateid_opaque_t));
- return 0;
+ memcpy(p, &sid->si_opaque, sizeof(sid->si_opaque));
+ return nfs_ok;
+}
+
+static __be32
+nfsd4_encode_sessionid4(struct xdr_stream *xdr,
+ const struct nfs4_sessionid *sessionid)
+{
+ return nfsd4_encode_opaque_fixed(xdr, sessionid->data,
+ NFS4_MAX_SESSIONID_LEN);
}
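
nfsd4_encode_stateid4() above is deliberately open-coded: a stateid4 is a fixed 16-byte item (NFS4_STATEID_SIZE), a 4-byte big-endian si_generation followed by the 12 opaque bytes of si_opaque, so a single reservation plus a memcpy suffices and no length word or padding is involved. A small stand-alone sketch of that fixed layout; the struct and macro names here are illustrative.

	#include <stdint.h>
	#include <string.h>

	#define STATEID_OPAQUE_LEN 12   /* mirrors stateid_opaque_t; illustrative name */

	struct stateid {
		uint32_t generation;
		unsigned char opaque[STATEID_OPAQUE_LEN];
	};

	/* stateid4 on the wire: 4-byte big-endian generation + 12 opaque bytes = 16 bytes. */
	static size_t encode_stateid(unsigned char *out, const struct stateid *sid)
	{
		out[0] = (unsigned char)(sid->generation >> 24);
		out[1] = (unsigned char)(sid->generation >> 16);
		out[2] = (unsigned char)(sid->generation >> 8);
		out[3] = (unsigned char)sid->generation;
		memcpy(out + 4, sid->opaque, STATEID_OPAQUE_LEN);
		return 4 + STATEID_OPAQUE_LEN;
	}

	int main(void)
	{
		struct stateid sid = { .generation = 7 };
		unsigned char wire[16];

		return encode_stateid(wire, &sid) == 16 ? 0 : 1;
	}
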
static __be32
@@ -3747,14 +3918,14 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_access *access = &u->access;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
+ __be32 status;
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(access->ac_supported);
- *p++ = cpu_to_be32(access->ac_resp_access);
- return 0;
+ /* supported */
+ status = nfsd4_encode_uint32_t(xdr, access->ac_supported);
+ if (status != nfs_ok)
+ return status;
+ /* access */
+ return nfsd4_encode_uint32_t(xdr, access->ac_resp_access);
}
static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr,
@@ -3762,17 +3933,16 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
{
struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8);
- if (!p)
+ /* bctsr_sessid */
+ nfserr = nfsd4_encode_sessionid4(xdr, &bcts->sessionid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* bctsr_dir */
+ if (xdr_stream_encode_u32(xdr, bcts->dir) != XDR_UNIT)
return nfserr_resource;
- p = xdr_encode_opaque_fixed(p, bcts->sessionid.data,
- NFS4_MAX_SESSIONID_LEN);
- *p++ = cpu_to_be32(bcts->dir);
- /* Upshifting from TCP to RDMA is not supported */
- *p++ = cpu_to_be32(0);
- return 0;
+ /* bctsr_use_conn_in_rdma_mode */
+ return nfsd4_encode_bool(xdr, false);
}
static __be32
@@ -3782,7 +3952,8 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_close *close = &u->close;
struct xdr_stream *xdr = resp->xdr;
- return nfsd4_encode_stateid(xdr, &close->cl_stateid);
+ /* open_stateid */
+ return nfsd4_encode_stateid4(xdr, &close->cl_stateid);
}
@@ -3802,11 +3973,13 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_create *create = &u->create;
struct xdr_stream *xdr = resp->xdr;
+ /* cinfo */
nfserr = nfsd4_encode_change_info4(xdr, &create->cr_cinfo);
if (nfserr)
return nfserr;
- return nfsd4_encode_bitmap(xdr, create->cr_bmval[0],
- create->cr_bmval[1], create->cr_bmval[2]);
+ /* attrset */
+ return nfsd4_encode_bitmap4(xdr, create->cr_bmval[0],
+ create->cr_bmval[1], create->cr_bmval[2]);
}
static __be32
@@ -3817,65 +3990,56 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr,
struct svc_fh *fhp = getattr->ga_fhp;
struct xdr_stream *xdr = resp->xdr;
- return nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry,
- getattr->ga_bmval, resp->rqstp, 0);
+ /* obj_attributes */
+ return nfsd4_encode_fattr4(resp->rqstp, xdr, fhp, fhp->fh_export,
+ fhp->fh_dentry, getattr->ga_bmval, 0);
}
static __be32
nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
- struct svc_fh **fhpp = &u->getfh;
struct xdr_stream *xdr = resp->xdr;
- struct svc_fh *fhp = *fhpp;
- unsigned int len;
- __be32 *p;
+ struct svc_fh *fhp = u->getfh;
- len = fhp->fh_handle.fh_size;
- p = xdr_reserve_space(xdr, len + 4);
- if (!p)
- return nfserr_resource;
- p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw, len);
- return 0;
+ /* object */
+ return nfsd4_encode_nfs_fh4(xdr, &fhp->fh_handle);
}
-/*
-* Including all fields other than the name, a LOCK4denied structure requires
-* 8(clientid) + 4(namelen) + 8(offset) + 8(length) + 4(type) = 32 bytes.
-*/
static __be32
-nfsd4_encode_lock_denied(struct xdr_stream *xdr, struct nfsd4_lock_denied *ld)
+nfsd4_encode_lock_owner4(struct xdr_stream *xdr, const clientid_t *clientid,
+ const struct xdr_netobj *owner)
{
- struct xdr_netobj *conf = &ld->ld_owner;
- __be32 *p;
+ __be32 status;
-again:
- p = xdr_reserve_space(xdr, 32 + XDR_LEN(conf->len));
- if (!p) {
- /*
- * Don't fail to return the result just because we can't
- * return the conflicting open:
- */
- if (conf->len) {
- kfree(conf->data);
- conf->len = 0;
- conf->data = NULL;
- goto again;
- }
+ /* clientid */
+ status = nfsd4_encode_clientid4(xdr, clientid);
+ if (status != nfs_ok)
+ return status;
+ /* owner */
+ return nfsd4_encode_opaque(xdr, owner->data, owner->len);
+}
+
+static __be32
+nfsd4_encode_lock4denied(struct xdr_stream *xdr,
+ const struct nfsd4_lock_denied *ld)
+{
+ __be32 status;
+
+ /* offset */
+ status = nfsd4_encode_offset4(xdr, ld->ld_start);
+ if (status != nfs_ok)
+ return status;
+ /* length */
+ status = nfsd4_encode_length4(xdr, ld->ld_length);
+ if (status != nfs_ok)
+ return status;
+ /* locktype */
+ if (xdr_stream_encode_u32(xdr, ld->ld_type) != XDR_UNIT)
return nfserr_resource;
- }
- p = xdr_encode_hyper(p, ld->ld_start);
- p = xdr_encode_hyper(p, ld->ld_length);
- *p++ = cpu_to_be32(ld->ld_type);
- if (conf->len) {
- p = xdr_encode_opaque_fixed(p, &ld->ld_clientid, 8);
- p = xdr_encode_opaque(p, conf->data, conf->len);
- kfree(conf->data);
- } else { /* non - nfsv4 lock in conflict, no clientid nor owner */
- p = xdr_encode_hyper(p, (u64)0); /* clientid */
- *p++ = cpu_to_be32(0); /* length of owner name */
- }
- return nfserr_denied;
+ /* owner */
+ return nfsd4_encode_lock_owner4(xdr, &ld->ld_clientid,
+ &ld->ld_owner);
}
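
The owner field of the lock_owner4 encoded above goes out through nfsd4_encode_opaque(), i.e. as an XDR variable-length opaque: a 4-byte big-endian length, the bytes themselves, then zero padding up to the next 4-byte boundary. A compact sketch of that wire rule:

	#include <stddef.h>
	#include <string.h>

	/* XDR opaque<>: 4-byte big-endian length, the data, zero padding to a 4-byte boundary. */
	static size_t xdr_encode_opaque(unsigned char *out, const void *data, size_t len)
	{
		size_t padded = (len + 3) & ~(size_t)3;

		out[0] = (unsigned char)(len >> 24);
		out[1] = (unsigned char)(len >> 16);
		out[2] = (unsigned char)(len >> 8);
		out[3] = (unsigned char)len;
		memcpy(out + 4, data, len);
		memset(out + 4 + len, 0, padded - len);
		return 4 + padded;
	}

	int main(void)
	{
		unsigned char wire[32];

		/* "owner" is 5 bytes, so 3 padding bytes follow: 4 + 8 = 12 bytes total. */
		return xdr_encode_opaque(wire, "owner", 5) == 12 ? 0 : 1;
	}
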
static __be32
@@ -3884,13 +4048,21 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_lock *lock = &u->lock;
struct xdr_stream *xdr = resp->xdr;
+ __be32 status;
- if (!nfserr)
- nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid);
- else if (nfserr == nfserr_denied)
- nfserr = nfsd4_encode_lock_denied(xdr, &lock->lk_denied);
-
- return nfserr;
+ switch (nfserr) {
+ case nfs_ok:
+ /* resok4 */
+ status = nfsd4_encode_stateid4(xdr, &lock->lk_resp_stateid);
+ break;
+ case nfserr_denied:
+ /* denied */
+ status = nfsd4_encode_lock4denied(xdr, &lock->lk_denied);
+ break;
+ default:
+ return nfserr;
+ }
+ return status != nfs_ok ? status : nfserr;
}
static __be32
@@ -3899,9 +4071,14 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_lockt *lockt = &u->lockt;
struct xdr_stream *xdr = resp->xdr;
+ __be32 status;
- if (nfserr == nfserr_denied)
- nfsd4_encode_lock_denied(xdr, &lockt->lt_denied);
+ if (nfserr == nfserr_denied) {
+ /* denied */
+ status = nfsd4_encode_lock4denied(xdr, &lockt->lt_denied);
+ if (status != nfs_ok)
+ return status;
+ }
return nfserr;
}
@@ -3912,7 +4089,8 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_locku *locku = &u->locku;
struct xdr_stream *xdr = resp->xdr;
- return nfsd4_encode_stateid(xdr, &locku->lu_stateid);
+ /* lock_stateid */
+ return nfsd4_encode_stateid4(xdr, &locku->lu_stateid);
}
@@ -3926,104 +4104,159 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr,
return nfsd4_encode_change_info4(xdr, &link->li_cinfo);
}
+/*
+ * This implementation does not yet support returning an ACE in an
+ * OPEN that offers a delegation.
+ */
+static __be32
+nfsd4_encode_open_nfsace4(struct xdr_stream *xdr)
+{
+ __be32 status;
+
+ /* type */
+ status = nfsd4_encode_acetype4(xdr, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
+ if (status != nfs_ok)
+ return nfserr_resource;
+ /* flag */
+ status = nfsd4_encode_aceflag4(xdr, 0);
+ if (status != nfs_ok)
+ return nfserr_resource;
+ /* access mask */
+ status = nfsd4_encode_acemask4(xdr, 0);
+ if (status != nfs_ok)
+ return nfserr_resource;
+ /* who - empty for now */
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+ return nfserr_resource;
+ return nfs_ok;
+}
static __be32
-nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr,
- union nfsd4_op_u *u)
+nfsd4_encode_open_read_delegation4(struct xdr_stream *xdr, struct nfsd4_open *open)
{
- struct nfsd4_open *open = &u->open;
- struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
+ __be32 status;
- nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid);
- if (nfserr)
- return nfserr;
- nfserr = nfsd4_encode_change_info4(xdr, &open->op_cinfo);
- if (nfserr)
- return nfserr;
- if (xdr_stream_encode_u32(xdr, open->op_rflags) < 0)
+ /* stateid */
+ status = nfsd4_encode_stateid4(xdr, &open->op_delegate_stateid);
+ if (status != nfs_ok)
+ return status;
+ /* recall */
+ status = nfsd4_encode_bool(xdr, open->op_recall);
+ if (status != nfs_ok)
+ return status;
+ /* permissions */
+ return nfsd4_encode_open_nfsace4(xdr);
+}
+
+static __be32
+nfsd4_encode_nfs_space_limit4(struct xdr_stream *xdr, u64 filesize)
+{
+ /* limitby */
+ if (xdr_stream_encode_u32(xdr, NFS4_LIMIT_SIZE) != XDR_UNIT)
return nfserr_resource;
+ /* filesize */
+ return nfsd4_encode_uint64_t(xdr, filesize);
+}
- nfserr = nfsd4_encode_bitmap(xdr, open->op_bmval[0], open->op_bmval[1],
- open->op_bmval[2]);
- if (nfserr)
- return nfserr;
+static __be32
+nfsd4_encode_open_write_delegation4(struct xdr_stream *xdr,
+ struct nfsd4_open *open)
+{
+ __be32 status;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
+ /* stateid */
+ status = nfsd4_encode_stateid4(xdr, &open->op_delegate_stateid);
+ if (status != nfs_ok)
+ return status;
+ /* recall */
+ status = nfsd4_encode_bool(xdr, open->op_recall);
+ if (status != nfs_ok)
+ return status;
+ /* space_limit */
+ status = nfsd4_encode_nfs_space_limit4(xdr, 0);
+ if (status != nfs_ok)
+ return status;
+ return nfsd4_encode_open_nfsace4(xdr);
+}
+
+static __be32
+nfsd4_encode_open_none_delegation4(struct xdr_stream *xdr,
+ struct nfsd4_open *open)
+{
+ __be32 status = nfs_ok;
+
+ /* ond_why */
+ if (xdr_stream_encode_u32(xdr, open->op_why_no_deleg) != XDR_UNIT)
return nfserr_resource;
+ switch (open->op_why_no_deleg) {
+ case WND4_CONTENTION:
+ /* ond_server_will_push_deleg */
+ status = nfsd4_encode_bool(xdr, false);
+ break;
+ case WND4_RESOURCE:
+ /* ond_server_will_signal_avail */
+ status = nfsd4_encode_bool(xdr, false);
+ }
+ return status;
+}
+
+static __be32
+nfsd4_encode_open_delegation4(struct xdr_stream *xdr, struct nfsd4_open *open)
+{
+ __be32 status;
- *p++ = cpu_to_be32(open->op_delegate_type);
+ /* delegation_type */
+ if (xdr_stream_encode_u32(xdr, open->op_delegate_type) != XDR_UNIT)
+ return nfserr_resource;
switch (open->op_delegate_type) {
case NFS4_OPEN_DELEGATE_NONE:
+ status = nfs_ok;
break;
case NFS4_OPEN_DELEGATE_READ:
- nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid);
- if (nfserr)
- return nfserr;
- p = xdr_reserve_space(xdr, 20);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(open->op_recall);
-
- /*
- * TODO: ACE's in delegations
- */
- *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */
+ /* read */
+ status = nfsd4_encode_open_read_delegation4(xdr, open);
break;
case NFS4_OPEN_DELEGATE_WRITE:
- nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid);
- if (nfserr)
- return nfserr;
-
- p = xdr_reserve_space(xdr, XDR_UNIT * 8);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(open->op_recall);
-
- /*
- * Always flush on close
- *
- * TODO: space_limit's in delegations
- */
- *p++ = cpu_to_be32(NFS4_LIMIT_SIZE);
- *p++ = xdr_zero;
- *p++ = xdr_zero;
-
- /*
- * TODO: ACE's in delegations
- */
- *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */
+ /* write */
+ status = nfsd4_encode_open_write_delegation4(xdr, open);
break;
- case NFS4_OPEN_DELEGATE_NONE_EXT: /* 4.1 */
- switch (open->op_why_no_deleg) {
- case WND4_CONTENTION:
- case WND4_RESOURCE:
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(open->op_why_no_deleg);
- /* deleg signaling not supported yet: */
- *p++ = cpu_to_be32(0);
- break;
- default:
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(open->op_why_no_deleg);
- }
+ case NFS4_OPEN_DELEGATE_NONE_EXT:
+ /* od_whynone */
+ status = nfsd4_encode_open_none_delegation4(xdr, open);
break;
default:
- BUG();
+ status = nfserr_serverfault;
}
- /* XXX save filehandle here */
- return 0;
+
+ return status;
+}
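
nfsd4_encode_open_delegation4() above encodes an XDR discriminated union: the delegation_type discriminant goes on the wire first, then exactly one arm selected by the switch, and an unrecognised discriminant now yields nfserr_serverfault instead of the old BUG(). The same shape in a compact user-space sketch with made-up arm encoders:

	#include <stdint.h>
	#include <stdio.h>

	enum deleg_type { DELEG_NONE = 0, DELEG_READ = 1, DELEG_WRITE = 2 };

	static int encode_u32(uint32_t v)  { printf("u32 %u\n", v); return 0; }
	static int encode_read_arm(void)   { puts("read delegation body");  return 0; }
	static int encode_write_arm(void)  { puts("write delegation body"); return 0; }

	/* Discriminant first, then exactly one arm; unknown values are an error. */
	static int encode_delegation(enum deleg_type type)
	{
		int err = encode_u32(type);

		if (err)
			return err;
		switch (type) {
		case DELEG_NONE:
			return 0;              /* void arm */
		case DELEG_READ:
			return encode_read_arm();
		case DELEG_WRITE:
			return encode_write_arm();
		default:
			return -1;             /* analogous to nfserr_serverfault */
		}
	}

	int main(void)
	{
		return encode_delegation(DELEG_READ);
	}
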
+
+static __be32
+nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_open *open = &u->open;
+ struct xdr_stream *xdr = resp->xdr;
+
+ /* stateid */
+ nfserr = nfsd4_encode_stateid4(xdr, &open->op_stateid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* cinfo */
+ nfserr = nfsd4_encode_change_info4(xdr, &open->op_cinfo);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* rflags */
+ nfserr = nfsd4_encode_uint32_t(xdr, open->op_rflags);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* attrset */
+ nfserr = nfsd4_encode_bitmap4(xdr, open->op_bmval[0],
+ open->op_bmval[1], open->op_bmval[2]);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* delegation */
+ return nfsd4_encode_open_delegation4(xdr, open);
}
static __be32
@@ -4033,7 +4266,8 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_open_confirm *oc = &u->open_confirm;
struct xdr_stream *xdr = resp->xdr;
- return nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid);
+ /* open_stateid */
+ return nfsd4_encode_stateid4(xdr, &oc->oc_resp_stateid);
}
static __be32
@@ -4043,7 +4277,8 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_open_downgrade *od = &u->open_downgrade;
struct xdr_stream *xdr = resp->xdr;
- return nfsd4_encode_stateid(xdr, &od->od_stateid);
+ /* open_stateid */
+ return nfsd4_encode_stateid4(xdr, &od->od_stateid);
}
/*
@@ -4227,90 +4462,83 @@ out_err:
return nfserr;
}
-static __be32
-nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr,
- union nfsd4_op_u *u)
+static __be32 nfsd4_encode_dirlist4(struct xdr_stream *xdr,
+ struct nfsd4_readdir *readdir,
+ u32 max_payload)
{
- struct nfsd4_readdir *readdir = &u->readdir;
- int maxcount;
- int bytes_left;
+ int bytes_left, maxcount, starting_len = xdr->buf->len;
loff_t offset;
- __be64 wire_offset;
- struct xdr_stream *xdr = resp->xdr;
- int starting_len = xdr->buf->len;
- __be32 *p;
-
- nfserr = nfsd4_encode_verifier4(xdr, &readdir->rd_verf);
- if (nfserr != nfs_ok)
- return nfserr;
+ __be32 status;
/*
* Number of bytes left for directory entries allowing for the
- * final 8 bytes of the readdir and a following failed op:
+ * final 8 bytes of the readdir and a following failed op.
*/
- bytes_left = xdr->buf->buflen - xdr->buf->len
- - COMPOUND_ERR_SLACK_SPACE - 8;
- if (bytes_left < 0) {
- nfserr = nfserr_resource;
- goto err_no_verf;
- }
- maxcount = svc_max_payload(resp->rqstp);
- maxcount = min_t(u32, readdir->rd_maxcount, maxcount);
+ bytes_left = xdr->buf->buflen - xdr->buf->len -
+ COMPOUND_ERR_SLACK_SPACE - XDR_UNIT * 2;
+ if (bytes_left < 0)
+ return nfserr_resource;
+ maxcount = min_t(u32, readdir->rd_maxcount, max_payload);
+
/*
- * Note the rfc defines rd_maxcount as the size of the
- * READDIR4resok structure, which includes the verifier above
- * and the 8 bytes encoded at the end of this function:
+ * The RFC defines rd_maxcount as the size of the
+ * READDIR4resok structure, which includes the verifier
+ * and the 8 bytes encoded at the end of this function.
*/
- if (maxcount < 16) {
- nfserr = nfserr_toosmall;
- goto err_no_verf;
- }
- maxcount = min_t(int, maxcount-16, bytes_left);
+ if (maxcount < XDR_UNIT * 4)
+ return nfserr_toosmall;
+ maxcount = min_t(int, maxcount - XDR_UNIT * 4, bytes_left);
- /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */
+ /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0 */
if (!readdir->rd_dircount)
- readdir->rd_dircount = svc_max_payload(resp->rqstp);
+ readdir->rd_dircount = max_payload;
+ /* *entries */
readdir->xdr = xdr;
readdir->rd_maxcount = maxcount;
readdir->common.err = 0;
readdir->cookie_offset = 0;
-
offset = readdir->rd_cookie;
- nfserr = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp,
- &offset,
- &readdir->common, nfsd4_encode_dirent);
- if (nfserr == nfs_ok &&
- readdir->common.err == nfserr_toosmall &&
- xdr->buf->len == starting_len + 8) {
- /* nothing encoded; which limit did we hit?: */
- if (maxcount - 16 < bytes_left)
- /* It was the fault of rd_maxcount: */
- nfserr = nfserr_toosmall;
- else
- /* We ran out of buffer space: */
- nfserr = nfserr_resource;
+ status = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp, &offset,
+ &readdir->common, nfsd4_encode_entry4);
+ if (status)
+ return status;
+ if (readdir->common.err == nfserr_toosmall &&
+ xdr->buf->len == starting_len) {
+ /* No entries were encoded. Which limit did we hit? */
+ if (maxcount - XDR_UNIT * 4 < bytes_left)
+ /* It was the fault of rd_maxcount */
+ return nfserr_toosmall;
+ /* We ran out of buffer space */
+ return nfserr_resource;
}
- if (nfserr)
- goto err_no_verf;
+ /* Encode the final entry's cookie value */
+ nfsd4_encode_entry4_nfs_cookie4(readdir, offset);
+ /* No entries follow */
+ if (xdr_stream_encode_item_absent(xdr) != XDR_UNIT)
+ return nfserr_resource;
- if (readdir->cookie_offset) {
- wire_offset = cpu_to_be64(offset);
- write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset,
- &wire_offset, 8);
- }
+ /* eof */
+ return nfsd4_encode_bool(xdr, readdir->common.err == nfserr_eof);
+}
- p = xdr_reserve_space(xdr, 8);
- if (!p) {
- WARN_ON_ONCE(1);
- goto err_no_verf;
- }
- *p++ = 0; /* no more entries */
- *p++ = htonl(readdir->common.err == nfserr_eof);
+static __be32
+nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_readdir *readdir = &u->readdir;
+ struct xdr_stream *xdr = resp->xdr;
+ int starting_len = xdr->buf->len;
- return 0;
-err_no_verf:
- xdr_truncate_encode(xdr, starting_len);
+ /* cookieverf */
+ nfserr = nfsd4_encode_verifier4(xdr, &readdir->rd_verf);
+ if (nfserr != nfs_ok)
+ return nfserr;
+
+ /* reply */
+ nfserr = nfsd4_encode_dirlist4(xdr, readdir, svc_max_payload(resp->rqstp));
+ if (nfserr != nfs_ok)
+ xdr_truncate_encode(xdr, starting_len);
return nfserr;
}
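
nfsd4_encode_dirlist4() above computes two independent budgets before walking the directory: the space actually left in the send buffer (minus COMPOUND_ERR_SLACK_SPACE and the 8 bytes for the trailing entry-absent word and eof boolean) and the client's rd_maxcount capped at the transport payload (minus 16 bytes covering the 8-byte verifier plus that same trailer). The smaller of the two becomes the entry budget, which is also what lets the error path tell nfserr_toosmall apart from nfserr_resource when nothing fits. A tiny sketch of the arithmetic; the constants are stand-ins:

	#include <stdio.h>

	#define SLACK     16   /* stand-in for COMPOUND_ERR_SLACK_SPACE */
	#define XDR_UNIT   4

	/* Returns the byte budget for directory entries, or a negative error. */
	static int entry_budget(int buflen, int buf_used, unsigned int rd_maxcount,
				unsigned int max_payload)
	{
		int bytes_left = buflen - buf_used - SLACK - XDR_UNIT * 2;
		int maxcount;

		if (bytes_left < 0)
			return -1;                       /* like nfserr_resource */
		maxcount = rd_maxcount < max_payload ? rd_maxcount : max_payload;
		if (maxcount < XDR_UNIT * 4)
			return -2;                       /* like nfserr_toosmall */
		maxcount -= XDR_UNIT * 4;                /* verifier + trailer already spoken for */
		return maxcount < bytes_left ? maxcount : bytes_left;
	}

	int main(void)
	{
		printf("budget: %d\n", entry_budget(4096, 128, 1024, 1048576));
		return 0;
	}
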
@@ -4338,13 +4566,34 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr,
}
static __be32
+nfsd4_encode_rpcsec_gss_info(struct xdr_stream *xdr,
+ struct rpcsec_gss_info *info)
+{
+ __be32 status;
+
+ /* oid */
+ if (xdr_stream_encode_opaque(xdr, info->oid.data, info->oid.len) < 0)
+ return nfserr_resource;
+ /* qop */
+ status = nfsd4_encode_qop4(xdr, info->qop);
+ if (status != nfs_ok)
+ return status;
+ /* service */
+ if (xdr_stream_encode_u32(xdr, info->service) != XDR_UNIT)
+ return nfserr_resource;
+
+ return nfs_ok;
+}
+
+static __be32
nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp)
{
u32 i, nflavs, supported;
struct exp_flavor_info *flavs;
struct exp_flavor_info def_flavs[2];
- __be32 *p, *flavorsp;
static bool report = true;
+ __be32 *flavorsp;
+ __be32 status;
if (exp->ex_nflavors) {
flavs = exp->ex_flavors;
@@ -4367,10 +4616,9 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp)
}
supported = 0;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
+ flavorsp = xdr_reserve_space(xdr, XDR_UNIT);
+ if (!flavorsp)
return nfserr_resource;
- flavorsp = p++; /* to be backfilled later */
for (i = 0; i < nflavs; i++) {
rpc_authflavor_t pf = flavs[i].pseudoflavor;
@@ -4378,20 +4626,22 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp)
if (rpcauth_get_gssinfo(pf, &info) == 0) {
supported++;
- p = xdr_reserve_space(xdr, 4 + 4 +
- XDR_LEN(info.oid.len) + 4 + 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(RPC_AUTH_GSS);
- p = xdr_encode_opaque(p, info.oid.data, info.oid.len);
- *p++ = cpu_to_be32(info.qop);
- *p++ = cpu_to_be32(info.service);
+
+ /* flavor */
+ status = nfsd4_encode_uint32_t(xdr, RPC_AUTH_GSS);
+ if (status != nfs_ok)
+ return status;
+ /* flavor_info */
+ status = nfsd4_encode_rpcsec_gss_info(xdr, &info);
+ if (status != nfs_ok)
+ return status;
} else if (pf < RPC_AUTH_MAXFLAVOR) {
supported++;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(pf);
+
+ /* flavor */
+ status = nfsd4_encode_uint32_t(xdr, pf);
+ if (status != nfs_ok)
+ return status;
} else {
if (report)
pr_warn("NFS: SECINFO: security flavor %u "
@@ -4401,7 +4651,7 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp)
if (nflavs != supported)
report = false;
- *flavorsp = htonl(supported);
+ *flavorsp = cpu_to_be32(supported);
return 0;
}
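
nfsd4_do_encode_secinfo() above emits a counted array whose length is not known until the loop finishes: it reserves one XDR_UNIT for the count, streams only the flavors it can actually describe, and backfills the reserved word with the final supported value. A minimal stand-alone version of that count-backfill pattern:

	#include <stdint.h>
	#include <stdio.h>

	static unsigned char buf[64];
	static size_t buflen;

	static void put_be32(size_t off, uint32_t v)
	{
		buf[off + 0] = (unsigned char)(v >> 24);
		buf[off + 1] = (unsigned char)(v >> 16);
		buf[off + 2] = (unsigned char)(v >> 8);
		buf[off + 3] = (unsigned char)v;
	}

	int main(void)
	{
		const uint32_t flavors[] = { 1, 390003, 999999 };  /* made-up flavor numbers */
		size_t countp = buflen;                            /* reserve the count word */
		uint32_t supported = 0;

		buflen += 4;
		for (size_t i = 0; i < 3; i++) {
			if (flavors[i] > 390005)                   /* pretend this one is unknown */
				continue;
			put_be32(buflen, flavors[i]);
			buflen += 4;
			supported++;
		}
		put_be32(countp, supported);                       /* backfill the real count */
		printf("encoded %u of 3 flavors in %zu bytes\n", supported, buflen);
		return 0;
	}
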
@@ -4425,34 +4675,25 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp);
}
-/*
- * The SETATTR encode routine is special -- it always encodes a bitmap,
- * regardless of the error status.
- */
static __be32
nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
struct nfsd4_setattr *setattr = &u->setattr;
- struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
+ __be32 status;
- p = xdr_reserve_space(xdr, 16);
- if (!p)
- return nfserr_resource;
- if (nfserr) {
- *p++ = cpu_to_be32(3);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(0);
- }
- else {
- *p++ = cpu_to_be32(3);
- *p++ = cpu_to_be32(setattr->sa_bmval[0]);
- *p++ = cpu_to_be32(setattr->sa_bmval[1]);
- *p++ = cpu_to_be32(setattr->sa_bmval[2]);
+ switch (nfserr) {
+ case nfs_ok:
+ /* attrsset */
+ status = nfsd4_encode_bitmap4(resp->xdr, setattr->sa_bmval[0],
+ setattr->sa_bmval[1],
+ setattr->sa_bmval[2]);
+ break;
+ default:
+ /* attrsset */
+ status = nfsd4_encode_bitmap4(resp->xdr, 0, 0, 0);
}
- return nfserr;
+ return status != nfs_ok ? status : nfserr;
}
static __be32
@@ -4488,86 +4729,148 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
struct nfsd4_write *write = &u->write;
+ struct xdr_stream *xdr = resp->xdr;
- if (xdr_stream_encode_u32(resp->xdr, write->wr_bytes_written) < 0)
- return nfserr_resource;
- if (xdr_stream_encode_u32(resp->xdr, write->wr_how_written) < 0)
+ /* count */
+ nfserr = nfsd4_encode_count4(xdr, write->wr_bytes_written);
+ if (nfserr)
+ return nfserr;
+ /* committed */
+ if (xdr_stream_encode_u32(xdr, write->wr_how_written) != XDR_UNIT)
return nfserr_resource;
- return nfsd4_encode_verifier4(resp->xdr, &write->wr_verifier);
+ /* writeverf */
+ return nfsd4_encode_verifier4(xdr, &write->wr_verifier);
}
static __be32
-nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
- union nfsd4_op_u *u)
+nfsd4_encode_state_protect_ops4(struct xdr_stream *xdr,
+ struct nfsd4_exchange_id *exid)
{
- struct nfsd4_exchange_id *exid = &u->exchange_id;
- struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- char *major_id;
- char *server_scope;
- int major_id_sz;
- int server_scope_sz;
- uint64_t minor_id = 0;
- struct nfsd_net *nn = net_generic(SVC_NET(resp->rqstp), nfsd_net_id);
+ __be32 status;
- major_id = nn->nfsd_name;
- major_id_sz = strlen(nn->nfsd_name);
- server_scope = nn->nfsd_name;
- server_scope_sz = strlen(nn->nfsd_name);
+ /* spo_must_enforce */
+ status = nfsd4_encode_bitmap4(xdr, exid->spo_must_enforce[0],
+ exid->spo_must_enforce[1],
+ exid->spo_must_enforce[2]);
+ if (status != nfs_ok)
+ return status;
+ /* spo_must_allow */
+ return nfsd4_encode_bitmap4(xdr, exid->spo_must_allow[0],
+ exid->spo_must_allow[1],
+ exid->spo_must_allow[2]);
+}
- if (nfsd4_encode_clientid4(xdr, &exid->clientid) != nfs_ok)
- return nfserr_resource;
- if (xdr_stream_encode_u32(xdr, exid->seqid) < 0)
- return nfserr_resource;
- if (xdr_stream_encode_u32(xdr, exid->flags) < 0)
- return nfserr_resource;
+static __be32
+nfsd4_encode_state_protect4_r(struct xdr_stream *xdr, struct nfsd4_exchange_id *exid)
+{
+ __be32 status;
- if (xdr_stream_encode_u32(xdr, exid->spa_how) < 0)
+ if (xdr_stream_encode_u32(xdr, exid->spa_how) != XDR_UNIT)
return nfserr_resource;
switch (exid->spa_how) {
case SP4_NONE:
+ status = nfs_ok;
break;
case SP4_MACH_CRED:
- /* spo_must_enforce bitmap: */
- nfserr = nfsd4_encode_bitmap(xdr,
- exid->spo_must_enforce[0],
- exid->spo_must_enforce[1],
- exid->spo_must_enforce[2]);
- if (nfserr)
- return nfserr;
- /* spo_must_allow bitmap: */
- nfserr = nfsd4_encode_bitmap(xdr,
- exid->spo_must_allow[0],
- exid->spo_must_allow[1],
- exid->spo_must_allow[2]);
- if (nfserr)
- return nfserr;
+ /* spr_mach_ops */
+ status = nfsd4_encode_state_protect_ops4(xdr, exid);
break;
default:
- WARN_ON_ONCE(1);
+ status = nfserr_serverfault;
}
+ return status;
+}
- p = xdr_reserve_space(xdr,
- 8 /* so_minor_id */ +
- 4 /* so_major_id.len */ +
- (XDR_QUADLEN(major_id_sz) * 4) +
- 4 /* eir_server_scope.len */ +
- (XDR_QUADLEN(server_scope_sz) * 4) +
- 4 /* eir_server_impl_id.count (0) */);
- if (!p)
+static __be32
+nfsd4_encode_server_owner4(struct xdr_stream *xdr, struct svc_rqst *rqstp)
+{
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ __be32 status;
+
+ /* so_minor_id */
+ status = nfsd4_encode_uint64_t(xdr, 0);
+ if (status != nfs_ok)
+ return status;
+ /* so_major_id */
+ return nfsd4_encode_opaque(xdr, nn->nfsd_name, strlen(nn->nfsd_name));
+}
+
+static __be32
+nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
+{
+ struct nfsd_net *nn = net_generic(SVC_NET(resp->rqstp), nfsd_net_id);
+ struct nfsd4_exchange_id *exid = &u->exchange_id;
+ struct xdr_stream *xdr = resp->xdr;
+
+ /* eir_clientid */
+ nfserr = nfsd4_encode_clientid4(xdr, &exid->clientid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* eir_sequenceid */
+ nfserr = nfsd4_encode_sequenceid4(xdr, exid->seqid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* eir_flags */
+ nfserr = nfsd4_encode_uint32_t(xdr, exid->flags);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* eir_state_protect */
+ nfserr = nfsd4_encode_state_protect4_r(xdr, exid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* eir_server_owner */
+ nfserr = nfsd4_encode_server_owner4(xdr, resp->rqstp);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* eir_server_scope */
+ nfserr = nfsd4_encode_opaque(xdr, nn->nfsd_name,
+ strlen(nn->nfsd_name));
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* eir_server_impl_id<1> */
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
return nfserr_resource;
- /* The server_owner struct */
- p = xdr_encode_hyper(p, minor_id); /* Minor id */
- /* major id */
- p = xdr_encode_opaque(p, major_id, major_id_sz);
+ return nfs_ok;
+}
- /* Server scope */
- p = xdr_encode_opaque(p, server_scope, server_scope_sz);
+static __be32
+nfsd4_encode_channel_attrs4(struct xdr_stream *xdr,
+ const struct nfsd4_channel_attrs *attrs)
+{
+ __be32 status;
- /* Implementation id */
- *p++ = cpu_to_be32(0); /* zero length nfs_impl_id4 array */
- return 0;
+ /* ca_headerpadsize */
+ status = nfsd4_encode_count4(xdr, 0);
+ if (status != nfs_ok)
+ return status;
+ /* ca_maxrequestsize */
+ status = nfsd4_encode_count4(xdr, attrs->maxreq_sz);
+ if (status != nfs_ok)
+ return status;
+ /* ca_maxresponsesize */
+ status = nfsd4_encode_count4(xdr, attrs->maxresp_sz);
+ if (status != nfs_ok)
+ return status;
+ /* ca_maxresponsesize_cached */
+ status = nfsd4_encode_count4(xdr, attrs->maxresp_cached);
+ if (status != nfs_ok)
+ return status;
+ /* ca_maxoperations */
+ status = nfsd4_encode_count4(xdr, attrs->maxops);
+ if (status != nfs_ok)
+ return status;
+ /* ca_maxrequests */
+ status = nfsd4_encode_count4(xdr, attrs->maxreqs);
+ if (status != nfs_ok)
+ return status;
+ /* ca_rdma_ird<1> */
+ if (xdr_stream_encode_u32(xdr, attrs->nr_rdma_attrs) != XDR_UNIT)
+ return nfserr_resource;
+ if (attrs->nr_rdma_attrs)
+ return nfsd4_encode_uint32_t(xdr, attrs->rdma_attrs);
+ return nfs_ok;
}
static __be32
@@ -4576,52 +4879,25 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_create_session *sess = &u->create_session;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
-
- p = xdr_reserve_space(xdr, 24);
- if (!p)
- return nfserr_resource;
- p = xdr_encode_opaque_fixed(p, sess->sessionid.data,
- NFS4_MAX_SESSIONID_LEN);
- *p++ = cpu_to_be32(sess->seqid);
- *p++ = cpu_to_be32(sess->flags);
- p = xdr_reserve_space(xdr, 28);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(0); /* headerpadsz */
- *p++ = cpu_to_be32(sess->fore_channel.maxreq_sz);
- *p++ = cpu_to_be32(sess->fore_channel.maxresp_sz);
- *p++ = cpu_to_be32(sess->fore_channel.maxresp_cached);
- *p++ = cpu_to_be32(sess->fore_channel.maxops);
- *p++ = cpu_to_be32(sess->fore_channel.maxreqs);
- *p++ = cpu_to_be32(sess->fore_channel.nr_rdma_attrs);
-
- if (sess->fore_channel.nr_rdma_attrs) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(sess->fore_channel.rdma_attrs);
- }
-
- p = xdr_reserve_space(xdr, 28);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(0); /* headerpadsz */
- *p++ = cpu_to_be32(sess->back_channel.maxreq_sz);
- *p++ = cpu_to_be32(sess->back_channel.maxresp_sz);
- *p++ = cpu_to_be32(sess->back_channel.maxresp_cached);
- *p++ = cpu_to_be32(sess->back_channel.maxops);
- *p++ = cpu_to_be32(sess->back_channel.maxreqs);
- *p++ = cpu_to_be32(sess->back_channel.nr_rdma_attrs);
-
- if (sess->back_channel.nr_rdma_attrs) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(sess->back_channel.rdma_attrs);
- }
- return 0;
+ /* csr_sessionid */
+ nfserr = nfsd4_encode_sessionid4(xdr, &sess->sessionid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* csr_sequence */
+ nfserr = nfsd4_encode_sequenceid4(xdr, sess->seqid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* csr_flags */
+ nfserr = nfsd4_encode_uint32_t(xdr, sess->flags);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* csr_fore_chan_attrs */
+ nfserr = nfsd4_encode_channel_attrs4(xdr, &sess->fore_channel);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* csr_back_chan_attrs */
+ return nfsd4_encode_channel_attrs4(xdr, &sess->back_channel);
}
static __be32
@@ -4630,22 +4906,35 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_sequence *seq = &u->sequence;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20);
- if (!p)
- return nfserr_resource;
- p = xdr_encode_opaque_fixed(p, seq->sessionid.data,
- NFS4_MAX_SESSIONID_LEN);
- *p++ = cpu_to_be32(seq->seqid);
- *p++ = cpu_to_be32(seq->slotid);
+ /* sr_sessionid */
+ nfserr = nfsd4_encode_sessionid4(xdr, &seq->sessionid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* sr_sequenceid */
+ nfserr = nfsd4_encode_sequenceid4(xdr, seq->seqid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* sr_slotid */
+ nfserr = nfsd4_encode_slotid4(xdr, seq->slotid);
+ if (nfserr != nfs_ok)
+ return nfserr;
/* Note slotid's are numbered from zero: */
- *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_highest_slotid */
- *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_target_highest_slotid */
- *p++ = cpu_to_be32(seq->status_flags);
+ /* sr_highest_slotid */
+ nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots - 1);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* sr_target_highest_slotid */
+ nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots - 1);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* sr_status_flags */
+ nfserr = nfsd4_encode_uint32_t(xdr, seq->status_flags);
+ if (nfserr != nfs_ok)
+ return nfserr;
resp->cstate.data_offset = xdr->buf->len; /* DRC cache data pointer */
- return 0;
+ return nfs_ok;
}
static __be32
@@ -4653,125 +4942,132 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
struct nfsd4_test_stateid *test_stateid = &u->test_stateid;
- struct xdr_stream *xdr = resp->xdr;
struct nfsd4_test_stateid_id *stateid, *next;
- __be32 *p;
+ struct xdr_stream *xdr = resp->xdr;
- p = xdr_reserve_space(xdr, 4 + (4 * test_stateid->ts_num_ids));
- if (!p)
+ /* tsr_status_codes<> */
+ if (xdr_stream_encode_u32(xdr, test_stateid->ts_num_ids) != XDR_UNIT)
return nfserr_resource;
- *p++ = htonl(test_stateid->ts_num_ids);
-
- list_for_each_entry_safe(stateid, next, &test_stateid->ts_stateid_list, ts_id_list) {
- *p++ = stateid->ts_id_status;
+ list_for_each_entry_safe(stateid, next,
+ &test_stateid->ts_stateid_list, ts_id_list) {
+ if (xdr_stream_encode_be32(xdr, stateid->ts_id_status) != XDR_UNIT)
+ return nfserr_resource;
}
-
- return 0;
+ return nfs_ok;
}
#ifdef CONFIG_NFSD_PNFS
static __be32
-nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
- union nfsd4_op_u *u)
+nfsd4_encode_device_addr4(struct xdr_stream *xdr,
+ const struct nfsd4_getdeviceinfo *gdev)
{
- struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo;
- struct xdr_stream *xdr = resp->xdr;
+ u32 needed_len, starting_len = xdr->buf->len;
const struct nfsd4_layout_ops *ops;
- u32 starting_len = xdr->buf->len, needed_len;
- __be32 *p;
+ __be32 status;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
+ /* da_layout_type */
+ if (xdr_stream_encode_u32(xdr, gdev->gd_layout_type) != XDR_UNIT)
return nfserr_resource;
-
- *p++ = cpu_to_be32(gdev->gd_layout_type);
-
+ /* da_addr_body */
ops = nfsd4_layout_ops[gdev->gd_layout_type];
- nfserr = ops->encode_getdeviceinfo(xdr, gdev);
- if (nfserr) {
+ status = ops->encode_getdeviceinfo(xdr, gdev);
+ if (status != nfs_ok) {
/*
- * We don't bother to burden the layout drivers with
- * enforcing gd_maxcount, just tell the client to
- * come back with a bigger buffer if it's not enough.
+ * Don't burden the layout drivers with enforcing
+ * gd_maxcount. Just tell the client to come back
+ * with a bigger buffer if it's not enough.
*/
- if (xdr->buf->len + 4 > gdev->gd_maxcount)
+ if (xdr->buf->len + XDR_UNIT > gdev->gd_maxcount)
goto toosmall;
- return nfserr;
+ return status;
}
- if (gdev->gd_notify_types) {
- p = xdr_reserve_space(xdr, 4 + 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(1); /* bitmap length */
- *p++ = cpu_to_be32(gdev->gd_notify_types);
- } else {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = 0;
- }
+ return nfs_ok;
- return 0;
toosmall:
- dprintk("%s: maxcount too small\n", __func__);
- needed_len = xdr->buf->len + 4 /* notifications */;
+ needed_len = xdr->buf->len + XDR_UNIT; /* notifications */
xdr_truncate_encode(xdr, starting_len);
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(needed_len);
+
+ status = nfsd4_encode_count4(xdr, needed_len);
+ if (status != nfs_ok)
+ return status;
return nfserr_toosmall;
}
static __be32
-nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
+nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
- struct nfsd4_layoutget *lgp = &u->layoutget;
+ struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo;
struct xdr_stream *xdr = resp->xdr;
- const struct nfsd4_layout_ops *ops;
- __be32 *p;
-
- p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t));
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(1); /* we always set return-on-close */
- *p++ = cpu_to_be32(lgp->lg_sid.si_generation);
- p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque,
- sizeof(stateid_opaque_t));
+ /* gdir_device_addr */
+ nfserr = nfsd4_encode_device_addr4(xdr, gdev);
+ if (nfserr)
+ return nfserr;
+ /* gdir_notification */
+ return nfsd4_encode_bitmap4(xdr, gdev->gd_notify_types, 0, 0);
+}
- *p++ = cpu_to_be32(1); /* we always return a single layout */
- p = xdr_encode_hyper(p, lgp->lg_seg.offset);
- p = xdr_encode_hyper(p, lgp->lg_seg.length);
- *p++ = cpu_to_be32(lgp->lg_seg.iomode);
- *p++ = cpu_to_be32(lgp->lg_layout_type);
+static __be32
+nfsd4_encode_layout4(struct xdr_stream *xdr, const struct nfsd4_layoutget *lgp)
+{
+ const struct nfsd4_layout_ops *ops = nfsd4_layout_ops[lgp->lg_layout_type];
+ __be32 status;
- ops = nfsd4_layout_ops[lgp->lg_layout_type];
+ /* lo_offset */
+ status = nfsd4_encode_offset4(xdr, lgp->lg_seg.offset);
+ if (status != nfs_ok)
+ return status;
+ /* lo_length */
+ status = nfsd4_encode_length4(xdr, lgp->lg_seg.length);
+ if (status != nfs_ok)
+ return status;
+ /* lo_iomode */
+ if (xdr_stream_encode_u32(xdr, lgp->lg_seg.iomode) != XDR_UNIT)
+ return nfserr_resource;
+ /* lo_content */
+ if (xdr_stream_encode_u32(xdr, lgp->lg_layout_type) != XDR_UNIT)
+ return nfserr_resource;
return ops->encode_layoutget(xdr, lgp);
}
static __be32
+nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_layoutget *lgp = &u->layoutget;
+ struct xdr_stream *xdr = resp->xdr;
+
+ /* logr_return_on_close */
+ nfserr = nfsd4_encode_bool(xdr, true);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* logr_stateid */
+ nfserr = nfsd4_encode_stateid4(xdr, &lgp->lg_sid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* logr_layout<> */
+ if (xdr_stream_encode_u32(xdr, 1) != XDR_UNIT)
+ return nfserr_resource;
+ return nfsd4_encode_layout4(xdr, lgp);
+}
+
+static __be32
nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
struct nfsd4_layoutcommit *lcp = &u->layoutcommit;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(lcp->lc_size_chg);
- if (lcp->lc_size_chg) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- return nfserr_resource;
- p = xdr_encode_hyper(p, lcp->lc_newsize);
- }
-
- return 0;
+ /* ns_sizechanged */
+ nfserr = nfsd4_encode_bool(xdr, lcp->lc_size_chg);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ if (lcp->lc_size_chg)
+ /* ns_size */
+ return nfsd4_encode_length4(xdr, lcp->lc_newsize);
+ return nfs_ok;
}
static __be32
@@ -4780,103 +5076,108 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_layoutreturn *lrp = &u->layoutreturn;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(lrp->lrs_present);
+ /* lrs_present */
+ nfserr = nfsd4_encode_bool(xdr, lrp->lrs_present);
+ if (nfserr != nfs_ok)
+ return nfserr;
if (lrp->lrs_present)
- return nfsd4_encode_stateid(xdr, &lrp->lr_sid);
- return 0;
+ /* lrs_stateid */
+ return nfsd4_encode_stateid4(xdr, &lrp->lr_sid);
+ return nfs_ok;
}
#endif /* CONFIG_NFSD_PNFS */
static __be32
-nfsd42_encode_write_res(struct nfsd4_compoundres *resp,
- struct nfsd42_write_res *write, bool sync)
+nfsd4_encode_write_response4(struct xdr_stream *xdr,
+ const struct nfsd4_copy *copy)
{
- __be32 *p;
- p = xdr_reserve_space(resp->xdr, 4);
- if (!p)
- return nfserr_resource;
+ const struct nfsd42_write_res *write = &copy->cp_res;
+ u32 count = nfsd4_copy_is_sync(copy) ? 0 : 1;
+ __be32 status;
- if (sync)
- *p++ = cpu_to_be32(0);
- else {
- __be32 nfserr;
- *p++ = cpu_to_be32(1);
- nfserr = nfsd4_encode_stateid(resp->xdr, &write->cb_stateid);
- if (nfserr)
- return nfserr;
+ /* wr_callback_id<1> */
+ if (xdr_stream_encode_u32(xdr, count) != XDR_UNIT)
+ return nfserr_resource;
+ if (count) {
+ status = nfsd4_encode_stateid4(xdr, &write->cb_stateid);
+ if (status != nfs_ok)
+ return status;
}
- p = xdr_reserve_space(resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE);
- if (!p)
+
+ /* wr_count */
+ status = nfsd4_encode_length4(xdr, write->wr_bytes_written);
+ if (status != nfs_ok)
+ return status;
+ /* wr_committed */
+ if (xdr_stream_encode_u32(xdr, write->wr_stable_how) != XDR_UNIT)
return nfserr_resource;
+ /* wr_writeverf */
+ return nfsd4_encode_verifier4(xdr, &write->wr_verifier);
+}
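
The wr_callback_id<1> field encoded above is the usual XDR way of expressing an optional value: a counted array whose length is 0 for a synchronous copy and 1 (followed by the callback stateid) for an asynchronous one. A short illustrative sketch of the pattern with placeholder encoders:

	#include <stdint.h>
	#include <stdio.h>

	static int encode_u32(uint32_t v) { printf("u32 %u\n", v); return 0; }
	static int encode_stateid(void)   { puts("stateid body"); return 0; }

	/* XDR "optional" as a counted array of zero or one elements, e.g. wr_callback_id<1>. */
	static int encode_optional_stateid(int present)
	{
		int err = encode_u32(present ? 1 : 0);

		if (err)
			return err;
		return present ? encode_stateid() : 0;
	}

	int main(void)
	{
		return encode_optional_stateid(1);
	}
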
- p = xdr_encode_hyper(p, write->wr_bytes_written);
- *p++ = cpu_to_be32(write->wr_stable_how);
- p = xdr_encode_opaque_fixed(p, write->wr_verifier.data,
- NFS4_VERIFIER_SIZE);
- return nfs_ok;
+static __be32 nfsd4_encode_copy_requirements4(struct xdr_stream *xdr,
+ const struct nfsd4_copy *copy)
+{
+ __be32 status;
+
+ /* cr_consecutive */
+ status = nfsd4_encode_bool(xdr, true);
+ if (status != nfs_ok)
+ return status;
+ /* cr_synchronous */
+ return nfsd4_encode_bool(xdr, nfsd4_copy_is_sync(copy));
}
static __be32
-nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns)
+nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
- struct xdr_stream *xdr = resp->xdr;
- struct nfs42_netaddr *addr;
- __be32 *p;
+ struct nfsd4_copy *copy = &u->copy;
- p = xdr_reserve_space(xdr, 4);
- *p++ = cpu_to_be32(ns->nl4_type);
+ nfserr = nfsd4_encode_write_response4(resp->xdr, copy);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ return nfsd4_encode_copy_requirements4(resp->xdr, copy);
+}
+static __be32
+nfsd4_encode_netloc4(struct xdr_stream *xdr, const struct nl4_server *ns)
+{
+ __be32 status;
+
+ if (xdr_stream_encode_u32(xdr, ns->nl4_type) != XDR_UNIT)
+ return nfserr_resource;
switch (ns->nl4_type) {
case NL4_NETADDR:
- addr = &ns->u.nl4_addr;
-
- /* netid_len, netid, uaddr_len, uaddr (port included
- * in RPCBIND_MAXUADDRLEN)
- */
- p = xdr_reserve_space(xdr,
- 4 /* netid len */ +
- (XDR_QUADLEN(addr->netid_len) * 4) +
- 4 /* uaddr len */ +
- (XDR_QUADLEN(addr->addr_len) * 4));
- if (!p)
- return nfserr_resource;
-
- *p++ = cpu_to_be32(addr->netid_len);
- p = xdr_encode_opaque_fixed(p, addr->netid,
- addr->netid_len);
- *p++ = cpu_to_be32(addr->addr_len);
- p = xdr_encode_opaque_fixed(p, addr->addr,
- addr->addr_len);
+ /* nl_addr */
+ status = nfsd4_encode_netaddr4(xdr, &ns->u.nl4_addr);
break;
default:
- WARN_ON_ONCE(ns->nl4_type != NL4_NETADDR);
- return nfserr_inval;
+ status = nfserr_serverfault;
}
-
- return 0;
+ return status;
}
static __be32
-nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
- union nfsd4_op_u *u)
+nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
- struct nfsd4_copy *copy = &u->copy;
- __be32 *p;
+ struct nfsd4_copy_notify *cn = &u->copy_notify;
+ struct xdr_stream *xdr = resp->xdr;
- nfserr = nfsd42_encode_write_res(resp, &copy->cp_res,
- nfsd4_copy_is_sync(copy));
+ /* cnr_lease_time */
+ nfserr = nfsd4_encode_nfstime4(xdr, &cn->cpn_lease_time);
if (nfserr)
return nfserr;
-
- p = xdr_reserve_space(resp->xdr, 4 + 4);
- *p++ = xdr_one; /* cr_consecutive */
- *p = nfsd4_copy_is_sync(copy) ? xdr_one : xdr_zero;
- return 0;
+ /* cnr_stateid */
+ nfserr = nfsd4_encode_stateid4(xdr, &cn->cpn_cnr_stateid);
+ if (nfserr)
+ return nfserr;
+ /* cnr_source_server<> */
+ if (xdr_stream_encode_u32(xdr, 1) != XDR_UNIT)
+ return nfserr_resource;
+ return nfsd4_encode_netloc4(xdr, cn->cpn_src);
}
static __be32
@@ -4885,14 +5186,15 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_offload_status *os = &u->offload_status;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, 8 + 4);
- if (!p)
+ /* osr_count */
+ nfserr = nfsd4_encode_length4(xdr, os->count);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* osr_complete<1> */
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
return nfserr_resource;
- p = xdr_encode_hyper(p, os->count);
- *p++ = cpu_to_be32(0);
- return nfserr;
+ return nfs_ok;
}
static __be32
@@ -4970,53 +5272,18 @@ out:
}
static __be32
-nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr,
- union nfsd4_op_u *u)
-{
- struct nfsd4_copy_notify *cn = &u->copy_notify;
- struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
-
- if (nfserr)
- return nfserr;
-
- /* 8 sec, 4 nsec */
- p = xdr_reserve_space(xdr, 12);
- if (!p)
- return nfserr_resource;
-
- /* cnr_lease_time */
- p = xdr_encode_hyper(p, cn->cpn_sec);
- *p++ = cpu_to_be32(cn->cpn_nsec);
-
- /* cnr_stateid */
- nfserr = nfsd4_encode_stateid(xdr, &cn->cpn_cnr_stateid);
- if (nfserr)
- return nfserr;
-
- /* cnr_src.nl_nsvr */
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
-
- *p++ = cpu_to_be32(1);
-
- nfserr = nfsd42_encode_nl4_server(resp, cn->cpn_src);
- return nfserr;
-}
-
-static __be32
nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
struct nfsd4_seek *seek = &u->seek;
- __be32 *p;
-
- p = xdr_reserve_space(resp->xdr, 4 + 8);
- *p++ = cpu_to_be32(seek->seek_eof);
- p = xdr_encode_hyper(p, seek->seek_pos);
+ struct xdr_stream *xdr = resp->xdr;
- return 0;
+ /* sr_eof */
+ nfserr = nfsd4_encode_bool(xdr, seek->seek_eof);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* sr_offset */
+ return nfsd4_encode_offset4(xdr, seek->seek_pos);
}
static __be32
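Note: the rewritten encoders above replace open-coded xdr_reserve_space() arithmetic with small per-XDR-type helpers (nfsd4_encode_bool(), nfsd4_encode_offset4(), and so on, defined later in this patch in xdr4.h) that each return nfs_ok or nfserr_resource, so a result encoder becomes a short chain of calls with early returns. A minimal standalone sketch of that shape, using hypothetical encode_bool()/encode_u64() helpers over a plain byte buffer rather than the kernel's struct xdr_stream:

/* Standalone sketch: XDR-style per-type encoders with early-return chaining.
 * Hypothetical helpers; not the kernel's xdr_stream API.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define XDR_UNIT 4

enum status { NFS_OK = 0, NFSERR_RESOURCE = 1 };

struct cursor {
	unsigned char *p;	/* next free byte */
	unsigned char *end;	/* one past the end of the buffer */
};

/* Reserve "len" bytes, or fail if the buffer is exhausted. */
static unsigned char *reserve(struct cursor *c, size_t len)
{
	unsigned char *p = c->p;

	if ((size_t)(c->end - c->p) < len)
		return NULL;
	c->p += len;
	return p;
}

/* XDR bool: one 4-byte big-endian word, 0 or 1. */
static enum status encode_bool(struct cursor *c, int val)
{
	unsigned char *p = reserve(c, XDR_UNIT);

	if (!p)
		return NFSERR_RESOURCE;
	memset(p, 0, XDR_UNIT);
	p[3] = val ? 1 : 0;
	return NFS_OK;
}

/* XDR uint64: two 4-byte words, big-endian. */
static enum status encode_u64(struct cursor *c, uint64_t val)
{
	unsigned char *p = reserve(c, 2 * XDR_UNIT);
	int i;

	if (!p)
		return NFSERR_RESOURCE;
	for (i = 7; i >= 0; i--) {
		p[i] = val & 0xff;
		val >>= 8;
	}
	return NFS_OK;
}

/* Mirrors the SEEK result shape: sr_eof (bool) then sr_offset (uint64). */
static enum status encode_seek_result(struct cursor *c, int eof, uint64_t pos)
{
	enum status status = encode_bool(c, eof);

	if (status != NFS_OK)
		return status;
	return encode_u64(c, pos);
}

int main(void)
{
	unsigned char buf[12];
	struct cursor c = { buf, buf + sizeof(buf) };
	size_t i;

	if (encode_seek_result(&c, 1, 0x1234) != NFS_OK)
		return 1;
	for (i = 0; i < sizeof(buf); i++)
		printf("%02x%s", buf[i], (i % XDR_UNIT == 3) ? "\n" : " ");
	return 0;
}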
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 80621a709510..d3273a396659 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -201,26 +201,29 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
{
unsigned int hashsize;
unsigned int i;
- int status = 0;
nn->max_drc_entries = nfsd_cache_size_limit();
atomic_set(&nn->num_drc_entries, 0);
hashsize = nfsd_hashsize(nn->max_drc_entries);
nn->maskbits = ilog2(hashsize);
- nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan;
- nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count;
- nn->nfsd_reply_cache_shrinker.seeks = 1;
- status = register_shrinker(&nn->nfsd_reply_cache_shrinker,
- "nfsd-reply:%s", nn->nfsd_name);
- if (status)
- return status;
-
nn->drc_hashtbl = kvzalloc(array_size(hashsize,
sizeof(*nn->drc_hashtbl)), GFP_KERNEL);
if (!nn->drc_hashtbl)
+ return -ENOMEM;
+
+ nn->nfsd_reply_cache_shrinker = shrinker_alloc(0, "nfsd-reply:%s",
+ nn->nfsd_name);
+ if (!nn->nfsd_reply_cache_shrinker)
goto out_shrinker;
+ nn->nfsd_reply_cache_shrinker->scan_objects = nfsd_reply_cache_scan;
+ nn->nfsd_reply_cache_shrinker->count_objects = nfsd_reply_cache_count;
+ nn->nfsd_reply_cache_shrinker->seeks = 1;
+ nn->nfsd_reply_cache_shrinker->private_data = nn;
+
+ shrinker_register(nn->nfsd_reply_cache_shrinker);
+
for (i = 0; i < hashsize; i++) {
INIT_LIST_HEAD(&nn->drc_hashtbl[i].lru_head);
spin_lock_init(&nn->drc_hashtbl[i].cache_lock);
@@ -229,7 +232,7 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
return 0;
out_shrinker:
- unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
+ kvfree(nn->drc_hashtbl);
printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
return -ENOMEM;
}
@@ -239,7 +242,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
struct nfsd_cacherep *rp;
unsigned int i;
- unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
+ shrinker_free(nn->nfsd_reply_cache_shrinker);
for (i = 0; i < nn->drc_hashsize; i++) {
struct list_head *head = &nn->drc_hashtbl[i].lru_head;
@@ -323,8 +326,7 @@ nfsd_prune_bucket_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b,
static unsigned long
nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
- struct nfsd_net *nn = container_of(shrink,
- struct nfsd_net, nfsd_reply_cache_shrinker);
+ struct nfsd_net *nn = shrink->private_data;
return atomic_read(&nn->num_drc_entries);
}
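Note: the shrinker callbacks above stop using container_of() to recover struct nfsd_net from an embedded shrinker and instead read shrink->private_data, which shrinker_alloc() lets the owner set. For reference, a small standalone illustration of the container_of idiom the old code relied on (the struct names here are invented for the example):

/* Standalone illustration of the container_of idiom; not kernel code. */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct shrinker {
	int seeks;
};

struct nfsd_net_like {
	long num_entries;
	struct shrinker reply_cache_shrinker;	/* embedded member */
};

/* Old-style callback: only the member pointer is passed in. */
static long count_objects(struct shrinker *shrink)
{
	struct nfsd_net_like *nn =
		container_of(shrink, struct nfsd_net_like,
			     reply_cache_shrinker);

	return nn->num_entries;
}

int main(void)
{
	struct nfsd_net_like nn = { .num_entries = 42 };

	/* Passing &nn.reply_cache_shrinker still lets us reach nn. */
	printf("%ld\n", count_objects(&nn.reply_cache_shrinker));
	return 0;
}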
@@ -343,8 +345,7 @@ nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
static unsigned long
nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
{
- struct nfsd_net *nn = container_of(shrink,
- struct nfsd_net, nfsd_reply_cache_shrinker);
+ struct nfsd_net *nn = shrink->private_data;
unsigned long freed = 0;
LIST_HEAD(dispose);
unsigned int i;
@@ -368,33 +369,52 @@ nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
return freed;
}
-/*
- * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
+/**
+ * nfsd_cache_csum - Checksum incoming NFS Call arguments
+ * @buf: buffer containing a whole RPC Call message
+ * @start: starting byte of the NFS Call header
+ * @remaining: size of the NFS Call header, in bytes
+ *
+ * Compute a weak checksum of the leading bytes of an NFS procedure
+ * call header to help verify that a retransmitted Call matches an
+ * entry in the duplicate reply cache.
+ *
+ * To avoid assumptions about how the RPC message is laid out in
+ * @buf and what else it might contain (e.g., a GSS MIC suffix), the
+ * caller passes us the exact location and length of the NFS Call
+ * header.
+ *
+ * Returns a 32-bit checksum value, as defined in RFC 793.
*/
-static __wsum
-nfsd_cache_csum(struct svc_rqst *rqstp)
+static __wsum nfsd_cache_csum(struct xdr_buf *buf, unsigned int start,
+ unsigned int remaining)
{
+ unsigned int base, len;
+ struct xdr_buf subbuf;
+ __wsum csum = 0;
+ void *p;
int idx;
- unsigned int base;
- __wsum csum;
- struct xdr_buf *buf = &rqstp->rq_arg;
- const unsigned char *p = buf->head[0].iov_base;
- size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len,
- RC_CSUMLEN);
- size_t len = min(buf->head[0].iov_len, csum_len);
+
+ if (remaining > RC_CSUMLEN)
+ remaining = RC_CSUMLEN;
+ if (xdr_buf_subsegment(buf, &subbuf, start, remaining))
+ return csum;
/* rq_arg.head first */
- csum = csum_partial(p, len, 0);
- csum_len -= len;
+ if (subbuf.head[0].iov_len) {
+ len = min_t(unsigned int, subbuf.head[0].iov_len, remaining);
+ csum = csum_partial(subbuf.head[0].iov_base, len, csum);
+ remaining -= len;
+ }
/* Continue into page array */
- idx = buf->page_base / PAGE_SIZE;
- base = buf->page_base & ~PAGE_MASK;
- while (csum_len) {
- p = page_address(buf->pages[idx]) + base;
- len = min_t(size_t, PAGE_SIZE - base, csum_len);
+ idx = subbuf.page_base / PAGE_SIZE;
+ base = subbuf.page_base & ~PAGE_MASK;
+ while (remaining) {
+ p = page_address(subbuf.pages[idx]) + base;
+ len = min_t(unsigned int, PAGE_SIZE - base, remaining);
csum = csum_partial(p, len, csum);
- csum_len -= len;
+ remaining -= len;
base = 0;
++idx;
}
@@ -465,6 +485,8 @@ out:
/**
* nfsd_cache_lookup - Find an entry in the duplicate reply cache
* @rqstp: Incoming Call to find
+ * @start: starting byte in @rqstp->rq_arg of the NFS Call header
+ * @len: size of the NFS Call header, in bytes
* @cacherep: OUT: DRC entry for this request
*
* Try to find an entry matching the current call in the cache. When none
@@ -478,7 +500,8 @@ out:
* %RC_REPLY: Reply from cache
* %RC_DROPIT: Do not process the request further
*/
-int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep)
+int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
+ unsigned int len, struct nfsd_cacherep **cacherep)
{
struct nfsd_net *nn;
struct nfsd_cacherep *rp, *found;
@@ -494,7 +517,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep)
goto out;
}
- csum = nfsd_cache_csum(rqstp);
+ csum = nfsd_cache_csum(&rqstp->rq_arg, start, len);
/*
* Since the common case is a cache miss followed by an insert,
@@ -640,24 +663,17 @@ void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp,
return;
}
-/*
- * Copy cached reply to current reply buffer. Should always fit.
- * FIXME as reply is in a page, we should just attach the page, and
- * keep a refcount....
- */
static int
nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
{
- struct kvec *vec = &rqstp->rq_res.head[0];
-
- if (vec->iov_len + data->iov_len > PAGE_SIZE) {
- printk(KERN_WARNING "nfsd: cached reply too large (%zd).\n",
- data->iov_len);
- return 0;
- }
- memcpy((char*)vec->iov_base + vec->iov_len, data->iov_base, data->iov_len);
- vec->iov_len += data->iov_len;
- return 1;
+ __be32 *p;
+
+ p = xdr_reserve_space(&rqstp->rq_res_stream, data->iov_len);
+ if (unlikely(!p))
+ return false;
+ memcpy(p, data->iov_base, data->iov_len);
+ xdr_commit_encode(&rqstp->rq_res_stream);
+ return true;
}
/*
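Note: nfsd_cache_csum() above now checksums only the NFS Call header: it takes a subsegment of the xdr_buf, folds in the head kvec first, then walks the page array, stopping after at most RC_CSUMLEN bytes. A standalone sketch of that capped walk, assuming a simplified two-part buffer and a plain additive 32-bit accumulator in place of the kernel's csum_partial():

/* Standalone sketch of a capped checksum walk over head + page-like chunks.
 * Simplified stand-ins for struct xdr_buf and csum_partial(); illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

#define RC_CSUMLEN	256
#define CHUNK_SIZE	64	/* stand-in for PAGE_SIZE */

struct flat_buf {
	const unsigned char *head;
	size_t head_len;
	const unsigned char *chunks[4];	/* stand-in for the page array */
};

/* Byte-wise running 32-bit sum; csum_partial() is folded ones' complement,
 * but any cheap accumulator illustrates the walk. */
static uint32_t csum_bytes(const unsigned char *p, size_t len, uint32_t csum)
{
	while (len--)
		csum += *p++;
	return csum;
}

static uint32_t cache_csum(const struct flat_buf *buf, size_t remaining)
{
	uint32_t csum = 0;
	size_t len, idx = 0;

	if (remaining > RC_CSUMLEN)
		remaining = RC_CSUMLEN;

	/* head first */
	len = buf->head_len < remaining ? buf->head_len : remaining;
	csum = csum_bytes(buf->head, len, csum);
	remaining -= len;

	/* then continue into the chunk array */
	while (remaining) {
		len = remaining < CHUNK_SIZE ? remaining : CHUNK_SIZE;
		csum = csum_bytes(buf->chunks[idx++], len, csum);
		remaining -= len;
	}
	return csum;
}

int main(void)
{
	static const unsigned char head[32] = "nfs call header";
	static const unsigned char page0[CHUNK_SIZE] = "payload...";
	struct flat_buf buf = {
		.head = head, .head_len = sizeof(head),
		.chunks = { page0 },
	};

	printf("csum=%#x\n",
	       (unsigned)cache_csum(&buf, sizeof(head) + sizeof(page0)));
	return 0;
}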
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 7ed02fb88a36..87fed75808ff 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -26,6 +26,7 @@
#include "pnfs.h"
#include "filecache.h"
#include "trace.h"
+#include "netlink.h"
/*
* We have a single directory with several nodes in it.
@@ -692,6 +693,7 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred
char *mesg = buf;
int fd, err;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct svc_serv *serv;
err = get_int(&mesg, &fd);
if (err != 0 || fd < 0)
@@ -702,13 +704,15 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred
if (err != 0)
return err;
- err = svc_addsock(nn->nfsd_serv, net, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred);
+ serv = nn->nfsd_serv;
+ err = svc_addsock(serv, net, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred);
- if (err >= 0 &&
- !nn->nfsd_serv->sv_nrthreads && !xchg(&nn->keep_active, 1))
- svc_get(nn->nfsd_serv);
+ if (err < 0 && !serv->sv_nrthreads && !nn->keep_active)
+ nfsd_last_thread(net);
+ else if (err >= 0 && !serv->sv_nrthreads && !xchg(&nn->keep_active, 1))
+ svc_get(serv);
- nfsd_put(net);
+ svc_put(serv);
return err;
}
@@ -722,6 +726,7 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr
struct svc_xprt *xprt;
int port, err;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct svc_serv *serv;
if (sscanf(buf, "%15s %5u", transport, &port) != 2)
return -EINVAL;
@@ -734,29 +739,33 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr
if (err != 0)
return err;
- err = svc_xprt_create(nn->nfsd_serv, transport, net,
+ serv = nn->nfsd_serv;
+ err = svc_xprt_create(serv, transport, net,
PF_INET, port, SVC_SOCK_ANONYMOUS, cred);
if (err < 0)
goto out_err;
- err = svc_xprt_create(nn->nfsd_serv, transport, net,
+ err = svc_xprt_create(serv, transport, net,
PF_INET6, port, SVC_SOCK_ANONYMOUS, cred);
if (err < 0 && err != -EAFNOSUPPORT)
goto out_close;
- if (!nn->nfsd_serv->sv_nrthreads && !xchg(&nn->keep_active, 1))
- svc_get(nn->nfsd_serv);
+ if (!serv->sv_nrthreads && !xchg(&nn->keep_active, 1))
+ svc_get(serv);
- nfsd_put(net);
+ svc_put(serv);
return 0;
out_close:
- xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port);
+ xprt = svc_find_xprt(serv, transport, net, PF_INET, port);
if (xprt != NULL) {
svc_xprt_close(xprt);
svc_xprt_put(xprt);
}
out_err:
- nfsd_put(net);
+ if (!serv->sv_nrthreads && !nn->keep_active)
+ nfsd_last_thread(net);
+
+ svc_put(serv);
return err;
}
@@ -1132,7 +1141,7 @@ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode)
/* Following advice from simple_fill_super documentation: */
inode->i_ino = iunique(sb, NFSD_MaxReserved);
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
switch (mode & S_IFMT) {
case S_IFDIR:
inode->i_fop = &simple_dir_operations;
@@ -1496,6 +1505,200 @@ static int create_proc_exports_entry(void)
unsigned int nfsd_net_id;
/**
+ * nfsd_nl_rpc_status_get_start - Prepare rpc_status_get dumpit
+ * @cb: netlink metadata and command arguments
+ *
+ * Return values:
+ * %0: The rpc_status_get command may proceed
+ * %-ENODEV: There is no NFSD running in this namespace
+ */
+int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb)
+{
+ struct nfsd_net *nn = net_generic(sock_net(cb->skb->sk), nfsd_net_id);
+ int ret = -ENODEV;
+
+ mutex_lock(&nfsd_mutex);
+ if (nn->nfsd_serv)
+ ret = 0;
+ else
+ mutex_unlock(&nfsd_mutex);
+
+ return ret;
+}
+
+static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct nfsd_genl_rqstp *rqstp)
+{
+ void *hdr;
+ u32 i;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &nfsd_nl_family, 0, NFSD_CMD_RPC_STATUS_GET);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put_be32(skb, NFSD_A_RPC_STATUS_XID, rqstp->rq_xid) ||
+ nla_put_u32(skb, NFSD_A_RPC_STATUS_FLAGS, rqstp->rq_flags) ||
+ nla_put_u32(skb, NFSD_A_RPC_STATUS_PROG, rqstp->rq_prog) ||
+ nla_put_u32(skb, NFSD_A_RPC_STATUS_PROC, rqstp->rq_proc) ||
+ nla_put_u8(skb, NFSD_A_RPC_STATUS_VERSION, rqstp->rq_vers) ||
+ nla_put_s64(skb, NFSD_A_RPC_STATUS_SERVICE_TIME,
+ ktime_to_us(rqstp->rq_stime),
+ NFSD_A_RPC_STATUS_PAD))
+ return -ENOBUFS;
+
+ switch (rqstp->rq_saddr.sa_family) {
+ case AF_INET: {
+ const struct sockaddr_in *s_in, *d_in;
+
+ s_in = (const struct sockaddr_in *)&rqstp->rq_saddr;
+ d_in = (const struct sockaddr_in *)&rqstp->rq_daddr;
+ if (nla_put_in_addr(skb, NFSD_A_RPC_STATUS_SADDR4,
+ s_in->sin_addr.s_addr) ||
+ nla_put_in_addr(skb, NFSD_A_RPC_STATUS_DADDR4,
+ d_in->sin_addr.s_addr) ||
+ nla_put_be16(skb, NFSD_A_RPC_STATUS_SPORT,
+ s_in->sin_port) ||
+ nla_put_be16(skb, NFSD_A_RPC_STATUS_DPORT,
+ d_in->sin_port))
+ return -ENOBUFS;
+ break;
+ }
+ case AF_INET6: {
+ const struct sockaddr_in6 *s_in, *d_in;
+
+ s_in = (const struct sockaddr_in6 *)&rqstp->rq_saddr;
+ d_in = (const struct sockaddr_in6 *)&rqstp->rq_daddr;
+ if (nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_SADDR6,
+ &s_in->sin6_addr) ||
+ nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_DADDR6,
+ &d_in->sin6_addr) ||
+ nla_put_be16(skb, NFSD_A_RPC_STATUS_SPORT,
+ s_in->sin6_port) ||
+ nla_put_be16(skb, NFSD_A_RPC_STATUS_DPORT,
+ d_in->sin6_port))
+ return -ENOBUFS;
+ break;
+ }
+ }
+
+ for (i = 0; i < rqstp->rq_opcnt; i++)
+ if (nla_put_u32(skb, NFSD_A_RPC_STATUS_COMPOUND_OPS,
+ rqstp->rq_opnum[i]))
+ return -ENOBUFS;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+}
+
+/**
+ * nfsd_nl_rpc_status_get_dumpit - Handle rpc_status_get dumpit
+ * @skb: reply buffer
+ * @cb: netlink metadata and command arguments
+ *
+ * Returns the size of the reply or a negative errno.
+ */
+int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id);
+ int i, ret, rqstp_index = 0;
+
+ rcu_read_lock();
+
+ for (i = 0; i < nn->nfsd_serv->sv_nrpools; i++) {
+ struct svc_rqst *rqstp;
+
+ if (i < cb->args[0]) /* already consumed */
+ continue;
+
+ rqstp_index = 0;
+ list_for_each_entry_rcu(rqstp,
+ &nn->nfsd_serv->sv_pools[i].sp_all_threads,
+ rq_all) {
+ struct nfsd_genl_rqstp genl_rqstp;
+ unsigned int status_counter;
+
+ if (rqstp_index++ < cb->args[1]) /* already consumed */
+ continue;
+ /*
+			 * Acquire rq_status_counter before parsing the rqstp
+			 * fields. The counter is set to an odd value to tell
+			 * consumers that the rqstp fields are meaningful.
+ */
+ status_counter =
+ smp_load_acquire(&rqstp->rq_status_counter);
+ if (!(status_counter & 1))
+ continue;
+
+ genl_rqstp.rq_xid = rqstp->rq_xid;
+ genl_rqstp.rq_flags = rqstp->rq_flags;
+ genl_rqstp.rq_vers = rqstp->rq_vers;
+ genl_rqstp.rq_prog = rqstp->rq_prog;
+ genl_rqstp.rq_proc = rqstp->rq_proc;
+ genl_rqstp.rq_stime = rqstp->rq_stime;
+ genl_rqstp.rq_opcnt = 0;
+ memcpy(&genl_rqstp.rq_daddr, svc_daddr(rqstp),
+ sizeof(struct sockaddr));
+ memcpy(&genl_rqstp.rq_saddr, svc_addr(rqstp),
+ sizeof(struct sockaddr));
+
+#ifdef CONFIG_NFSD_V4
+ if (rqstp->rq_vers == NFS4_VERSION &&
+ rqstp->rq_proc == NFSPROC4_COMPOUND) {
+ /* NFSv4 compound */
+ struct nfsd4_compoundargs *args;
+ int j;
+
+ args = rqstp->rq_argp;
+ genl_rqstp.rq_opcnt = args->opcnt;
+ for (j = 0; j < genl_rqstp.rq_opcnt; j++)
+ genl_rqstp.rq_opnum[j] =
+ args->ops[j].opnum;
+ }
+#endif /* CONFIG_NFSD_V4 */
+
+ /*
+			 * Re-read rq_status_counter; if it changed while the
+			 * rqstp fields were being copied, skip this entry
+			 * rather than report stale data.
+ */
+ if (smp_load_acquire(&rqstp->rq_status_counter) !=
+ status_counter)
+ continue;
+
+ ret = nfsd_genl_rpc_status_compose_msg(skb, cb,
+ &genl_rqstp);
+ if (ret)
+ goto out;
+ }
+ }
+
+ cb->args[0] = i;
+ cb->args[1] = rqstp_index;
+ ret = skb->len;
+out:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/**
+ * nfsd_nl_rpc_status_get_done - rpc_status_get dumpit post-processing
+ * @cb: netlink metadata and command arguments
+ *
+ * Return values:
+ * %0: Success
+ */
+int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb)
+{
+ mutex_unlock(&nfsd_mutex);
+
+ return 0;
+}
+
+/**
* nfsd_net_init - Prepare the nfsd_net portion of a new net namespace
* @net: a freshly-created network namespace
*
@@ -1589,6 +1792,10 @@ static int __init init_nfsd(void)
retval = register_filesystem(&nfsd_fs_type);
if (retval)
goto out_free_all;
+ retval = genl_register_family(&nfsd_nl_family);
+ if (retval)
+ goto out_free_all;
+
return 0;
out_free_all:
nfsd4_destroy_laundry_wq();
@@ -1613,6 +1820,7 @@ out_free_slabs:
static void __exit exit_nfsd(void)
{
+ genl_unregister_family(&nfsd_nl_family);
unregister_filesystem(&nfsd_fs_type);
nfsd4_destroy_laundry_wq();
unregister_cld_notifier();
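Note: nfsd_nl_rpc_status_get_dumpit() above follows the usual netlink dump pattern: a pass may stop early when the reply skb fills up, so the callback saves its position in cb->args[0] (pool) and cb->args[1] (thread) and skips already-consumed entries when it is called again. A standalone, slightly restructured sketch of that resumable two-level walk; emit(), the pool/thread sizes and the per-pass budget are invented for the example:

/* Standalone sketch of a resumable two-level dump, netlink-dumpit style. */
#include <stdbool.h>
#include <stdio.h>

#define NPOOLS   3
#define NTHREADS 4

struct dump_state {
	int args[2];	/* [0] = pool to resume at, [1] = thread to resume at */
};

/* Pretend the reply buffer only holds this many records per pass. */
static int budget;

static bool emit(int pool, int thread)
{
	if (budget == 0)
		return false;	/* "skb full": stop and resume next pass */
	budget--;
	printf("pool %d thread %d\n", pool, thread);
	return true;
}

/* One dump pass; returns true once the whole table has been emitted. */
static bool dump_pass(struct dump_state *cb)
{
	int i, j;

	for (i = cb->args[0]; i < NPOOLS; i++) {
		for (j = cb->args[1]; j < NTHREADS; j++) {
			if (!emit(i, j)) {
				/* Buffer full: resume at this exact entry. */
				cb->args[0] = i;
				cb->args[1] = j;
				return false;
			}
		}
		cb->args[1] = 0;	/* next pool starts at its first thread */
	}
	cb->args[0] = NPOOLS;
	return true;
}

int main(void)
{
	struct dump_state cb = { { 0, 0 } };
	int pass = 0;

	do {
		budget = 5;	/* refill the "reply buffer" for each pass */
		printf("-- pass %d --\n", ++pass);
	} while (!dump_pass(&cb));
	return 0;
}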
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 11c14faa6c67..9ed0e08d16c2 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -62,6 +62,23 @@ struct readdir_cd {
__be32 err; /* 0, nfserr, or nfserr_eof */
};
+/* Maximum number of operations per session compound */
+#define NFSD_MAX_OPS_PER_COMPOUND 50
+
+struct nfsd_genl_rqstp {
+ struct sockaddr rq_daddr;
+ struct sockaddr rq_saddr;
+ unsigned long rq_flags;
+ ktime_t rq_stime;
+ __be32 rq_xid;
+ u32 rq_vers;
+ u32 rq_prog;
+ u32 rq_proc;
+
+ /* NFSv4 compound */
+ u32 rq_opcnt;
+ u32 rq_opnum[NFSD_MAX_OPS_PER_COMPOUND];
+};
extern struct svc_program nfsd_program;
extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4;
@@ -96,13 +113,6 @@ int nfsd_pool_stats_open(struct inode *, struct file *);
int nfsd_pool_stats_release(struct inode *, struct file *);
void nfsd_shutdown_threads(struct net *net);
-static inline void nfsd_put(struct net *net)
-{
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-
- svc_put(nn->nfsd_serv);
-}
-
bool i_am_nfsd(void);
struct nfsdfs_client {
@@ -138,6 +148,7 @@ int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change);
int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change);
void nfsd_reset_versions(struct nfsd_net *nn);
int nfsd_create_serv(struct net *net);
+void nfsd_last_thread(struct net *net);
extern int nfsd_max_blksize;
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 355bf0db3235..dbfa0ac13564 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -771,7 +771,7 @@ enum fsid_source fsid_source(const struct svc_fh *fhp)
* assume that the new change attr is always logged to stable storage in some
* fashion before the results can be seen.
*/
-u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode)
+u64 nfsd4_change_attribute(const struct kstat *stat, const struct inode *inode)
{
u64 chattr;
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 40426f899e76..6ebdf7ea27bf 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -293,7 +293,8 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp)
fhp->fh_pre_saved = false;
}
-u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode);
+u64 nfsd4_change_attribute(const struct kstat *stat,
+ const struct inode *inode);
__be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp);
__be32 fh_fill_post_attrs(struct svc_fh *fhp);
__be32 __must_check fh_fill_both_attrs(struct svc_fh *fhp);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index c7af1095f6b5..7a2bc8e82a63 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -542,7 +542,7 @@ static struct notifier_block nfsd_inet6addr_notifier = {
/* Only used under nfsd_mutex, so this atomic may be overkill: */
static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0);
-static void nfsd_last_thread(struct net *net)
+void nfsd_last_thread(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct svc_serv *serv = nn->nfsd_serv;
@@ -572,7 +572,6 @@ static void nfsd_last_thread(struct net *net)
return;
nfsd_shutdown_net(net);
- pr_info("nfsd: last server has exited, flushing export cache\n");
nfsd_export_flush(net);
}
@@ -713,14 +712,13 @@ int nfsd_nrpools(struct net *net)
int nfsd_get_nrthreads(int n, int *nthreads, struct net *net)
{
- int i = 0;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct svc_serv *serv = nn->nfsd_serv;
+ int i;
- if (nn->nfsd_serv != NULL) {
- for (i = 0; i < nn->nfsd_serv->sv_nrpools && i < n; i++)
- nthreads[i] = nn->nfsd_serv->sv_pools[i].sp_nrthreads;
- }
-
+ if (serv)
+ for (i = 0; i < serv->sv_nrpools && i < n; i++)
+ nthreads[i] = atomic_read(&serv->sv_pools[i].sp_nrthreads);
return 0;
}
@@ -787,7 +785,6 @@ int
nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
{
int error;
- bool nfsd_up_before;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct svc_serv *serv;
@@ -807,8 +804,6 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
error = nfsd_create_serv(net);
if (error)
goto out;
-
- nfsd_up_before = nn->nfsd_net_up;
serv = nn->nfsd_serv;
error = nfsd_startup_net(net, cred);
@@ -816,17 +811,15 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
goto out_put;
error = svc_set_num_threads(serv, NULL, nrservs);
if (error)
- goto out_shutdown;
+ goto out_put;
error = serv->sv_nrthreads;
- if (error == 0)
- nfsd_last_thread(net);
-out_shutdown:
- if (error < 0 && !nfsd_up_before)
- nfsd_shutdown_net(net);
out_put:
/* Threads now hold service active */
if (xchg(&nn->keep_active, 0))
svc_put(serv);
+
+ if (serv->sv_nrthreads == 0)
+ nfsd_last_thread(net);
svc_put(serv);
out:
mutex_unlock(&nfsd_mutex);
@@ -957,12 +950,11 @@ nfsd(void *vrqstp)
/*
* The main request loop
*/
- while (!kthread_should_stop()) {
+ while (!svc_thread_should_stop(rqstp)) {
/* Update sv_maxconn if it has changed */
rqstp->rq_server->sv_maxconn = nn->max_connections;
svc_recv(rqstp);
- validate_process_creds();
}
atomic_dec(&nfsdstats.th_cnt);
@@ -988,6 +980,8 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
const struct svc_procedure *proc = rqstp->rq_procinfo;
__be32 *statp = rqstp->rq_accept_statp;
struct nfsd_cacherep *rp;
+ unsigned int start, len;
+ __be32 *nfs_reply;
/*
* Give the xdr decoder a chance to change this if it wants
@@ -995,11 +989,27 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
*/
rqstp->rq_cachetype = proc->pc_cachetype;
+ /*
+ * ->pc_decode advances the argument stream past the NFS
+ * Call header, so grab the header's starting location and
+ * size now for the call to nfsd_cache_lookup().
+ */
+ start = xdr_stream_pos(&rqstp->rq_arg_stream);
+ len = xdr_stream_remaining(&rqstp->rq_arg_stream);
if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream))
goto out_decode_err;
+ /*
+	 * Release rq_status_counter, setting it to an odd value, now that the
+	 * RPC request has been parsed. The counter tells consumers whether the
+	 * rqstp fields are stable and meaningful (odd) or not meaningful
+	 * (even).
+ */
+ smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1);
+
rp = NULL;
- switch (nfsd_cache_lookup(rqstp, &rp)) {
+ switch (nfsd_cache_lookup(rqstp, start, len, &rp)) {
case RC_DOIT:
break;
case RC_REPLY:
@@ -1008,6 +1018,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
goto out_dropit;
}
+ nfs_reply = xdr_inline_decode(&rqstp->rq_res_stream, 0);
*statp = proc->pc_func(rqstp);
if (test_bit(RQ_DROPME, &rqstp->rq_flags))
goto out_update_drop;
@@ -1015,7 +1026,13 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream))
goto out_encode_err;
- nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1);
+ /*
+	 * Release rq_status_counter, setting it back to an even value now that
+	 * the RPC request has been fully processed.
+ */
+ smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1);
+
+ nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, nfs_reply);
out_cached_reply:
return 1;
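Note: the rq_status_counter logic above behaves like a sequence counter: nfsd_dispatch() store-releases an odd value once the decoded rqstp fields are meaningful and makes it even again when processing finishes, while the netlink dump side load-acquires the counter, only trusts odd values, and re-checks the counter after copying the fields. A standalone C11 sketch of that single-writer handshake; the struct and field names are stand-ins, and the fields are atomics only so the example has no formal data races (the kernel uses plain fields with smp_store_release()/smp_load_acquire()):

/* Standalone sketch of the odd/even status-counter handshake (seqcount-like).
 * One writer, any number of readers; illustrative only.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct request {
	_Atomic unsigned int status_counter;	/* odd = fields are valid */
	_Atomic uint32_t xid;
	_Atomic uint32_t proc;
};

/* Writer: publish the decoded fields, then flip the counter odd. */
static void writer_parsed(struct request *rq, uint32_t xid, uint32_t proc)
{
	rq->xid = xid;
	rq->proc = proc;
	atomic_fetch_or_explicit(&rq->status_counter, 1, memory_order_release);
}

/* Writer: request fully processed; counter becomes even again. */
static void writer_done(struct request *rq)
{
	atomic_fetch_add_explicit(&rq->status_counter, 1, memory_order_release);
}

/* Reader: take a snapshot only if the counter is odd and stays unchanged. */
static bool reader_snapshot(struct request *rq, uint32_t *xid, uint32_t *proc)
{
	unsigned int seq;

	seq = atomic_load_explicit(&rq->status_counter, memory_order_acquire);
	if (!(seq & 1))
		return false;		/* fields not meaningful right now */

	*xid = rq->xid;
	*proc = rq->proc;

	/* Counter moved while copying?  Then the snapshot may be stale. */
	return atomic_load_explicit(&rq->status_counter,
				    memory_order_acquire) == seq;
}

int main(void)
{
	struct request rq = { 0 };
	uint32_t xid, proc;

	printf("before parse: %s\n",
	       reader_snapshot(&rq, &xid, &proc) ? "report" : "skip");
	writer_parsed(&rq, 0xabcd, 1);
	if (reader_snapshot(&rq, &xid, &proc))
		printf("in flight: xid=%#x proc=%u\n",
		       (unsigned)xid, (unsigned)proc);
	writer_done(&rq);
	printf("after done: %s\n",
	       reader_snapshot(&rq, &xid, &proc) ? "report" : "skip");
	return 0;
}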
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index 4f4282d4eeca..de1e0dfed06a 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -27,12 +27,12 @@ struct nfsd4_layout_ops {
struct nfs4_client *clp,
struct nfsd4_getdeviceinfo *gdevp);
__be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
- struct nfsd4_getdeviceinfo *gdevp);
+ const struct nfsd4_getdeviceinfo *gdevp);
__be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp,
struct nfsd4_layoutget *lgp);
- __be32 (*encode_layoutget)(struct xdr_stream *,
- struct nfsd4_layoutget *lgp);
+ __be32 (*encode_layoutget)(struct xdr_stream *xdr,
+ const struct nfsd4_layoutget *lgp);
__be32 (*proc_layoutcommit)(struct inode *inode,
struct nfsd4_layoutcommit *lcp);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index cbddcf484dba..41bdc913fa71 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -174,8 +174,6 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
/* Maximum number of slots per session. 160 is useful for long haul TCP */
#define NFSD_MAX_SLOTS_PER_SESSION 160
-/* Maximum number of operations per session compound */
-#define NFSD_MAX_OPS_PER_COMPOUND 50
/* Maximum session per slot cache size */
#define NFSD_SLOT_CACHE_SIZE 2048
/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 63797635e1c3..12d79f5d4eb1 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -60,7 +60,7 @@ static int nfsd_show(struct seq_file *seq, void *v)
#ifdef CONFIG_NFSD_V4
/* Show count for individual nfsv4 operations */
/* Writing operation numbers 0 1 2 also for maintaining uniformity */
- seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1);
+ seq_printf(seq, "proc4ops %u", LAST_NFS4_OP + 1);
for (i = 0; i <= LAST_NFS4_OP; i++) {
seq_printf(seq, " %lld",
percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)]));
@@ -76,7 +76,7 @@ static int nfsd_show(struct seq_file *seq, void *v)
DEFINE_PROC_SHOW_ATTRIBUTE(nfsd);
-int nfsd_percpu_counters_init(struct percpu_counter counters[], int num)
+int nfsd_percpu_counters_init(struct percpu_counter *counters, int num)
{
int i, err = 0;
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index cf5524e7ca06..14f50c660b61 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -37,9 +37,9 @@ extern struct nfsd_stats nfsdstats;
extern struct svc_stat nfsd_svcstats;
-int nfsd_percpu_counters_init(struct percpu_counter counters[], int num);
-void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num);
-void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num);
+int nfsd_percpu_counters_init(struct percpu_counter *counters, int num);
+void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num);
+void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num);
int nfsd_stat_init(void);
void nfsd_stat_shutdown(void);
@@ -61,22 +61,22 @@ static inline void nfsd_stats_rc_nocache_inc(void)
static inline void nfsd_stats_fh_stale_inc(struct svc_export *exp)
{
percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]);
- if (exp)
- percpu_counter_inc(&exp->ex_stats.counter[EXP_STATS_FH_STALE]);
+ if (exp && exp->ex_stats)
+ percpu_counter_inc(&exp->ex_stats->counter[EXP_STATS_FH_STALE]);
}
static inline void nfsd_stats_io_read_add(struct svc_export *exp, s64 amount)
{
percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount);
- if (exp)
- percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_READ], amount);
+ if (exp && exp->ex_stats)
+ percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_READ], amount);
}
static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount)
{
percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount);
- if (exp)
- percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_WRITE], amount);
+ if (exp && exp->ex_stats)
+ percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_WRITE], amount);
}
static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn)
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 803904348871..fbc0ccb40424 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -1863,6 +1863,93 @@ TRACE_EVENT(nfsd_end_grace,
)
);
+DECLARE_EVENT_CLASS(nfsd_copy_class,
+ TP_PROTO(
+ const struct nfsd4_copy *copy
+ ),
+ TP_ARGS(copy),
+ TP_STRUCT__entry(
+ __field(bool, intra)
+ __field(bool, async)
+ __field(u32, src_cl_boot)
+ __field(u32, src_cl_id)
+ __field(u32, src_so_id)
+ __field(u32, src_si_generation)
+ __field(u32, dst_cl_boot)
+ __field(u32, dst_cl_id)
+ __field(u32, dst_so_id)
+ __field(u32, dst_si_generation)
+ __field(u64, src_cp_pos)
+ __field(u64, dst_cp_pos)
+ __field(u64, cp_count)
+ __sockaddr(addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
+ const stateid_t *src_stp = &copy->cp_src_stateid;
+ const stateid_t *dst_stp = &copy->cp_dst_stateid;
+
+ __entry->intra = test_bit(NFSD4_COPY_F_INTRA, &copy->cp_flags);
+ __entry->async = !test_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
+ __entry->src_cl_boot = src_stp->si_opaque.so_clid.cl_boot;
+ __entry->src_cl_id = src_stp->si_opaque.so_clid.cl_id;
+ __entry->src_so_id = src_stp->si_opaque.so_id;
+ __entry->src_si_generation = src_stp->si_generation;
+ __entry->dst_cl_boot = dst_stp->si_opaque.so_clid.cl_boot;
+ __entry->dst_cl_id = dst_stp->si_opaque.so_clid.cl_id;
+ __entry->dst_so_id = dst_stp->si_opaque.so_id;
+ __entry->dst_si_generation = dst_stp->si_generation;
+ __entry->src_cp_pos = copy->cp_src_pos;
+ __entry->dst_cp_pos = copy->cp_dst_pos;
+ __entry->cp_count = copy->cp_count;
+ __assign_sockaddr(addr, &copy->cp_clp->cl_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+ TP_printk("client=%pISpc intra=%d async=%d "
+ "src_stateid[si_generation:0x%x cl_boot:0x%x cl_id:0x%x so_id:0x%x] "
+ "dst_stateid[si_generation:0x%x cl_boot:0x%x cl_id:0x%x so_id:0x%x] "
+ "cp_src_pos=%llu cp_dst_pos=%llu cp_count=%llu",
+ __get_sockaddr(addr), __entry->intra, __entry->async,
+ __entry->src_si_generation, __entry->src_cl_boot,
+ __entry->src_cl_id, __entry->src_so_id,
+ __entry->dst_si_generation, __entry->dst_cl_boot,
+ __entry->dst_cl_id, __entry->dst_so_id,
+ __entry->src_cp_pos, __entry->dst_cp_pos, __entry->cp_count
+ )
+);
+
+#define DEFINE_COPY_EVENT(name) \
+DEFINE_EVENT(nfsd_copy_class, nfsd_copy_##name, \
+ TP_PROTO(const struct nfsd4_copy *copy), \
+ TP_ARGS(copy))
+
+DEFINE_COPY_EVENT(inter);
+DEFINE_COPY_EVENT(intra);
+DEFINE_COPY_EVENT(do_async);
+
+TRACE_EVENT(nfsd_copy_done,
+ TP_PROTO(
+ const struct nfsd4_copy *copy,
+ __be32 status
+ ),
+ TP_ARGS(copy, status),
+ TP_STRUCT__entry(
+ __field(int, status)
+ __field(bool, intra)
+ __field(bool, async)
+ __sockaddr(addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
+ __entry->status = be32_to_cpu(status);
+ __entry->intra = test_bit(NFSD4_COPY_F_INTRA, &copy->cp_flags);
+ __entry->async = !test_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
+ __assign_sockaddr(addr, &copy->cp_clp->cl_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+ TP_printk("addr=%pISpc status=%d intra=%d async=%d ",
+ __get_sockaddr(addr), __entry->status, __entry->intra, __entry->async
+ )
+);
+
#endif /* _NFSD_TRACE_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 02f5fcaad03f..e01e4e2acbd9 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -337,6 +337,24 @@ out:
return err;
}
+static void
+commit_reset_write_verifier(struct nfsd_net *nn, struct svc_rqst *rqstp,
+ int err)
+{
+ switch (err) {
+ case -EAGAIN:
+ case -ESTALE:
+ /*
+		 * Neither of these is the result of a problem with
+ * durable storage, so avoid a write verifier reset.
+ */
+ break;
+ default:
+ nfsd_reset_write_verifier(nn);
+ trace_nfsd_writeverf_reset(nn, rqstp, err);
+ }
+}
+
/*
* Commit metadata changes to stable storage.
*/
@@ -520,7 +538,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
nfsd_sanitize_attrs(inode, iap);
- if (check_guard && guardtime != inode_get_ctime(inode).tv_sec)
+ if (check_guard && guardtime != inode_get_ctime_sec(inode))
return nfserr_notsync;
/*
@@ -647,8 +665,7 @@ __be32 nfsd4_clone_file_range(struct svc_rqst *rqstp,
&nfsd4_get_cstate(rqstp)->current_fh,
dst_pos,
count, status);
- nfsd_reset_write_verifier(nn);
- trace_nfsd_writeverf_reset(nn, rqstp, status);
+ commit_reset_write_verifier(nn, rqstp, status);
ret = nfserrno(status);
}
}
@@ -823,7 +840,7 @@ int nfsd_open_break_lease(struct inode *inode, int access)
* and additional flags.
* N.B. After this call fhp needs an fh_put
*/
-static __be32
+static int
__nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
int may_flags, struct file **filp)
{
@@ -831,14 +848,12 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
struct inode *inode;
struct file *file;
int flags = O_RDONLY|O_LARGEFILE;
- __be32 err;
- int host_err = 0;
+ int host_err = -EPERM;
path.mnt = fhp->fh_export->ex_path.mnt;
path.dentry = fhp->fh_dentry;
inode = d_inode(path.dentry);
- err = nfserr_perm;
if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE))
goto out;
@@ -847,7 +862,7 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
host_err = nfsd_open_break_lease(inode, may_flags);
if (host_err) /* NOMEM or WOULDBLOCK */
- goto out_nfserr;
+ goto out;
if (may_flags & NFSD_MAY_WRITE) {
if (may_flags & NFSD_MAY_READ)
@@ -859,13 +874,13 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
file = dentry_open(&path, flags, current_cred());
if (IS_ERR(file)) {
host_err = PTR_ERR(file);
- goto out_nfserr;
+ goto out;
}
host_err = ima_file_check(file, may_flags);
if (host_err) {
fput(file);
- goto out_nfserr;
+ goto out;
}
if (may_flags & NFSD_MAY_64BIT_COOKIE)
@@ -874,10 +889,8 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
file->f_mode |= FMODE_32BITHASH;
*filp = file;
-out_nfserr:
- err = nfserrno(host_err);
out:
- return err;
+ return host_err;
}
__be32
@@ -885,9 +898,9 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
int may_flags, struct file **filp)
{
__be32 err;
+ int host_err;
bool retried = false;
- validate_process_creds();
/*
* If we get here, then the client has already done an "open",
* and (hopefully) checked permission - so allow OWNER_OVERRIDE
@@ -904,14 +917,14 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
retry:
err = fh_verify(rqstp, fhp, type, may_flags);
if (!err) {
- err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
- if (err == nfserr_stale && !retried) {
+ host_err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
+ if (host_err == -EOPENSTALE && !retried) {
retried = true;
fh_put(fhp);
goto retry;
}
+ err = nfserrno(host_err);
}
- validate_process_creds();
return err;
}
@@ -922,18 +935,13 @@ retry:
* @may_flags: internal permission flags
* @filp: OUT: open "struct file *"
*
- * Returns an nfsstat value in network byte order.
+ * Returns zero on success, or a negative errno value.
*/
-__be32
+int
nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags,
struct file **filp)
{
- __be32 err;
-
- validate_process_creds();
- err = __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp);
- validate_process_creds();
- return err;
+ return __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp);
}
/*
@@ -1172,8 +1180,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
host_err = vfs_iter_write(file, &iter, &pos, flags);
file_end_write(file);
if (host_err < 0) {
- nfsd_reset_write_verifier(nn);
- trace_nfsd_writeverf_reset(nn, rqstp, host_err);
+ commit_reset_write_verifier(nn, rqstp, host_err);
goto out_nfserr;
}
*cnt = host_err;
@@ -1185,10 +1192,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
if (stable && use_wgather) {
host_err = wait_for_concurrent_writes(file);
- if (host_err < 0) {
- nfsd_reset_write_verifier(nn);
- trace_nfsd_writeverf_reset(nn, rqstp, host_err);
- }
+ if (host_err < 0)
+ commit_reset_write_verifier(nn, rqstp, host_err);
}
out_nfserr:
@@ -1331,8 +1336,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
err = nfserr_notsupp;
break;
default:
- nfsd_reset_write_verifier(nn);
- trace_nfsd_writeverf_reset(nn, rqstp, err2);
+ commit_reset_write_verifier(nn, rqstp, err2);
err = nfserrno(err2);
}
} else
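Note: the vfs.c changes above sharpen the split between the two error domains: __nfsd_open() and nfsd_open_verified() now stay in the negative-errno world, and callers convert to an NFS status (__be32, network byte order) only at the boundary via nfserrno(). A standalone sketch of keeping the two domains apart; nfs_errno() and its small mapping table are illustrative stand-ins rather than the kernel's full nfserrno() table, though the status numbers used are the classic NFS error codes:

/* Standalone sketch: negative host errnos inside the server, big-endian NFS
 * status codes on the wire, converted only at the boundary.
 */
#include <arpa/inet.h>	/* htonl, ntohl */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t be32;			/* stand-in for the kernel's __be32 */

#define NFS_OK		((be32)htonl(0))
#define NFSERR_PERM	((be32)htonl(1))
#define NFSERR_NOENT	((be32)htonl(2))
#define NFSERR_IO	((be32)htonl(5))
#define NFSERR_ACCES	((be32)htonl(13))
#define NFSERR_STALE	((be32)htonl(70))

/* Convert a negative host errno into an NFS status at the RPC boundary. */
static be32 nfs_errno(int host_err)
{
	switch (host_err) {
	case 0:		return NFS_OK;
	case -EPERM:	return NFSERR_PERM;
	case -ENOENT:	return NFSERR_NOENT;
	case -EACCES:	return NFSERR_ACCES;
	case -ESTALE:	return NFSERR_STALE;
	default:	return NFSERR_IO;	/* catch-all for the sketch */
	}
}

/* Internal helper in the "errno domain", like the reworked __nfsd_open(). */
static int open_internal(int want_write)
{
	return want_write ? -EPERM : 0;	/* pretend the object is read-only */
}

int main(void)
{
	int host_err = open_internal(1);
	be32 status = nfs_errno(host_err);

	printf("host_err=%d wire status=0x%08x (host order %u)\n",
	       host_err, (unsigned)status, (unsigned)ntohl(status));
	return 0;
}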
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index a6890ea7b765..e3c29596f4df 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -104,8 +104,8 @@ __be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
int nfsd_open_break_lease(struct inode *, int);
__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
int, struct file **);
-__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *,
- int, struct file **);
+int nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ int may_flags, struct file **filp);
__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset,
unsigned long *count,
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 9d918a79dc16..80e859dc84d8 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -50,6 +50,134 @@
#define HAS_CSTATE_FLAG(c, f) ((c)->sid_flags & (f))
#define CLEAR_CSTATE_FLAG(c, f) ((c)->sid_flags &= ~(f))
+/**
+ * nfsd4_encode_bool - Encode an XDR bool type result
+ * @xdr: target XDR stream
+ * @val: boolean value to encode
+ *
+ * Return values:
+ * %nfs_ok: @val encoded; @xdr advanced to next position
+ * %nfserr_resource: stream buffer space exhausted
+ */
+static __always_inline __be32
+nfsd4_encode_bool(struct xdr_stream *xdr, bool val)
+{
+ __be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+ if (unlikely(p == NULL))
+ return nfserr_resource;
+ *p = val ? xdr_one : xdr_zero;
+ return nfs_ok;
+}
+
+/**
+ * nfsd4_encode_uint32_t - Encode an XDR uint32_t type result
+ * @xdr: target XDR stream
+ * @val: integer value to encode
+ *
+ * Return values:
+ * %nfs_ok: @val encoded; @xdr advanced to next position
+ * %nfserr_resource: stream buffer space exhausted
+ */
+static __always_inline __be32
+nfsd4_encode_uint32_t(struct xdr_stream *xdr, u32 val)
+{
+ __be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+ if (unlikely(p == NULL))
+ return nfserr_resource;
+ *p = cpu_to_be32(val);
+ return nfs_ok;
+}
+
+#define nfsd4_encode_aceflag4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_acemask4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_acetype4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_count4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_mode4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_nfs_lease4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_qop4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_sequenceid4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_slotid4(x, v) nfsd4_encode_uint32_t(x, v)
+
+/**
+ * nfsd4_encode_uint64_t - Encode an XDR uint64_t type result
+ * @xdr: target XDR stream
+ * @val: integer value to encode
+ *
+ * Return values:
+ * %nfs_ok: @val encoded; @xdr advanced to next position
+ * %nfserr_resource: stream buffer space exhausted
+ */
+static __always_inline __be32
+nfsd4_encode_uint64_t(struct xdr_stream *xdr, u64 val)
+{
+ __be32 *p = xdr_reserve_space(xdr, XDR_UNIT * 2);
+
+ if (unlikely(p == NULL))
+ return nfserr_resource;
+ put_unaligned_be64(val, p);
+ return nfs_ok;
+}
+
+#define nfsd4_encode_changeid4(x, v) nfsd4_encode_uint64_t(x, v)
+#define nfsd4_encode_nfs_cookie4(x, v) nfsd4_encode_uint64_t(x, v)
+#define nfsd4_encode_length4(x, v) nfsd4_encode_uint64_t(x, v)
+#define nfsd4_encode_offset4(x, v) nfsd4_encode_uint64_t(x, v)
+
+/**
+ * nfsd4_encode_opaque_fixed - Encode a fixed-length XDR opaque type result
+ * @xdr: target XDR stream
+ * @data: pointer to data
+ * @size: length of data in bytes
+ *
+ * Return values:
+ * %nfs_ok: @data encoded; @xdr advanced to next position
+ * %nfserr_resource: stream buffer space exhausted
+ */
+static __always_inline __be32
+nfsd4_encode_opaque_fixed(struct xdr_stream *xdr, const void *data,
+ size_t size)
+{
+ __be32 *p = xdr_reserve_space(xdr, xdr_align_size(size));
+ size_t pad = xdr_pad_size(size);
+
+ if (unlikely(p == NULL))
+ return nfserr_resource;
+ memcpy(p, data, size);
+ if (pad)
+ memset((char *)p + size, 0, pad);
+ return nfs_ok;
+}
+
+/**
+ * nfsd4_encode_opaque - Encode a variable-length XDR opaque type result
+ * @xdr: target XDR stream
+ * @data: pointer to data
+ * @size: length of data in bytes
+ *
+ * Return values:
+ * %nfs_ok: @data encoded; @xdr advanced to next position
+ * %nfserr_resource: stream buffer space exhausted
+ */
+static __always_inline __be32
+nfsd4_encode_opaque(struct xdr_stream *xdr, const void *data, size_t size)
+{
+ size_t pad = xdr_pad_size(size);
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, XDR_UNIT + xdr_align_size(size));
+ if (unlikely(p == NULL))
+ return nfserr_resource;
+ *p++ = cpu_to_be32(size);
+ memcpy(p, data, size);
+ if (pad)
+ memset((char *)p + size, 0, pad);
+ return nfs_ok;
+}
+
+#define nfsd4_encode_component4(x, d, s) nfsd4_encode_opaque(x, d, s)
+
struct nfsd4_compound_state {
struct svc_fh current_fh;
struct svc_fh save_fh;
@@ -170,12 +298,8 @@ struct nfsd4_lock {
} v;
/* response */
- union {
- struct {
- stateid_t stateid;
- } ok;
- struct nfsd4_lock_denied denied;
- } u;
+ stateid_t lk_resp_stateid;
+ struct nfsd4_lock_denied lk_denied;
};
#define lk_new_open_seqid v.new.open_seqid
#define lk_new_open_stateid v.new.open_stateid
@@ -185,20 +309,15 @@ struct nfsd4_lock {
#define lk_old_lock_stateid v.old.lock_stateid
#define lk_old_lock_seqid v.old.lock_seqid
-#define lk_resp_stateid u.ok.stateid
-#define lk_denied u.denied
-
-
struct nfsd4_lockt {
u32 lt_type;
clientid_t lt_clientid;
struct xdr_netobj lt_owner;
u64 lt_offset;
u64 lt_length;
- struct nfsd4_lock_denied lt_denied;
+ struct nfsd4_lock_denied lt_denied;
};
-
struct nfsd4_locku {
u32 lu_type;
u32 lu_seqid;
@@ -267,9 +386,9 @@ struct nfsd4_open {
u32 op_deleg_want; /* request */
stateid_t op_stateid; /* response */
__be32 op_xdr_error; /* see nfsd4_open_omfg() */
- u32 op_recall; /* recall */
struct nfsd4_change_info op_cinfo; /* response */
u32 op_rflags; /* response */
+ bool op_recall; /* response */
bool op_truncate; /* used during processing */
bool op_created; /* used during processing */
struct nfs4_openowner *op_openowner; /* used during processing */
@@ -496,7 +615,7 @@ struct nfsd4_layoutcommit {
u32 lc_layout_type; /* request */
u32 lc_up_len; /* layout length */
void *lc_up_layout; /* decoded by callback */
- u32 lc_size_chg; /* boolean for response */
+ bool lc_size_chg; /* response */
u64 lc_newsize; /* response */
};
@@ -508,7 +627,7 @@ struct nfsd4_layoutreturn {
u32 lrf_body_len; /* request */
void *lrf_body; /* request */
stateid_t lr_sid; /* request/response */
- u32 lrs_present; /* response */
+ bool lrs_present; /* response */
};
struct nfsd4_fallocate {
@@ -626,8 +745,7 @@ struct nfsd4_copy_notify {
/* response */
stateid_t cpn_cnr_stateid;
- u64 cpn_sec;
- u32 cpn_nsec;
+ struct timespec64 cpn_lease_time;
struct nl4_server *cpn_src;
};
@@ -820,8 +938,10 @@ extern __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp,
struct nfsd4_compound_state *, union nfsd4_op_u *u);
extern __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
union nfsd4_op_u *u);
+extern void nfsd4_lock_release(union nfsd4_op_u *u);
extern __be32 nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
union nfsd4_op_u *u);
+extern void nfsd4_lockt_release(union nfsd4_op_u *u);
extern __be32 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
union nfsd4_op_u *u);
extern __be32
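Note: the nfsd4_encode_opaque() helpers above reserve xdr_align_size(size) bytes and zero the xdr_pad_size(size) tail so every item ends on a 4-byte XDR boundary, with the variable-length form adding a 32-bit length prefix. A standalone sketch of that length-plus-padding layout over a plain buffer; align4()/pad4() are local stand-ins for the kernel helpers:

/* Standalone sketch of XDR variable-length opaque encoding:
 * 4-byte big-endian length, the bytes themselves, then zero padding up to
 * the next 4-byte boundary.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define XDR_UNIT 4

static size_t align4(size_t n)
{
	return (n + XDR_UNIT - 1) & ~((size_t)XDR_UNIT - 1);
}

static size_t pad4(size_t n)
{
	return align4(n) - n;
}

/* Returns the number of bytes written, or 0 if the buffer is too small. */
static size_t encode_opaque(unsigned char *out, size_t avail,
			    const void *data, size_t size)
{
	size_t need = XDR_UNIT + align4(size);

	if (avail < need)
		return 0;	/* nfserr_resource, in the kernel helpers */

	/* 32-bit big-endian length prefix */
	out[0] = size >> 24;
	out[1] = size >> 16;
	out[2] = size >> 8;
	out[3] = size;

	memcpy(out + XDR_UNIT, data, size);
	memset(out + XDR_UNIT + size, 0, pad4(size));	/* zero the tail */
	return need;
}

int main(void)
{
	unsigned char buf[32];
	const char name[] = "abcde";	/* 5 bytes -> 3 bytes of padding */
	size_t n = encode_opaque(buf, sizeof(buf), name, strlen(name));
	size_t i;

	for (i = 0; i < n; i++)
		printf("%02x%s", buf[i], (i % XDR_UNIT == 3) ? "\n" : " ");
	return 0;
}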
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index bce734b68f08..de2073c47651 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -429,7 +429,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
nilfs_set_de_type(de, inode);
nilfs_commit_chunk(page, mapping, from, to);
nilfs_put_page(page);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
}
/*
@@ -519,7 +519,7 @@ got_it:
de->inode = cpu_to_le64(inode->i_ino);
nilfs_set_de_type(de, inode);
nilfs_commit_chunk(page, page->mapping, from, to);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
nilfs_mark_inode_dirty(dir);
/* OFFSET_CACHE */
out_put:
@@ -567,7 +567,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
pde->rec_len = nilfs_rec_len_to_disk(to - from);
dir->inode = 0;
nilfs_commit_chunk(page, mapping, from, to);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
out:
nilfs_put_page(page);
return err;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 1a8bd5993476..f861f3a0bf5c 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -366,7 +366,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
atomic64_inc(&root->inodes_count);
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_ino = ino;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
err = nilfs_bmap_read(ii->i_bmap, NULL);
@@ -449,12 +449,12 @@ int nilfs_read_inode_common(struct inode *inode,
i_gid_write(inode, le32_to_cpu(raw_inode->i_gid));
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
inode->i_size = le64_to_cpu(raw_inode->i_size);
- inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
+ inode_set_atime(inode, le64_to_cpu(raw_inode->i_mtime),
+ le32_to_cpu(raw_inode->i_mtime_nsec));
inode_set_ctime(inode, le64_to_cpu(raw_inode->i_ctime),
le32_to_cpu(raw_inode->i_ctime_nsec));
- inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
- inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
- inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
+ inode_set_mtime(inode, le64_to_cpu(raw_inode->i_mtime),
+ le32_to_cpu(raw_inode->i_mtime_nsec));
if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode))
return -EIO; /* this inode is for metadata and corrupted */
if (inode->i_nlink == 0)
@@ -768,10 +768,10 @@ void nilfs_write_inode_common(struct inode *inode,
raw_inode->i_gid = cpu_to_le32(i_gid_read(inode));
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
raw_inode->i_size = cpu_to_le64(inode->i_size);
- raw_inode->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
- raw_inode->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
- raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ raw_inode->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ raw_inode->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode));
+ raw_inode->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+ raw_inode->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
raw_inode->i_blocks = cpu_to_le64(inode->i_blocks);
raw_inode->i_flags = cpu_to_le32(ii->i_flags);
@@ -875,7 +875,7 @@ void nilfs_truncate(struct inode *inode)
nilfs_truncate_bmap(ii, blkoff);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
if (IS_SYNC(inode))
nilfs_set_transaction_flag(NILFS_TI_SYNC);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 19c8158605ed..c97c77a39668 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -356,30 +356,28 @@ int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
*/
int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
{
- pgoff_t index = (pgoff_t)block >>
- (PAGE_SHIFT - inode->i_blkbits);
- struct page *page;
- unsigned long first_block;
+ pgoff_t index = block >> (PAGE_SHIFT - inode->i_blkbits);
+ struct folio *folio;
+ struct buffer_head *bh;
int ret = 0;
int still_dirty;
- page = find_lock_page(inode->i_mapping, index);
- if (!page)
+ folio = filemap_lock_folio(inode->i_mapping, index);
+ if (IS_ERR(folio))
return -ENOENT;
- wait_on_page_writeback(page);
-
- first_block = (unsigned long)index <<
- (PAGE_SHIFT - inode->i_blkbits);
- if (page_has_buffers(page)) {
- struct buffer_head *bh;
+ folio_wait_writeback(folio);
- bh = nilfs_page_get_nth_block(page, block - first_block);
+ bh = folio_buffers(folio);
+ if (bh) {
+ unsigned long first_block = index <<
+ (PAGE_SHIFT - inode->i_blkbits);
+ bh = get_nth_bh(bh, block - first_block);
nilfs_forget_buffer(bh);
}
- still_dirty = PageDirty(page);
- unlock_page(page);
- put_page(page);
+ still_dirty = folio_test_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
if (still_dirty ||
invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0)
@@ -560,17 +558,19 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
{
struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
struct buffer_head *bh_frozen;
- struct page *page;
+ struct folio *folio;
int blkbits = inode->i_blkbits;
- page = grab_cache_page(shadow->inode->i_mapping, bh->b_folio->index);
- if (!page)
- return -ENOMEM;
+ folio = filemap_grab_folio(shadow->inode->i_mapping,
+ bh->b_folio->index);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << blkbits, 0);
+ bh_frozen = folio_buffers(folio);
+ if (!bh_frozen)
+ bh_frozen = create_empty_buffers(folio, 1 << blkbits, 0);
- bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits);
+ bh_frozen = get_nth_bh(bh_frozen, bh_offset(bh) >> blkbits);
if (!buffer_uptodate(bh_frozen))
nilfs_copy_buffer(bh_frozen, bh);
@@ -582,8 +582,8 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
brelse(bh_frozen); /* already frozen */
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return 0;
}
@@ -592,17 +592,19 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
{
struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow;
struct buffer_head *bh_frozen = NULL;
- struct page *page;
+ struct folio *folio;
int n;
- page = find_lock_page(shadow->inode->i_mapping, bh->b_folio->index);
- if (page) {
- if (page_has_buffers(page)) {
+ folio = filemap_lock_folio(shadow->inode->i_mapping,
+ bh->b_folio->index);
+ if (!IS_ERR(folio)) {
+ bh_frozen = folio_buffers(folio);
+ if (bh_frozen) {
n = bh_offset(bh) >> inode->i_blkbits;
- bh_frozen = nilfs_page_get_nth_block(page, n);
+ bh_frozen = get_nth_bh(bh_frozen, n);
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
return bh_frozen;
}
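Note: part of the folio conversion above is an error-convention change: find_lock_page() and grab_cache_page() returned NULL on failure, while filemap_lock_folio() and filemap_grab_folio() return an ERR_PTR()-encoded errno, hence the new IS_ERR()/PTR_ERR() checks. A standalone sketch of that error-pointer idiom, assuming the usual trick of reserving the top 4095 values of the address space for errno codes; lookup_folio() is invented for the example:

/* Standalone sketch of the ERR_PTR()/IS_ERR()/PTR_ERR() idiom: errno values
 * are smuggled inside a pointer, so callers get one return slot for both the
 * object and the error case.
 */
#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* A lookup in the new style: never NULL, always an object or an ERR_PTR. */
static void *lookup_folio(int present)
{
	static int folio;	/* stand-in object */

	if (!present)
		return ERR_PTR(-ENOENT);
	return &folio;
}

int main(void)
{
	void *folio = lookup_folio(0);

	if (IS_ERR(folio))
		printf("lookup failed: %ld\n", PTR_ERR(folio));	/* -2 */

	folio = lookup_folio(1);
	if (!IS_ERR(folio))
		printf("got folio at %p\n", folio);
	return 0;
}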
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index b4e54d079b7d..06b04758f289 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -25,19 +25,19 @@
(BIT(BH_Uptodate) | BIT(BH_Mapped) | BIT(BH_NILFS_Node) | \
BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Checked))
-static struct buffer_head *
-__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
- int blkbits, unsigned long b_state)
+static struct buffer_head *__nilfs_get_folio_block(struct folio *folio,
+ unsigned long block, pgoff_t index, int blkbits,
+ unsigned long b_state)
{
unsigned long first_block;
- struct buffer_head *bh;
+ struct buffer_head *bh = folio_buffers(folio);
- if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << blkbits, b_state);
+ if (!bh)
+ bh = create_empty_buffers(folio, 1 << blkbits, b_state);
first_block = (unsigned long)index << (PAGE_SHIFT - blkbits);
- bh = nilfs_page_get_nth_block(page, block - first_block);
+ bh = get_nth_bh(bh, block - first_block);
touch_buffer(bh);
wait_on_buffer(bh);
@@ -51,17 +51,17 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
{
int blkbits = inode->i_blkbits;
pgoff_t index = blkoff >> (PAGE_SHIFT - blkbits);
- struct page *page;
+ struct folio *folio;
struct buffer_head *bh;
- page = grab_cache_page(mapping, index);
- if (unlikely(!page))
+ folio = filemap_grab_folio(mapping, index);
+ if (IS_ERR(folio))
return NULL;
- bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state);
+ bh = __nilfs_get_folio_block(folio, blkoff, index, blkbits, b_state);
if (unlikely(!bh)) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return NULL;
}
return bh;
@@ -184,30 +184,32 @@ void nilfs_page_bug(struct page *page)
}
/**
- * nilfs_copy_page -- copy the page with buffers
- * @dst: destination page
- * @src: source page
- * @copy_dirty: flag whether to copy dirty states on the page's buffer heads.
+ * nilfs_copy_folio -- copy the folio with buffers
+ * @dst: destination folio
+ * @src: source folio
+ * @copy_dirty: flag whether to copy dirty states on the folio's buffer heads.
*
- * This function is for both data pages and btnode pages. The dirty flag
- * should be treated by caller. The page must not be under i/o.
- * Both src and dst page must be locked
+ * This function is for both data folios and btnode folios. The dirty flag
+ * must be handled by the caller. The folios must not be under I/O.
+ * Both the src and dst folios must be locked.
*/
-static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty)
+static void nilfs_copy_folio(struct folio *dst, struct folio *src,
+ bool copy_dirty)
{
struct buffer_head *dbh, *dbufs, *sbh;
unsigned long mask = NILFS_BUFFER_INHERENT_BITS;
- BUG_ON(PageWriteback(dst));
+ BUG_ON(folio_test_writeback(dst));
- sbh = page_buffers(src);
- if (!page_has_buffers(dst))
- create_empty_buffers(dst, sbh->b_size, 0);
+ sbh = folio_buffers(src);
+ dbh = folio_buffers(dst);
+ if (!dbh)
+ dbh = create_empty_buffers(dst, sbh->b_size, 0);
if (copy_dirty)
mask |= BIT(BH_Dirty);
- dbh = dbufs = page_buffers(dst);
+ dbufs = dbh;
do {
lock_buffer(sbh);
lock_buffer(dbh);
@@ -218,16 +220,16 @@ static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty)
dbh = dbh->b_this_page;
} while (dbh != dbufs);
- copy_highpage(dst, src);
+ folio_copy(dst, src);
- if (PageUptodate(src) && !PageUptodate(dst))
- SetPageUptodate(dst);
- else if (!PageUptodate(src) && PageUptodate(dst))
- ClearPageUptodate(dst);
- if (PageMappedToDisk(src) && !PageMappedToDisk(dst))
- SetPageMappedToDisk(dst);
- else if (!PageMappedToDisk(src) && PageMappedToDisk(dst))
- ClearPageMappedToDisk(dst);
+ if (folio_test_uptodate(src) && !folio_test_uptodate(dst))
+ folio_mark_uptodate(dst);
+ else if (!folio_test_uptodate(src) && folio_test_uptodate(dst))
+ folio_clear_uptodate(dst);
+ if (folio_test_mappedtodisk(src) && !folio_test_mappedtodisk(dst))
+ folio_set_mappedtodisk(dst);
+ else if (!folio_test_mappedtodisk(src) && folio_test_mappedtodisk(dst))
+ folio_clear_mappedtodisk(dst);
do {
unlock_buffer(sbh);
@@ -269,7 +271,7 @@ repeat:
NILFS_PAGE_BUG(&folio->page,
"found empty page in dat page cache");
- nilfs_copy_page(&dfolio->page, &folio->page, 1);
+ nilfs_copy_folio(dfolio, folio, true);
filemap_dirty_folio(folio_mapping(dfolio), dfolio);
folio_unlock(dfolio);
@@ -314,7 +316,7 @@ repeat:
if (!IS_ERR(dfolio)) {
/* overwrite existing folio in the destination cache */
WARN_ON(folio_test_dirty(dfolio));
- nilfs_copy_page(&dfolio->page, &folio->page, 0);
+ nilfs_copy_folio(dfolio, folio, false);
folio_unlock(dfolio);
folio_put(dfolio);
/* Do we not need to remove folio from smap here? */
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 21ddcdd4d63e..d249ea1cefff 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -52,15 +52,4 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
#define NILFS_PAGE_BUG(page, m, a...) \
do { nilfs_page_bug(page); BUG(); } while (0)
-static inline struct buffer_head *
-nilfs_page_get_nth_block(struct page *page, unsigned int count)
-{
- struct buffer_head *bh = page_buffers(page);
-
- while (count-- > 0)
- bh = bh->b_this_page;
- get_bh(bh);
- return bh;
-}
-
#endif /* _NILFS_PAGE_H */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 7ec16879756e..55e31cc903d1 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -731,10 +731,9 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
continue;
}
head = folio_buffers(folio);
- if (!head) {
- create_empty_buffers(&folio->page, i_blocksize(inode), 0);
- head = folio_buffers(folio);
- }
+ if (!head)
+ head = create_empty_buffers(folio,
+ i_blocksize(inode), 0);
folio_unlock(folio);
bh = head;
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 2c6078a6b8ec..58ca7c936393 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -501,15 +501,38 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
down_write(&NILFS_MDT(sufile)->mi_sem);
ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
- if (!ret) {
- mark_buffer_dirty(bh);
- nilfs_mdt_mark_dirty(sufile);
- kaddr = kmap_atomic(bh->b_page);
- su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
+ if (ret)
+ goto out_sem;
+
+ kaddr = kmap_atomic(bh->b_page);
+ su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
+ if (unlikely(nilfs_segment_usage_error(su))) {
+ struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+
+ kunmap_atomic(kaddr);
+ brelse(bh);
+ if (nilfs_segment_is_active(nilfs, segnum)) {
+ nilfs_error(sufile->i_sb,
+ "active segment %llu is erroneous",
+ (unsigned long long)segnum);
+ } else {
+ /*
+ * Segments marked erroneous are never allocated by
+ * nilfs_sufile_alloc(); only active segments, i.e.,
+ * the segments indexed by ns_segnum or ns_nextnum,
+ * can be erroneous here.
+ */
+ WARN_ON_ONCE(1);
+ }
+ ret = -EIO;
+ } else {
nilfs_segment_usage_set_dirty(su);
kunmap_atomic(kaddr);
+ mark_buffer_dirty(bh);
+ nilfs_mdt_mark_dirty(sufile);
brelse(bh);
}
+out_sem:
up_write(&NILFS_MDT(sufile)->mi_sem);
return ret;
}
@@ -536,9 +559,14 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
kaddr = kmap_atomic(bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
- WARN_ON(nilfs_segment_usage_error(su));
- if (modtime)
+ if (modtime) {
+ /*
+ * Check segusage error and set su_lastmod only when updating
+ * this entry with a valid timestamp, not for cancellation.
+ */
+ WARN_ON_ONCE(nilfs_segment_usage_error(su));
su->su_lastmod = cpu_to_le64(modtime);
+ }
su->su_nblocks = cpu_to_le32(nblocks);
kunmap_atomic(kaddr);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 0f0667957c81..71400496ed36 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -716,7 +716,11 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
goto failed_sbh;
}
nilfs_release_super_block(nilfs);
- sb_set_blocksize(sb, blocksize);
+ if (!sb_set_blocksize(sb, blocksize)) {
+ nilfs_err(sb, "bad blocksize %d", blocksize);
+ err = -EINVAL;
+ goto out;
+ }
err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
if (err)
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index ebdcc25df0f7..1cb9ad7e884e 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -39,9 +39,9 @@ static void __init dnotify_sysctl_init(void)
#define dnotify_sysctl_init() do { } while (0)
#endif
-static struct kmem_cache *dnotify_struct_cache __read_mostly;
-static struct kmem_cache *dnotify_mark_cache __read_mostly;
-static struct fsnotify_group *dnotify_group __read_mostly;
+static struct kmem_cache *dnotify_struct_cache __ro_after_init;
+static struct kmem_cache *dnotify_mark_cache __ro_after_init;
+static struct fsnotify_group *dnotify_group __ro_after_init;
/*
* dnotify will attach one of these to each inode (i_fsnotify_marks) which
@@ -265,7 +265,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
struct dnotify_struct *dn;
struct inode *inode;
fl_owner_t id = current->files;
- struct file *f;
+ struct file *f = NULL;
int destroy = 0, error = 0;
__u32 mask;
@@ -345,7 +345,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
}
rcu_read_lock();
- f = lookup_fd_rcu(fd);
+ f = lookup_fdget_rcu(fd);
rcu_read_unlock();
/* if (f != filp) means that we lost a race and another task/thread
@@ -392,6 +392,8 @@ out_err:
fsnotify_put_mark(new_fsn_mark);
if (dn)
kmem_cache_free(dnotify_struct_cache, dn);
+ if (f)
+ fput(f);
return error;
}
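
A minimal sketch of the fd re-check pattern the dnotify hunks move to, assuming lookup_fdget_rcu() from <linux/fdtable.h>; unlike lookup_fd_rcu(), it returns the file with a reference held, which is why the exit path above gains an fput(). The wrapper name is illustrative only:

#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/rcupdate.h>

static bool example_fd_still_matches(unsigned int fd, struct file *filp)
{
	struct file *f;
	bool match;

	rcu_read_lock();
	f = lookup_fdget_rcu(fd);	/* takes a reference if the fd resolves */
	rcu_read_unlock();

	match = (f == filp);
	if (f)
		fput(f);		/* drop the reference on every path */
	return match;
}
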
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index e8a3c28c5d12..6936671e148d 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -275,9 +275,9 @@ static inline void fanotify_init_event(struct fanotify_event *event,
#define FANOTIFY_INLINE_FH(name, size) \
struct { \
- struct fanotify_fh (name); \
+ struct fanotify_fh name; \
/* Space for object_fh.buf[] - access with fanotify_fh_buf() */ \
- unsigned char _inline_fh_buf[(size)]; \
+ unsigned char _inline_fh_buf[size]; \
}
struct fanotify_fid_event {
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 62fe0b679e58..4d765c72496f 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -112,10 +112,10 @@ static void __init fanotify_sysctls_init(void)
extern const struct fsnotify_ops fanotify_fsnotify_ops;
-struct kmem_cache *fanotify_mark_cache __read_mostly;
-struct kmem_cache *fanotify_fid_event_cachep __read_mostly;
-struct kmem_cache *fanotify_path_event_cachep __read_mostly;
-struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
+struct kmem_cache *fanotify_mark_cache __ro_after_init;
+struct kmem_cache *fanotify_fid_event_cachep __ro_after_init;
+struct kmem_cache *fanotify_path_event_cachep __ro_after_init;
+struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
#define FANOTIFY_EVENT_ALIGN 4
#define FANOTIFY_FID_INFO_HDR_LEN \
@@ -1595,7 +1595,7 @@ static int fanotify_test_fid(struct dentry *dentry, unsigned int flags)
* file handles so user can use name_to_handle_at() to compare fids
* reported with events to the file handle of watched objects.
*/
- if (!nop)
+ if (!exportfs_can_encode_fid(nop))
return -EOPNOTSUPP;
/*
@@ -1603,7 +1603,7 @@ static int fanotify_test_fid(struct dentry *dentry, unsigned int flags)
* supports decoding file handles, so user has a way to map back the
* reported fids to filesystem objects.
*/
- if (mark_type != FAN_MARK_INODE && !nop->fh_to_dentry)
+ if (mark_type != FAN_MARK_INODE && !exportfs_can_decode_fh(nop))
return -EOPNOTSUPP;
return 0;
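
A minimal sketch of the capability checks the fanotify hunks adopt, assuming the exportfs_can_encode_fid()/exportfs_can_decode_fh() helpers from <linux/exportfs.h>; the wrapper is illustrative only:

#include <linux/exportfs.h>

static bool example_can_report_fid(const struct export_operations *nop,
				   bool need_decode)
{
	/* The filesystem must be able to encode a file handle at all. */
	if (!exportfs_can_encode_fid(nop))
		return false;
	/* For non-inode marks it must also be able to decode one back. */
	return !need_decode || exportfs_can_decode_fh(nop);
}
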
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 1c4bfdab008d..a3809ae92170 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -49,7 +49,7 @@
/* configurable via /proc/sys/fs/inotify/ */
static int inotify_max_queued_events __read_mostly;
-struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
+struct kmem_cache *inotify_inode_mark_cachep __ro_after_init;
#ifdef CONFIG_SYSCTL
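
A minimal sketch of the __read_mostly to __ro_after_init conversion seen in the dnotify, fanotify and inotify hunks, using a hypothetical cache that is assigned once during init and never written again:

#include <linux/cache.h>
#include <linux/init.h>
#include <linux/slab.h>

struct example_mark { int dummy; };

static struct kmem_cache *example_mark_cachep __ro_after_init;

static int __init example_mark_init(void)
{
	/* Written exactly once at boot; the pointer is read-only afterwards. */
	example_mark_cachep = KMEM_CACHE(example_mark, SLAB_PANIC);
	return 0;
}
fs_initcall(example_mark_init);
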
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 647a22433bd8..9a4b228d42fa 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -84,7 +84,7 @@ slow:
return -ENOMEM;
}
inode->i_ino = ns->inum;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_flags |= S_IMMUTABLE;
inode->i_mode = S_IFREG | S_IRUGO;
inode->i_fop = &ns_file_operations;
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 4e158bce4192..71e31e789b29 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -145,13 +145,12 @@ still_busy:
}
/**
- * ntfs_read_block - fill a @page of an address space with data
- * @page: page cache page to fill with data
+ * ntfs_read_block - fill a @folio of an address space with data
+ * @folio: page cache folio to fill with data
*
- * Fill the page @page of the address space belonging to the @page->host inode.
* We read each buffer asynchronously and when all buffers are read in, our io
* completion handler ntfs_end_buffer_read_async(), if required, automatically
- * applies the mst fixups to the page before finally marking it uptodate and
+ * applies the mst fixups to the folio before finally marking it uptodate and
* unlocking it.
*
* We only enforce allocated_size limit because i_size is checked for in
@@ -161,7 +160,7 @@ still_busy:
*
* Contains an adapted version of fs/buffer.c::block_read_full_folio().
*/
-static int ntfs_read_block(struct page *page)
+static int ntfs_read_block(struct folio *folio)
{
loff_t i_size;
VCN vcn;
@@ -178,7 +177,7 @@ static int ntfs_read_block(struct page *page)
int i, nr;
unsigned char blocksize_bits;
- vi = page->mapping->host;
+ vi = folio->mapping->host;
ni = NTFS_I(vi);
vol = ni->vol;
@@ -188,15 +187,10 @@ static int ntfs_read_block(struct page *page)
blocksize = vol->sb->s_blocksize;
blocksize_bits = vol->sb->s_blocksize_bits;
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, blocksize, 0);
- if (unlikely(!page_has_buffers(page))) {
- unlock_page(page);
- return -ENOMEM;
- }
- }
- bh = head = page_buffers(page);
- BUG_ON(!bh);
+ head = folio_buffers(folio);
+ if (!head)
+ head = create_empty_buffers(folio, blocksize, 0);
+ bh = head;
/*
* We may be racing with truncate. To avoid some of the problems we
@@ -205,11 +199,11 @@ static int ntfs_read_block(struct page *page)
* may leave some buffers unmapped which are now allocated. This is
* not a problem since these buffers will just get mapped when a write
* occurs. In case of a shrinking truncate, we will detect this later
- * on due to the runlist being incomplete and if the page is being
+ * on due to the runlist being incomplete and if the folio is being
* fully truncated, truncate will throw it away as soon as we unlock
* it so no need to worry what we do with it.
*/
- iblock = (s64)page->index << (PAGE_SHIFT - blocksize_bits);
+ iblock = (s64)folio->index << (PAGE_SHIFT - blocksize_bits);
read_lock_irqsave(&ni->size_lock, flags);
lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
init_size = ni->initialized_size;
@@ -221,7 +215,7 @@ static int ntfs_read_block(struct page *page)
}
zblock = (init_size + blocksize - 1) >> blocksize_bits;
- /* Loop through all the buffers in the page. */
+ /* Loop through all the buffers in the folio. */
rl = NULL;
nr = i = 0;
do {
@@ -299,7 +293,7 @@ lock_retry_remap:
if (!err)
err = -EIO;
bh->b_blocknr = -1;
- SetPageError(page);
+ folio_set_error(folio);
ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
"attribute type 0x%x, vcn 0x%llx, "
"offset 0x%x because its location on "
@@ -312,13 +306,13 @@ lock_retry_remap:
/*
* Either iblock was outside lblock limits or
* ntfs_rl_vcn_to_lcn() returned error. Just zero that portion
- * of the page and set the buffer uptodate.
+ * of the folio and set the buffer uptodate.
*/
handle_hole:
bh->b_blocknr = -1UL;
clear_buffer_mapped(bh);
handle_zblock:
- zero_user(page, i * blocksize, blocksize);
+ folio_zero_range(folio, i * blocksize, blocksize);
if (likely(!err))
set_buffer_uptodate(bh);
} while (i++, iblock++, (bh = bh->b_this_page) != head);
@@ -349,11 +343,11 @@ handle_zblock:
return 0;
}
/* No i/o was scheduled on any of the buffers. */
- if (likely(!PageError(page)))
- SetPageUptodate(page);
+ if (likely(!folio_test_error(folio)))
+ folio_mark_uptodate(folio);
else /* Signal synchronous i/o error. */
nr = -EIO;
- unlock_page(page);
+ folio_unlock(folio);
return nr;
}
@@ -433,7 +427,7 @@ retry_readpage:
/* NInoNonResident() == NInoIndexAllocPresent() */
if (NInoNonResident(ni)) {
/* Normal, non-resident data stream. */
- return ntfs_read_block(page);
+ return ntfs_read_block(folio);
}
/*
* Attribute is resident, implying it is not compressed or encrypted.
@@ -507,28 +501,29 @@ err_out:
#ifdef NTFS_RW
/**
- * ntfs_write_block - write a @page to the backing store
- * @page: page cache page to write out
+ * ntfs_write_block - write a @folio to the backing store
+ * @folio: page cache folio to write out
* @wbc: writeback control structure
*
- * This function is for writing pages belonging to non-resident, non-mst
+ * This function is for writing folios belonging to non-resident, non-mst
* protected attributes to their backing store.
*
- * For a page with buffers, map and write the dirty buffers asynchronously
- * under page writeback. For a page without buffers, create buffers for the
- * page, then proceed as above.
+ * For a folio with buffers, map and write the dirty buffers asynchronously
+ * under folio writeback. For a folio without buffers, create buffers for the
+ * folio, then proceed as above.
*
- * If a page doesn't have buffers the page dirty state is definitive. If a page
- * does have buffers, the page dirty state is just a hint, and the buffer dirty
- * state is definitive. (A hint which has rules: dirty buffers against a clean
- * page is illegal. Other combinations are legal and need to be handled. In
- * particular a dirty page containing clean buffers for example.)
+ * If a folio doesn't have buffers the folio dirty state is definitive. If
+ * a folio does have buffers, the folio dirty state is just a hint,
+ * and the buffer dirty state is definitive. (A hint which has rules:
+ * dirty buffers against a clean folio is illegal. Other combinations are
+ * legal and need to be handled. In particular a dirty folio containing
+ * clean buffers for example.)
*
* Return 0 on success and -errno on error.
*
* Based on ntfs_read_block() and __block_write_full_folio().
*/
-static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
+static int ntfs_write_block(struct folio *folio, struct writeback_control *wbc)
{
VCN vcn;
LCN lcn;
@@ -546,41 +541,29 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
bool need_end_writeback;
unsigned char blocksize_bits;
- vi = page->mapping->host;
+ vi = folio->mapping->host;
ni = NTFS_I(vi);
vol = ni->vol;
ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
- "0x%lx.", ni->mft_no, ni->type, page->index);
+ "0x%lx.", ni->mft_no, ni->type, folio->index);
BUG_ON(!NInoNonResident(ni));
BUG_ON(NInoMstProtected(ni));
blocksize = vol->sb->s_blocksize;
blocksize_bits = vol->sb->s_blocksize_bits;
- if (!page_has_buffers(page)) {
- BUG_ON(!PageUptodate(page));
- create_empty_buffers(page, blocksize,
+ head = folio_buffers(folio);
+ if (!head) {
+ BUG_ON(!folio_test_uptodate(folio));
+ head = create_empty_buffers(folio, blocksize,
(1 << BH_Uptodate) | (1 << BH_Dirty));
- if (unlikely(!page_has_buffers(page))) {
- ntfs_warning(vol->sb, "Error allocating page "
- "buffers. Redirtying page so we try "
- "again later.");
- /*
- * Put the page back on mapping->dirty_pages, but leave
- * its buffers' dirty state as-is.
- */
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
- }
}
- bh = head = page_buffers(page);
- BUG_ON(!bh);
+ bh = head;
/* NOTE: Different naming scheme to ntfs_read_block()! */
- /* The first block in the page. */
- block = (s64)page->index << (PAGE_SHIFT - blocksize_bits);
+ /* The first block in the folio. */
+ block = (s64)folio->index << (PAGE_SHIFT - blocksize_bits);
read_lock_irqsave(&ni->size_lock, flags);
i_size = i_size_read(vi);
@@ -597,14 +580,14 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
* Be very careful. We have no exclusion from block_dirty_folio
* here, and the (potentially unmapped) buffers may become dirty at
* any time. If a buffer becomes dirty here after we've inspected it
- * then we just miss that fact, and the page stays dirty.
+ * then we just miss that fact, and the folio stays dirty.
*
* Buffers outside i_size may be dirtied by block_dirty_folio;
* handle that here by just cleaning them.
*/
/*
- * Loop through all the buffers in the page, mapping all the dirty
+ * Loop through all the buffers in the folio, mapping all the dirty
* buffers to disk addresses and handling any aliases from the
* underlying block device's mapping.
*/
@@ -616,13 +599,13 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
if (unlikely(block >= dblock)) {
/*
* Mapped buffers outside i_size will occur, because
- * this page can be outside i_size when there is a
+ * this folio can be outside i_size when there is a
* truncate in progress. The contents of such buffers
* were zeroed by ntfs_writepage().
*
* FIXME: What about the small race window where
* ntfs_writepage() has not done any clearing because
- * the page was within i_size but before we get here,
+ * the folio was within i_size but before we get here,
* vmtruncate() modifies i_size?
*/
clear_buffer_dirty(bh);
@@ -638,38 +621,38 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
if (unlikely((block >= iblock) &&
(initialized_size < i_size))) {
/*
- * If this page is fully outside initialized
- * size, zero out all pages between the current
- * initialized size and the current page. Just
+ * If this folio is fully outside initialized
+ * size, zero out all folios between the current
+ * initialized size and the current folio. Just
* use ntfs_read_folio() to do the zeroing
* transparently.
*/
if (block > iblock) {
// TODO:
- // For each page do:
- // - read_cache_page()
- // Again for each page do:
- // - wait_on_page_locked()
- // - Check (PageUptodate(page) &&
- // !PageError(page))
+ // For each folio do:
+ // - read_cache_folio()
+ // Again for each folio do:
+ // - wait_on_folio_locked()
+ // - Check (folio_test_uptodate(folio) &&
+ // !folio_test_error(folio))
// Update initialized size in the attribute and
// in the inode.
- // Again, for each page do:
+ // Again, for each folio do:
// block_dirty_folio();
- // put_page()
+ // folio_put()
// We don't need to wait on the writes.
// Update iblock.
}
/*
- * The current page straddles initialized size. Zero
+ * The current folio straddles initialized size. Zero
* all non-uptodate buffers and set them uptodate (and
* dirty?). Note, there aren't any non-uptodate buffers
- * if the page is uptodate.
- * FIXME: For an uptodate page, the buffers may need to
+ * if the folio is uptodate.
+ * FIXME: For an uptodate folio, the buffers may need to
* be written out because they were not initialized on
* disk before.
*/
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
// TODO:
// Zero any non-uptodate buffers up to i_size.
// Set them uptodate and dirty.
@@ -727,14 +710,14 @@ lock_retry_remap:
unsigned long *bpos, *bend;
/* Check if the buffer is zero. */
- kaddr = kmap_atomic(page);
- bpos = (unsigned long *)(kaddr + bh_offset(bh));
- bend = (unsigned long *)((u8*)bpos + blocksize);
+ kaddr = kmap_local_folio(folio, bh_offset(bh));
+ bpos = (unsigned long *)kaddr;
+ bend = (unsigned long *)(kaddr + blocksize);
do {
if (unlikely(*bpos))
break;
} while (likely(++bpos < bend));
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
if (bpos == bend) {
/*
* Buffer is zero and sparse, no need to write
@@ -774,7 +757,7 @@ lock_retry_remap:
if (err == -ENOENT || lcn == LCN_ENOENT) {
bh->b_blocknr = -1;
clear_buffer_dirty(bh);
- zero_user(page, bh_offset(bh), blocksize);
+ folio_zero_range(folio, bh_offset(bh), blocksize);
set_buffer_uptodate(bh);
err = 0;
continue;
@@ -801,7 +784,7 @@ lock_retry_remap:
bh = head;
/* Just an optimization, so ->read_folio() is not called later. */
- if (unlikely(!PageUptodate(page))) {
+ if (unlikely(!folio_test_uptodate(folio))) {
int uptodate = 1;
do {
if (!buffer_uptodate(bh)) {
@@ -811,7 +794,7 @@ lock_retry_remap:
}
} while ((bh = bh->b_this_page) != head);
if (uptodate)
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
/* Setup all mapped, dirty buffers for async write i/o. */
@@ -826,7 +809,7 @@ lock_retry_remap:
} else if (unlikely(err)) {
/*
* For the error case. The buffer may have been set
- * dirty during attachment to a dirty page.
+ * dirty during attachment to a dirty folio.
*/
if (err != -ENOMEM)
clear_buffer_dirty(bh);
@@ -839,20 +822,20 @@ lock_retry_remap:
err = 0;
else if (err == -ENOMEM) {
ntfs_warning(vol->sb, "Error allocating memory. "
- "Redirtying page so we try again "
+ "Redirtying folio so we try again "
"later.");
/*
- * Put the page back on mapping->dirty_pages, but
+ * Put the folio back on mapping->dirty_pages, but
* leave its buffer's dirty state as-is.
*/
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
err = 0;
} else
- SetPageError(page);
+ folio_set_error(folio);
}
- BUG_ON(PageWriteback(page));
- set_page_writeback(page); /* Keeps try_to_free_buffers() away. */
+ BUG_ON(folio_test_writeback(folio));
+ folio_start_writeback(folio); /* Keeps try_to_free_buffers() away. */
/* Submit the prepared buffers for i/o. */
need_end_writeback = true;
@@ -864,11 +847,11 @@ lock_retry_remap:
}
bh = next;
} while (bh != head);
- unlock_page(page);
+ folio_unlock(folio);
- /* If no i/o was started, need to end_page_writeback(). */
+ /* If no i/o was started, need to end writeback here. */
if (unlikely(need_end_writeback))
- end_page_writeback(page);
+ folio_end_writeback(folio);
ntfs_debug("Done.");
return err;
@@ -1337,8 +1320,9 @@ done:
*/
static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(page);
loff_t i_size;
- struct inode *vi = page->mapping->host;
+ struct inode *vi = folio->mapping->host;
ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
char *addr;
ntfs_attr_search_ctx *ctx = NULL;
@@ -1347,14 +1331,13 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
int err;
retry_writepage:
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
i_size = i_size_read(vi);
- /* Is the page fully outside i_size? (truncate in progress) */
- if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
+ /* Is the folio fully outside i_size? (truncate in progress) */
+ if (unlikely(folio->index >= (i_size + PAGE_SIZE - 1) >>
PAGE_SHIFT)) {
- struct folio *folio = page_folio(page);
/*
- * The page may have dirty, unmapped buffers. Make them
+ * The folio may have dirty, unmapped buffers. Make them
* freeable here, so the page does not leak.
*/
block_invalidate_folio(folio, 0, folio_size(folio));
@@ -1373,7 +1356,7 @@ retry_writepage:
if (ni->type != AT_INDEX_ALLOCATION) {
/* If file is encrypted, deny access, just like NT4. */
if (NInoEncrypted(ni)) {
- unlock_page(page);
+ folio_unlock(folio);
BUG_ON(ni->type != AT_DATA);
ntfs_debug("Denying write access to encrypted file.");
return -EACCES;
@@ -1384,14 +1367,14 @@ retry_writepage:
BUG_ON(ni->name_len);
// TODO: Implement and replace this with
// return ntfs_write_compressed_block(page);
- unlock_page(page);
+ folio_unlock(folio);
ntfs_error(vi->i_sb, "Writing to compressed files is "
"not supported yet. Sorry.");
return -EOPNOTSUPP;
}
// TODO: Implement and remove this check.
if (NInoNonResident(ni) && NInoSparse(ni)) {
- unlock_page(page);
+ folio_unlock(folio);
ntfs_error(vi->i_sb, "Writing to sparse files is not "
"supported yet. Sorry.");
return -EOPNOTSUPP;
@@ -1400,34 +1383,34 @@ retry_writepage:
/* NInoNonResident() == NInoIndexAllocPresent() */
if (NInoNonResident(ni)) {
/* We have to zero every time due to mmap-at-end-of-file. */
- if (page->index >= (i_size >> PAGE_SHIFT)) {
- /* The page straddles i_size. */
- unsigned int ofs = i_size & ~PAGE_MASK;
- zero_user_segment(page, ofs, PAGE_SIZE);
+ if (folio->index >= (i_size >> PAGE_SHIFT)) {
+ /* The folio straddles i_size. */
+ unsigned int ofs = i_size & (folio_size(folio) - 1);
+ folio_zero_segment(folio, ofs, folio_size(folio));
}
/* Handle mst protected attributes. */
if (NInoMstProtected(ni))
return ntfs_write_mst_block(page, wbc);
/* Normal, non-resident data stream. */
- return ntfs_write_block(page, wbc);
+ return ntfs_write_block(folio, wbc);
}
/*
* Attribute is resident, implying it is not compressed, encrypted, or
* mst protected. This also means the attribute is smaller than an mft
- * record and hence smaller than a page, so can simply return error on
- * any pages with index above 0. Note the attribute can actually be
+ * record and hence smaller than a folio, so can simply return error on
+ * any folios with index above 0. Note the attribute can actually be
* marked compressed but if it is resident the actual data is not
* compressed so we are ok to ignore the compressed flag here.
*/
- BUG_ON(page_has_buffers(page));
- BUG_ON(!PageUptodate(page));
- if (unlikely(page->index > 0)) {
- ntfs_error(vi->i_sb, "BUG()! page->index (0x%lx) > 0. "
- "Aborting write.", page->index);
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
- unlock_page(page);
- end_page_writeback(page);
+ BUG_ON(folio_buffers(folio));
+ BUG_ON(!folio_test_uptodate(folio));
+ if (unlikely(folio->index > 0)) {
+ ntfs_error(vi->i_sb, "BUG()! folio->index (0x%lx) > 0. "
+ "Aborting write.", folio->index);
+ BUG_ON(folio_test_writeback(folio));
+ folio_start_writeback(folio);
+ folio_unlock(folio);
+ folio_end_writeback(folio);
return -EIO;
}
if (!NInoAttr(ni))
@@ -1460,12 +1443,12 @@ retry_writepage:
if (unlikely(err))
goto err_out;
/*
- * Keep the VM happy. This must be done otherwise the radix-tree tag
- * PAGECACHE_TAG_DIRTY remains set even though the page is clean.
+ * Keep the VM happy. This must be done otherwise
+ * PAGECACHE_TAG_DIRTY remains set even though the folio is clean.
*/
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
- unlock_page(page);
+ BUG_ON(folio_test_writeback(folio));
+ folio_start_writeback(folio);
+ folio_unlock(folio);
attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
i_size = i_size_read(vi);
if (unlikely(attr_len > i_size)) {
@@ -1480,18 +1463,18 @@ retry_writepage:
/* Shrinking cannot fail. */
BUG_ON(err);
}
- addr = kmap_atomic(page);
- /* Copy the data from the page to the mft record. */
+ addr = kmap_local_folio(folio, 0);
+ /* Copy the data from the folio to the mft record. */
memcpy((u8*)ctx->attr +
le16_to_cpu(ctx->attr->data.resident.value_offset),
addr, attr_len);
- /* Zero out of bounds area in the page cache page. */
- memset(addr + attr_len, 0, PAGE_SIZE - attr_len);
- kunmap_atomic(addr);
- flush_dcache_page(page);
+ /* Zero out of bounds area in the page cache folio. */
+ memset(addr + attr_len, 0, folio_size(folio) - attr_len);
+ kunmap_local(addr);
+ flush_dcache_folio(folio);
flush_dcache_mft_record_page(ctx->ntfs_ino);
- /* We are done with the page. */
- end_page_writeback(page);
+ /* We are done with the folio. */
+ folio_end_writeback(folio);
/* Finally, mark the mft record dirty, so it gets written back. */
mark_mft_record_dirty(ctx->ntfs_ino);
ntfs_attr_put_search_ctx(ctx);
@@ -1502,18 +1485,18 @@ err_out:
ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying "
"page so we try again later.");
/*
- * Put the page back on mapping->dirty_pages, but leave its
+ * Put the folio back on mapping->dirty_pages, but leave its
* buffers' dirty state as-is.
*/
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
err = 0;
} else {
ntfs_error(vi->i_sb, "Resident attribute write failed with "
"error %i.", err);
- SetPageError(page);
+ folio_set_error(folio);
NVolSetErrors(ni->vol);
}
- unlock_page(page);
+ folio_unlock(folio);
if (ctx)
ntfs_attr_put_search_ctx(ctx);
if (m)
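
A minimal sketch of the local-mapping change used in ntfs_write_block() above, assuming the buffer lies within a single page of the folio (true for the order-0 folios this path handles); kmap_local_folio() maps starting at the block's own offset, so no bh_offset() arithmetic on a whole-page mapping is needed. The helper name is illustrative only:

#include <linux/buffer_head.h>
#include <linux/highmem.h>

static bool example_buffer_is_zero(struct folio *folio, struct buffer_head *bh,
				   unsigned int blocksize)
{
	unsigned long *kaddr = kmap_local_folio(folio, bh_offset(bh));
	unsigned long *p = kaddr;
	unsigned long *end = kaddr + blocksize / sizeof(*p);
	bool zero = true;

	while (p < end) {
		if (*p++) {
			zero = false;
			break;
		}
	}
	kunmap_local(kaddr);
	return zero;
}
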
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index cbc545999cfe..297c0b9db621 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -567,7 +567,7 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
LCN lcn;
s64 bh_pos, vcn_len, end, initialized_size;
sector_t lcn_block;
- struct page *page;
+ struct folio *folio;
struct inode *vi;
ntfs_inode *ni, *base_ni = NULL;
ntfs_volume *vol;
@@ -601,20 +601,6 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
(long long)pos, bytes);
blocksize = vol->sb->s_blocksize;
blocksize_bits = vol->sb->s_blocksize_bits;
- u = 0;
- do {
- page = pages[u];
- BUG_ON(!page);
- /*
- * create_empty_buffers() will create uptodate/dirty buffers if
- * the page is uptodate/dirty.
- */
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, blocksize, 0);
- if (unlikely(!page_has_buffers(page)))
- return -ENOMEM;
- }
- } while (++u < nr_pages);
rl_write_locked = false;
rl = NULL;
err = 0;
@@ -626,14 +612,21 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
end = pos + bytes;
cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
/*
- * Loop over each page and for each page over each buffer. Use goto to
+ * Loop over each buffer in each folio. Use goto to
* reduce indentation.
*/
u = 0;
-do_next_page:
- page = pages[u];
- bh_pos = (s64)page->index << PAGE_SHIFT;
- bh = head = page_buffers(page);
+do_next_folio:
+ folio = page_folio(pages[u]);
+ bh_pos = folio_pos(folio);
+ head = folio_buffers(folio);
+ if (!head)
+ /*
+ * create_empty_buffers() will create uptodate/dirty
+ * buffers if the folio is uptodate/dirty.
+ */
+ head = create_empty_buffers(folio, blocksize, 0);
+ bh = head;
do {
VCN cdelta;
s64 bh_end;
@@ -653,15 +646,15 @@ do_next_page:
if (buffer_uptodate(bh))
continue;
/*
- * The buffer is not uptodate. If the page is uptodate
+ * The buffer is not uptodate. If the folio is uptodate
* set the buffer uptodate and otherwise ignore it.
*/
- if (PageUptodate(page)) {
+ if (folio_test_uptodate(folio)) {
set_buffer_uptodate(bh);
continue;
}
/*
- * Neither the page nor the buffer are uptodate. If
+ * Neither the folio nor the buffer are uptodate. If
* the buffer is only partially being written to, we
* need to read it in before the write, i.e. now.
*/
@@ -679,7 +672,7 @@ do_next_page:
ntfs_submit_bh_for_read(bh);
*wait_bh++ = bh;
} else {
- zero_user(page, bh_offset(bh),
+ folio_zero_range(folio, bh_offset(bh),
blocksize);
set_buffer_uptodate(bh);
}
@@ -706,7 +699,7 @@ map_buffer_cached:
(bh_cofs >> blocksize_bits);
set_buffer_mapped(bh);
/*
- * If the page is uptodate so is the buffer. If the
+ * If the folio is uptodate so is the buffer. If the
* buffer is fully outside the write, we ignore it if
* it was already allocated and we mark it dirty so it
* gets written out if we allocated it. On the other
@@ -714,7 +707,7 @@ map_buffer_cached:
* marking it dirty we set buffer_new so we can do
* error recovery.
*/
- if (PageUptodate(page)) {
+ if (folio_test_uptodate(folio)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
if (unlikely(was_hole)) {
@@ -754,7 +747,8 @@ map_buffer_cached:
ntfs_submit_bh_for_read(bh);
*wait_bh++ = bh;
} else {
- zero_user(page, bh_offset(bh),
+ folio_zero_range(folio,
+ bh_offset(bh),
blocksize);
set_buffer_uptodate(bh);
}
@@ -773,7 +767,7 @@ map_buffer_cached:
*/
if (bh_end <= pos || bh_pos >= end) {
if (!buffer_uptodate(bh)) {
- zero_user(page, bh_offset(bh),
+ folio_zero_range(folio, bh_offset(bh),
blocksize);
set_buffer_uptodate(bh);
}
@@ -786,7 +780,7 @@ map_buffer_cached:
u8 *kaddr;
unsigned pofs;
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_folio(folio, 0);
if (bh_pos < pos) {
pofs = bh_pos & ~PAGE_MASK;
memset(kaddr + pofs, 0, pos - bh_pos);
@@ -795,8 +789,8 @@ map_buffer_cached:
pofs = end & ~PAGE_MASK;
memset(kaddr + pofs, 0, bh_end - end);
}
- kunmap_atomic(kaddr);
- flush_dcache_page(page);
+ kunmap_local(kaddr);
+ flush_dcache_folio(folio);
}
continue;
}
@@ -809,11 +803,12 @@ map_buffer_cached:
initialized_size = ni->allocated_size;
read_unlock_irqrestore(&ni->size_lock, flags);
if (bh_pos > initialized_size) {
- if (PageUptodate(page)) {
+ if (folio_test_uptodate(folio)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
} else if (!buffer_uptodate(bh)) {
- zero_user(page, bh_offset(bh), blocksize);
+ folio_zero_range(folio, bh_offset(bh),
+ blocksize);
set_buffer_uptodate(bh);
}
continue;
@@ -927,17 +922,17 @@ rl_not_mapped_enoent:
bh->b_blocknr = -1;
/*
* If the buffer is uptodate we skip it. If it
- * is not but the page is uptodate, we can set
- * the buffer uptodate. If the page is not
+ * is not but the folio is uptodate, we can set
+ * the buffer uptodate. If the folio is not
* uptodate, we can clear the buffer and set it
* uptodate. Whether this is worthwhile is
* debatable and this could be removed.
*/
- if (PageUptodate(page)) {
+ if (folio_test_uptodate(folio)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
} else if (!buffer_uptodate(bh)) {
- zero_user(page, bh_offset(bh),
+ folio_zero_range(folio, bh_offset(bh),
blocksize);
set_buffer_uptodate(bh);
}
@@ -1167,7 +1162,7 @@ rl_not_mapped_enoent:
} while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
/* If there are no errors, do the next page. */
if (likely(!err && ++u < nr_pages))
- goto do_next_page;
+ goto do_next_folio;
/* If there are no errors, release the runlist lock if we took it. */
if (likely(!err)) {
if (unlikely(rl_write_locked)) {
@@ -1185,9 +1180,8 @@ rl_not_mapped_enoent:
bh = *--wait_bh;
wait_on_buffer(bh);
if (likely(buffer_uptodate(bh))) {
- page = bh->b_page;
- bh_pos = ((s64)page->index << PAGE_SHIFT) +
- bh_offset(bh);
+ folio = bh->b_folio;
+ bh_pos = folio_pos(folio) + bh_offset(bh);
/*
* If the buffer overflows the initialized size, need
* to zero the overflowing region.
@@ -1197,7 +1191,7 @@ rl_not_mapped_enoent:
if (likely(bh_pos < initialized_size))
ofs = initialized_size - bh_pos;
- zero_user_segment(page, bh_offset(bh) + ofs,
+ folio_zero_segment(folio, bh_offset(bh) + ofs,
blocksize);
}
} else /* if (unlikely(!buffer_uptodate(bh))) */
@@ -1324,21 +1318,20 @@ rl_not_mapped_enoent:
u = 0;
end = bh_cpos << vol->cluster_size_bits;
do {
- page = pages[u];
- bh = head = page_buffers(page);
+ folio = page_folio(pages[u]);
+ bh = head = folio_buffers(folio);
do {
if (u == nr_pages &&
- ((s64)page->index << PAGE_SHIFT) +
- bh_offset(bh) >= end)
+ folio_pos(folio) + bh_offset(bh) >= end)
break;
if (!buffer_new(bh))
continue;
clear_buffer_new(bh);
if (!buffer_uptodate(bh)) {
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
set_buffer_uptodate(bh);
else {
- zero_user(page, bh_offset(bh),
+ folio_zero_range(folio, bh_offset(bh),
blocksize);
set_buffer_uptodate(bh);
}
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 99ac6ea277c4..aba1e22db4e9 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -648,7 +648,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
* mtime is the last change of the data within the file. Not changed
* when only metadata is changed, e.g. a rename doesn't affect mtime.
*/
- vi->i_mtime = ntfs2utc(si->last_data_change_time);
+ inode_set_mtime_to_ts(vi, ntfs2utc(si->last_data_change_time));
/*
* ctime is the last change of the metadata of the file. This obviously
* always changes, when mtime is changed. ctime can be changed on its
@@ -659,7 +659,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
* Last access to the data within the file. Not changed during a rename
* for example but changed whenever the file is written to.
*/
- vi->i_atime = ntfs2utc(si->last_access_time);
+ inode_set_atime_to_ts(vi, ntfs2utc(si->last_access_time));
/* Find the attribute list attribute if present. */
ntfs_attr_reinit_search_ctx(ctx);
@@ -1217,9 +1217,9 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
vi->i_uid = base_vi->i_uid;
vi->i_gid = base_vi->i_gid;
set_nlink(vi, base_vi->i_nlink);
- vi->i_mtime = base_vi->i_mtime;
+ inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi));
inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
- vi->i_atime = base_vi->i_atime;
+ inode_set_atime_to_ts(vi, inode_get_atime(base_vi));
vi->i_generation = ni->seq_no = base_ni->seq_no;
/* Set inode type to zero but preserve permissions. */
@@ -1483,9 +1483,9 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
vi->i_uid = base_vi->i_uid;
vi->i_gid = base_vi->i_gid;
set_nlink(vi, base_vi->i_nlink);
- vi->i_mtime = base_vi->i_mtime;
+ inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi));
inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
- vi->i_atime = base_vi->i_atime;
+ inode_set_atime_to_ts(vi, inode_get_atime(base_vi));
vi->i_generation = ni->seq_no = base_ni->seq_no;
/* Set inode type to zero but preserve permissions. */
vi->i_mode = base_vi->i_mode & ~S_IFMT;
@@ -2805,13 +2805,14 @@ done:
if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) {
struct timespec64 now = current_time(VFS_I(base_ni));
struct timespec64 ctime = inode_get_ctime(VFS_I(base_ni));
+ struct timespec64 mtime = inode_get_mtime(VFS_I(base_ni));
int sync_it = 0;
- if (!timespec64_equal(&VFS_I(base_ni)->i_mtime, &now) ||
+ if (!timespec64_equal(&mtime, &now) ||
!timespec64_equal(&ctime, &now))
sync_it = 1;
inode_set_ctime_to_ts(VFS_I(base_ni), now);
- VFS_I(base_ni)->i_mtime = now;
+ inode_set_mtime_to_ts(VFS_I(base_ni), now);
if (sync_it)
mark_inode_dirty_sync(VFS_I(base_ni));
@@ -2925,9 +2926,9 @@ int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
}
if (ia_valid & ATTR_ATIME)
- vi->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(vi, attr->ia_atime);
if (ia_valid & ATTR_MTIME)
- vi->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(vi, attr->ia_mtime);
if (ia_valid & ATTR_CTIME)
inode_set_ctime_to_ts(vi, attr->ia_ctime);
mark_inode_dirty(vi);
@@ -2996,7 +2997,7 @@ int __ntfs_write_inode(struct inode *vi, int sync)
si = (STANDARD_INFORMATION*)((u8*)ctx->attr +
le16_to_cpu(ctx->attr->data.resident.value_offset));
/* Update the access times if they have changed. */
- nt = utc2ntfs(vi->i_mtime);
+ nt = utc2ntfs(inode_get_mtime(vi));
if (si->last_data_change_time != nt) {
ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, "
"new = 0x%llx", vi->i_ino, (long long)
@@ -3014,7 +3015,7 @@ int __ntfs_write_inode(struct inode *vi, int sync)
si->last_mft_change_time = nt;
modified = true;
}
- nt = utc2ntfs(vi->i_atime);
+ nt = utc2ntfs(inode_get_atime(vi));
if (si->last_access_time != nt) {
ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, "
"new = 0x%llx", vi->i_ino,
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index ad1a8f72da22..6fd1dc4b08c8 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -2682,7 +2682,7 @@ mft_rec_already_initialized:
vi->i_mode &= ~S_IWUGO;
/* Set the inode times to the current time. */
- vi->i_atime = vi->i_mtime = inode_set_ctime_current(vi);
+ simple_inode_init_ts(vi);
/*
* Set the file size to 0, the ntfs inode sizes are set to 0 by
* the call to ntfs_init_big_inode() below.
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index ab44f2db533b..d7498ddc4a72 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -384,6 +384,7 @@ static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid,
* and due to using iget() whereas NTFS needs ntfs_iget().
*/
const struct export_operations ntfs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.get_parent = ntfs_get_parent, /* Find the parent of a given
directory. */
.fh_to_dentry = ntfs_fh_to_dentry,
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 1f7a194983c5..a5a30a24ce5d 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -187,7 +187,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
struct buffer_head *head, *bh;
u32 bh_next, bh_off, to;
sector_t iblock;
- struct page *page;
+ struct folio *folio;
for (; idx < idx_end; idx += 1, from = 0) {
page_off = (loff_t)idx << PAGE_SHIFT;
@@ -195,16 +195,17 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
PAGE_SIZE;
iblock = page_off >> inode->i_blkbits;
- page = find_or_create_page(mapping, idx,
- mapping_gfp_constraint(mapping,
- ~__GFP_FS));
- if (!page)
- return -ENOMEM;
+ folio = __filemap_get_folio(mapping, idx,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mapping_gfp_constraint(mapping, ~__GFP_FS));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize, 0);
+ head = folio_buffers(folio);
+ if (!head)
+ head = create_empty_buffers(folio, blocksize, 0);
- bh = head = page_buffers(page);
+ bh = head;
bh_off = 0;
do {
bh_next = bh_off + blocksize;
@@ -220,14 +221,14 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
}
/* Ok, it's mapped. Make sure it's up-to-date. */
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
set_buffer_uptodate(bh);
if (!buffer_uptodate(bh)) {
err = bh_read(bh, 0);
if (err < 0) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
goto out;
}
}
@@ -237,10 +238,10 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
} while (bh_off = bh_next, iblock += 1,
head != (bh = bh->b_this_page));
- zero_user_segment(page, from, to);
+ folio_zero_segment(folio, from, to);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
cond_resched();
}
out:
@@ -342,7 +343,7 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count,
err = 0;
}
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
if (IS_SYNC(inode)) {
@@ -400,7 +401,7 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size)
ni_unlock(ni);
ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE;
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
if (!IS_DIRSYNC(inode)) {
dirty = 1;
} else {
@@ -642,7 +643,7 @@ out:
filemap_invalidate_unlock(mapping);
if (!err) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
}
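
A minimal sketch of the folio lookup that replaces find_or_create_page() in ntfs_zero_range() above; note that __filemap_get_folio() reports failure with an ERR_PTR(), not NULL. The wrapper is illustrative only:

#include <linux/pagemap.h>

static struct folio *example_get_locked_folio(struct address_space *mapping,
					      pgoff_t index)
{
	return __filemap_get_folio(mapping, index,
				   FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
				   mapping_gfp_constraint(mapping, ~__GFP_FS));
}
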
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index dad976a68985..3df2d9e34b91 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -3271,7 +3271,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
if (is_rec_inuse(ni->mi.mrec) &&
!(sbi->flags & NTFS_FLAGS_LOG_REPLAYING) && inode->i_nlink) {
bool modified = false;
- struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 ts;
/* Update times in standard attribute. */
std = ni_std(ni);
@@ -3281,19 +3281,22 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
}
/* Update the access times if they have changed. */
- dup.m_time = kernel2nt(&inode->i_mtime);
+ ts = inode_get_mtime(inode);
+ dup.m_time = kernel2nt(&ts);
if (std->m_time != dup.m_time) {
std->m_time = dup.m_time;
modified = true;
}
- dup.c_time = kernel2nt(&ctime);
+ ts = inode_get_ctime(inode);
+ dup.c_time = kernel2nt(&ts);
if (std->c_time != dup.c_time) {
std->c_time = dup.c_time;
modified = true;
}
- dup.a_time = kernel2nt(&inode->i_atime);
+ ts = inode_get_atime(inode);
+ dup.a_time = kernel2nt(&ts);
if (std->a_time != dup.a_time) {
std->a_time = dup.a_time;
modified = true;
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index d6d021e19aaa..5e3d71374918 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -44,7 +44,7 @@ static struct inode *ntfs_read_mft(struct inode *inode,
u64 t64;
struct MFT_REC *rec;
struct runs_tree *run;
- struct timespec64 ctime;
+ struct timespec64 ts;
inode->i_op = NULL;
/* Setup 'uid' and 'gid' */
@@ -169,10 +169,12 @@ next_attr:
#ifdef STATX_BTIME
nt2kernel(std5->cr_time, &ni->i_crtime);
#endif
- nt2kernel(std5->a_time, &inode->i_atime);
- nt2kernel(std5->c_time, &ctime);
- inode_set_ctime_to_ts(inode, ctime);
- nt2kernel(std5->m_time, &inode->i_mtime);
+ nt2kernel(std5->a_time, &ts);
+ inode_set_atime_to_ts(inode, ts);
+ nt2kernel(std5->c_time, &ts);
+ inode_set_ctime_to_ts(inode, ts);
+ nt2kernel(std5->m_time, &ts);
+ inode_set_mtime_to_ts(inode, ts);
ni->std_fa = std5->fa;
@@ -960,7 +962,8 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
if (err >= 0) {
if (!(ni->std_fa & FILE_ATTRIBUTE_ARCHIVE)) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_current(inode));
ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE;
dirty = true;
}
@@ -1660,9 +1663,11 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
d_instantiate(dentry, inode);
/* Set original time. inode times (i_ctime) may be changed in ntfs_init_acl. */
- inode->i_atime = inode->i_mtime =
- inode_set_ctime_to_ts(inode, ni->i_crtime);
- dir->i_mtime = inode_set_ctime_to_ts(dir, ni->i_crtime);
+ inode_set_atime_to_ts(inode, ni->i_crtime);
+ inode_set_ctime_to_ts(inode, ni->i_crtime);
+ inode_set_mtime_to_ts(inode, ni->i_crtime);
+ inode_set_mtime_to_ts(dir, ni->i_crtime);
+ inode_set_ctime_to_ts(dir, ni->i_crtime);
mark_inode_dirty(dir);
mark_inode_dirty(inode);
@@ -1768,7 +1773,7 @@ int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry)
if (!err) {
drop_nlink(inode);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
if (inode->i_nlink)
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index eedacf94edd8..ee3093be5170 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -156,8 +156,8 @@ static int ntfs_link(struct dentry *ode, struct inode *dir, struct dentry *de)
err = ntfs_link_inode(inode, de);
if (!err) {
- dir->i_mtime = inode_set_ctime_to_ts(
- inode, inode_set_ctime_current(dir));
+ inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(inode);
mark_inode_dirty(dir);
d_instantiate(de, inode);
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 0e6a2777870c..f6706143d14b 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -872,7 +872,7 @@ int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode,
int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry);
ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
-extern const struct xattr_handler *ntfs_xattr_handlers[];
+extern const struct xattr_handler * const ntfs_xattr_handlers[];
int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size);
void ntfs_get_wsl_perm(struct inode *inode);
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index f763e3256ccc..9153dffde950 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -811,6 +811,7 @@ static int ntfs_nfs_commit_metadata(struct inode *inode)
}
static const struct export_operations ntfs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = ntfs_fh_to_dentry,
.fh_to_parent = ntfs_fh_to_parent,
.get_parent = ntfs3_get_parent,
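
A minimal sketch of wiring the generic 32-bit-inode file-handle encoder into an export_operations table, as the ntfs and ntfs3 hunks above do; the remaining methods are filesystem specific and omitted here:

#include <linux/exportfs.h>

static const struct export_operations example_export_ops = {
	.encode_fh	= generic_encode_ino32_fh,
	/* .fh_to_dentry, .fh_to_parent, .get_parent omitted in this sketch */
};
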
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index 4920548192a0..4274b6f31cfa 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -1021,7 +1021,7 @@ static const struct xattr_handler ntfs_other_xattr_handler = {
.list = ntfs_xattr_user_list,
};
-const struct xattr_handler *ntfs_xattr_handlers[] = {
+const struct xattr_handler * const ntfs_xattr_handlers[] = {
&ntfs_other_xattr_handler,
NULL,
};
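
A minimal sketch of the constified xattr handler table shape introduced above, with a hypothetical handler whose callbacks are omitted; both the handler and the array of pointers to it can now live in read-only data:

#include <linux/xattr.h>

static const struct xattr_handler example_xattr_handler = {
	.prefix	= "user.",
	/* .get and .set callbacks omitted in this sketch */
};

const struct xattr_handler * const example_xattr_handlers[] = {
	&example_xattr_handler,
	NULL,
};
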
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index e75137a8e7cb..62464d194da3 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -193,8 +193,8 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
inode->i_mode = new_mode;
inode_set_ctime_current(inode);
di->i_mode = cpu_to_le16(inode->i_mode);
- di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index aef58f1395c8..91b32b2377ac 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -967,7 +967,14 @@ int ocfs2_num_free_extents(struct ocfs2_extent_tree *et)
el = &eb->h_list;
}
- BUG_ON(el->l_tree_depth != 0);
+ if (el->l_tree_depth != 0) {
+ retval = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has leaf extent block %llu with an invalid l_tree_depth of %u\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ (unsigned long long)last_eb_blk,
+ le16_to_cpu(el->l_tree_depth));
+ goto bail;
+ }
retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
bail:
@@ -7436,10 +7443,10 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
}
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
- di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, di_bh);
@@ -7642,7 +7649,7 @@ out_mutex:
goto next_group;
}
out:
- range->len = trimmed * sb->s_blocksize;
+ range->len = trimmed * osb->s_clustersize;
return ret;
}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0fdba30740ab..ba790219d528 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -568,10 +568,10 @@ static void ocfs2_clear_page_regions(struct page *page,
* read-in the blocks at the tail of our file. Avoid reading them by
* testing i_size against each block offset.
*/
-static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
+static int ocfs2_should_read_blk(struct inode *inode, struct folio *folio,
unsigned int block_start)
{
- u64 offset = page_offset(page) + block_start;
+ u64 offset = folio_pos(folio) + block_start;
if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
return 1;
@@ -593,15 +593,16 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
struct inode *inode, unsigned int from,
unsigned int to, int new)
{
+ struct folio *folio = page_folio(page);
int ret = 0;
struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
unsigned int block_end, block_start;
unsigned int bsize = i_blocksize(inode);
- if (!page_has_buffers(page))
- create_empty_buffers(page, bsize, 0);
+ head = folio_buffers(folio);
+ if (!head)
+ head = create_empty_buffers(folio, bsize, 0);
- head = page_buffers(page);
for (bh = head, block_start = 0; bh != head || !block_start;
bh = bh->b_this_page, block_start += bsize) {
block_end = block_start + bsize;
@@ -613,7 +614,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
* they may belong to unallocated clusters.
*/
if (block_start >= to || block_end <= from) {
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
set_buffer_uptodate(bh);
continue;
}
@@ -630,11 +631,11 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
clean_bdev_bh_alias(bh);
}
- if (PageUptodate(page)) {
+ if (folio_test_uptodate(folio)) {
set_buffer_uptodate(bh);
} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
!buffer_new(bh) &&
- ocfs2_should_read_blk(inode, page, block_start) &&
+ ocfs2_should_read_blk(inode, folio, block_start) &&
(block_start < from || block_end > to)) {
bh_read_nowait(bh, 0);
*wait_bh++=bh;
@@ -668,7 +669,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
if (block_start >= to)
break;
- zero_user(page, block_start, bh->b_size);
+ folio_zero_range(folio, block_start, bh->b_size);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
@@ -2048,9 +2049,9 @@ out_write_size:
}
inode->i_blocks = ocfs2_inode_sector_count(inode);
di->i_size = cpu_to_le64((u64)i_size_read(inode));
- inode->i_mtime = inode_set_ctime_current(inode);
- di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
- di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode));
+ di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
if (handle)
ocfs2_update_inode_fsync_trans(handle, inode, 1);
}
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 196638a22b48..cdb9b9bdea1f 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -158,7 +158,7 @@ read_failure:
if (new_bh && bh) {
/* If middle bh fails, let previous bh
* finish its read and then put it to
- * aovoid bh leak
+ * avoid bh leak
*/
if (!buffer_jbd(bh))
wait_on_buffer(bh);
@@ -345,7 +345,7 @@ read_failure:
if (new_bh && bh) {
/* If middle bh fails, let previous bh
* finish its read and then put it to
- * aovoid bh leak
+ * avoid bh leak
*/
if (!buffer_jbd(bh))
wait_on_buffer(bh);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 21472e3ed182..4d7efefa98c5 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -213,7 +213,7 @@ struct o2hb_region {
unsigned int hr_num_pages;
struct page **hr_slot_data;
- struct block_device *hr_bdev;
+ struct bdev_handle *hr_bdev_handle;
struct o2hb_disk_slot *hr_slots;
/* live node map of this region */
@@ -261,6 +261,11 @@ struct o2hb_region {
int hr_last_hb_status;
};
+static inline struct block_device *reg_bdev(struct o2hb_region *reg)
+{
+ return reg->hr_bdev_handle ? reg->hr_bdev_handle->bdev : NULL;
+}
+
struct o2hb_bio_wait_ctxt {
atomic_t wc_num_reqs;
struct completion wc_io_complete;
@@ -286,7 +291,7 @@ static void o2hb_write_timeout(struct work_struct *work)
hr_write_timeout_work.work);
mlog(ML_ERROR, "Heartbeat write timeout to device %pg after %u "
- "milliseconds\n", reg->hr_bdev,
+ "milliseconds\n", reg_bdev(reg),
jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
if (o2hb_global_heartbeat_active()) {
@@ -383,7 +388,7 @@ static void o2hb_nego_timeout(struct work_struct *work)
if (!test_bit(master_node, reg->hr_nego_node_bitmap)) {
printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg).\n",
o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000,
- config_item_name(&reg->hr_item), reg->hr_bdev);
+ config_item_name(&reg->hr_item), reg_bdev(reg));
set_bit(master_node, reg->hr_nego_node_bitmap);
}
if (!bitmap_equal(reg->hr_nego_node_bitmap, live_node_bitmap,
@@ -398,7 +403,8 @@ static void o2hb_nego_timeout(struct work_struct *work)
}
printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%pg) is down.\n",
- config_item_name(&reg->hr_item), reg->hr_bdev);
+ config_item_name(&reg->hr_item),
+ reg_bdev(reg));
/* approve negotiate timeout request. */
o2hb_arm_timeout(reg);
@@ -419,7 +425,7 @@ static void o2hb_nego_timeout(struct work_struct *work)
/* negotiate timeout with master node. */
printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg), negotiate timeout with node %d.\n",
o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(&reg->hr_item),
- reg->hr_bdev, master_node);
+ reg_bdev(reg), master_node);
ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG,
master_node);
if (ret)
@@ -436,7 +442,8 @@ static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data,
nego_msg = (struct o2hb_nego_msg *)msg->buf;
printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%pg).\n",
- nego_msg->node_num, config_item_name(&reg->hr_item), reg->hr_bdev);
+ nego_msg->node_num, config_item_name(&reg->hr_item),
+ reg_bdev(reg));
if (nego_msg->node_num < O2NM_MAX_NODES)
set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap);
else
@@ -451,7 +458,7 @@ static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data,
struct o2hb_region *reg = data;
printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%pg).\n",
- config_item_name(&reg->hr_item), reg->hr_bdev);
+ config_item_name(&reg->hr_item), reg_bdev(reg));
o2hb_arm_timeout(reg);
return 0;
}
@@ -515,7 +522,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
* GFP_KERNEL that the local node can get fenced. It would be
* nicest if we could pre-allocate these bios and avoid this
* all together. */
- bio = bio_alloc(reg->hr_bdev, 16, opf, GFP_ATOMIC);
+ bio = bio_alloc(reg_bdev(reg), 16, opf, GFP_ATOMIC);
if (!bio) {
mlog(ML_ERROR, "Could not alloc slots BIO!\n");
bio = ERR_PTR(-ENOMEM);
@@ -687,7 +694,7 @@ static int o2hb_check_own_slot(struct o2hb_region *reg)
errstr = ERRSTR3;
mlog(ML_ERROR, "%s (%pg): expected(%u:0x%llx, 0x%llx), "
- "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_bdev,
+ "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg_bdev(reg),
slot->ds_node_num, (unsigned long long)slot->ds_last_generation,
(unsigned long long)slot->ds_last_time, hb_block->hb_node,
(unsigned long long)le64_to_cpu(hb_block->hb_generation),
@@ -861,7 +868,7 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg)
goto unlock;
printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n",
- config_item_name(&reg->hr_item), reg->hr_bdev);
+ config_item_name(&reg->hr_item), reg_bdev(reg));
set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
@@ -920,7 +927,7 @@ static int o2hb_check_slot(struct o2hb_region *reg,
* consider it a transient miss but don't populate any
* other values as they may be junk. */
mlog(ML_ERROR, "Node %d has written a bad crc to %pg\n",
- slot->ds_node_num, reg->hr_bdev);
+ slot->ds_node_num, reg_bdev(reg));
o2hb_dump_slot(hb_block);
slot->ds_equal_samples++;
@@ -1003,8 +1010,8 @@ fire_callbacks:
"of %u ms, but our count is %u ms.\n"
"Please double check your configuration values "
"for 'O2CB_HEARTBEAT_THRESHOLD'\n",
- slot->ds_node_num, reg->hr_bdev, slot_dead_ms,
- dead_ms);
+ slot->ds_node_num, reg_bdev(reg),
+ slot_dead_ms, dead_ms);
}
goto out;
}
@@ -1143,7 +1150,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
* can't be sure that the new block ever made it to
* disk */
mlog(ML_ERROR, "Write error %d on device \"%pg\"\n",
- write_wc.wc_error, reg->hr_bdev);
+ write_wc.wc_error, reg_bdev(reg));
ret = write_wc.wc_error;
goto bail;
}
@@ -1169,7 +1176,7 @@ bail:
printk(KERN_NOTICE "o2hb: Unable to stabilize "
"heartbeat on region %s (%pg)\n",
config_item_name(&reg->hr_item),
- reg->hr_bdev);
+ reg_bdev(reg));
atomic_set(&reg->hr_steady_iterations, 0);
reg->hr_aborted_start = 1;
wake_up(&o2hb_steady_queue);
@@ -1489,7 +1496,7 @@ static void o2hb_region_release(struct config_item *item)
struct page *page;
struct o2hb_region *reg = to_o2hb_region(item);
- mlog(ML_HEARTBEAT, "hb region release (%pg)\n", reg->hr_bdev);
+ mlog(ML_HEARTBEAT, "hb region release (%pg)\n", reg_bdev(reg));
kfree(reg->hr_tmp_block);
@@ -1502,8 +1509,8 @@ static void o2hb_region_release(struct config_item *item)
kfree(reg->hr_slot_data);
}
- if (reg->hr_bdev)
- blkdev_put(reg->hr_bdev, NULL);
+ if (reg->hr_bdev_handle)
+ bdev_release(reg->hr_bdev_handle);
kfree(reg->hr_slots);
@@ -1562,7 +1569,7 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item,
unsigned long block_bytes;
unsigned int block_bits;
- if (reg->hr_bdev)
+ if (reg->hr_bdev_handle)
return -EINVAL;
status = o2hb_read_block_input(reg, page, &block_bytes,
@@ -1591,7 +1598,7 @@ static ssize_t o2hb_region_start_block_store(struct config_item *item,
char *p = (char *)page;
ssize_t ret;
- if (reg->hr_bdev)
+ if (reg->hr_bdev_handle)
return -EINVAL;
ret = kstrtoull(p, 0, &tmp);
@@ -1616,7 +1623,7 @@ static ssize_t o2hb_region_blocks_store(struct config_item *item,
unsigned long tmp;
char *p = (char *)page;
- if (reg->hr_bdev)
+ if (reg->hr_bdev_handle)
return -EINVAL;
tmp = simple_strtoul(p, &p, 0);
@@ -1635,8 +1642,8 @@ static ssize_t o2hb_region_dev_show(struct config_item *item, char *page)
{
unsigned int ret = 0;
- if (to_o2hb_region(item)->hr_bdev)
- ret = sprintf(page, "%pg\n", to_o2hb_region(item)->hr_bdev);
+ if (to_o2hb_region(item)->hr_bdev_handle)
+ ret = sprintf(page, "%pg\n", reg_bdev(to_o2hb_region(item)));
return ret;
}
@@ -1745,7 +1752,10 @@ out:
return ret;
}
-/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
+/*
+ * this is acting as commit; we set up all of hr_bdev_handle and hr_task or
+ * nothing
+ */
static ssize_t o2hb_region_dev_store(struct config_item *item,
const char *page,
size_t count)
@@ -1759,7 +1769,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
ssize_t ret = -EINVAL;
int live_threshold;
- if (reg->hr_bdev)
+ if (reg->hr_bdev_handle)
goto out;
/* We can't heartbeat without having had our node number
@@ -1785,16 +1795,15 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
if (!S_ISBLK(f.file->f_mapping->host->i_mode))
goto out2;
- reg->hr_bdev = blkdev_get_by_dev(f.file->f_mapping->host->i_rdev,
- BLK_OPEN_WRITE | BLK_OPEN_READ, NULL,
- NULL);
- if (IS_ERR(reg->hr_bdev)) {
- ret = PTR_ERR(reg->hr_bdev);
- reg->hr_bdev = NULL;
+ reg->hr_bdev_handle = bdev_open_by_dev(f.file->f_mapping->host->i_rdev,
+ BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL);
+ if (IS_ERR(reg->hr_bdev_handle)) {
+ ret = PTR_ERR(reg->hr_bdev_handle);
+ reg->hr_bdev_handle = NULL;
goto out2;
}
- sectsize = bdev_logical_block_size(reg->hr_bdev);
+ sectsize = bdev_logical_block_size(reg_bdev(reg));
if (sectsize != reg->hr_block_bytes) {
mlog(ML_ERROR,
"blocksize %u incorrect for device, expected %d",
@@ -1890,12 +1899,12 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
if (hb_task && o2hb_global_heartbeat_active())
printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%pg)\n",
- config_item_name(&reg->hr_item), reg->hr_bdev);
+ config_item_name(&reg->hr_item), reg_bdev(reg));
out3:
if (ret < 0) {
- blkdev_put(reg->hr_bdev, NULL);
- reg->hr_bdev = NULL;
+ bdev_release(reg->hr_bdev_handle);
+ reg->hr_bdev_handle = NULL;
}
out2:
fdput(f);
@@ -2085,7 +2094,7 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%pg)\n",
((atomic_read(&reg->hr_steady_iterations) == 0) ?
"stopped" : "start aborted"), config_item_name(item),
- reg->hr_bdev);
+ reg_bdev(reg));
}
/*
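A minimal sketch of the accessor these heartbeat.c hunks rely on, assuming the new o2hb_region field is "struct bdev_handle *hr_bdev_handle" as the bdev_open_by_dev()/bdev_release() pairing suggests; the real reg_bdev() is defined earlier in the patch, outside this excerpt:

static inline struct block_device *reg_bdev(struct o2hb_region *reg)
{
	/* NULL until o2hb_region_dev_store() has opened the device */
	return reg->hr_bdev_handle ? reg->hr_bdev_handle->bdev : NULL;
}

Keeping the handle rather than a bare block_device pointer lets the release path collapse to a single bdev_release() call, which is why every former reg->hr_bdev user above now goes through the accessor.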
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8b123d543e6e..a14c8fee6ee5 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1658,7 +1658,8 @@ int __ocfs2_add_entry(handle_t *handle,
offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
if (ocfs2_dirent_would_fit(de, rec_len)) {
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_current(dir));
retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
if (retval < 0) {
mlog_errno(retval);
@@ -2962,11 +2963,11 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
ocfs2_dinode_new_extent_list(dir, di);
i_size_write(dir, sb->s_blocksize);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
di->i_size = cpu_to_le64(sb->s_blocksize);
- di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(dir).tv_sec);
- di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(dir).tv_nsec);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(dir));
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(dir));
ocfs2_update_inode_fsync_trans(handle, dir, 1);
/*
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 81265123ce6c..85215162c9dd 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -80,8 +80,7 @@ static int param_set_dlmfs_capabilities(const char *val,
static int param_get_dlmfs_capabilities(char *buffer,
const struct kernel_param *kp)
{
- return strlcpy(buffer, DLMFS_CAPABILITIES,
- strlen(DLMFS_CAPABILITIES) + 1);
+ return sysfs_emit(buffer, DLMFS_CAPABILITIES);
}
module_param_call(capabilities, param_set_dlmfs_capabilities,
param_get_dlmfs_capabilities, NULL, 0444);
@@ -337,7 +336,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
if (inode) {
inode->i_ino = get_next_ino();
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inc_nlink(inode);
inode->i_fop = &simple_dir_operations;
@@ -360,7 +359,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
inode->i_ino = get_next_ino();
inode_init_owner(&nop_mnt_idmap, inode, parent, mode);
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
ip = DLMFS_I(inode);
ip->ip_conn = DLMFS_I(parent)->ip_conn;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index c3e2961ee5db..64a6ef638495 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2162,7 +2162,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
- struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 ts;
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
@@ -2183,12 +2183,12 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
lvb->lvb_igid = cpu_to_be32(i_gid_read(inode));
lvb->lvb_imode = cpu_to_be16(inode->i_mode);
lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
- lvb->lvb_iatime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
- lvb->lvb_ictime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&ctime));
- lvb->lvb_imtime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
+ ts = inode_get_atime(inode);
+ lvb->lvb_iatime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts));
+ ts = inode_get_ctime(inode);
+ lvb->lvb_ictime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts));
+ ts = inode_get_mtime(inode);
+ lvb->lvb_imtime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts));
lvb->lvb_iattr = cpu_to_be32(oi->ip_attr);
lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features);
lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);
@@ -2209,7 +2209,7 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
- struct timespec64 ctime;
+ struct timespec64 ts;
mlog_meta_lvb(0, lockres);
@@ -2236,13 +2236,12 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode)
i_gid_write(inode, be32_to_cpu(lvb->lvb_igid));
inode->i_mode = be16_to_cpu(lvb->lvb_imode);
set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
- ocfs2_unpack_timespec(&inode->i_atime,
- be64_to_cpu(lvb->lvb_iatime_packed));
- ocfs2_unpack_timespec(&inode->i_mtime,
- be64_to_cpu(lvb->lvb_imtime_packed));
- ocfs2_unpack_timespec(&ctime,
- be64_to_cpu(lvb->lvb_ictime_packed));
- inode_set_ctime_to_ts(inode, ctime);
+ ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_iatime_packed));
+ inode_set_atime_to_ts(inode, ts);
+ ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_imtime_packed));
+ inode_set_mtime_to_ts(inode, ts);
+ ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_ictime_packed));
+ inode_set_ctime_to_ts(inode, ts);
spin_unlock(&oi->ip_lock);
return 0;
}
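The dlmglue.c hunks, and the ocfs2 conversions that follow, all apply one idiom: direct reads and writes of inode->i_atime/i_mtime become calls to the inode_get_*()/inode_set_*() timestamp accessors already visible in the new lines. A minimal sketch of the resulting shape (the function name is illustrative; the helpers are the ones used above):

static void example_touch_mtime_and_ctime(struct inode *inode)
{
	/* old idiom: inode->i_mtime = inode_set_ctime_current(inode); */
	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
}

inode_set_ctime_current() still returns the timestamp it stored, so the one-liner keeps mtime and ctime identical without touching the struct fields directly.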
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index c45596c25c66..94e2a1244442 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -233,16 +233,18 @@ int ocfs2_should_update_atime(struct inode *inode,
if (vfsmnt->mnt_flags & MNT_RELATIME) {
struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 atime = inode_get_atime(inode);
+ struct timespec64 mtime = inode_get_mtime(inode);
- if ((timespec64_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
- (timespec64_compare(&inode->i_atime, &ctime) <= 0))
+ if ((timespec64_compare(&atime, &mtime) <= 0) ||
+ (timespec64_compare(&atime, &ctime) <= 0))
return 1;
return 0;
}
now = current_time(inode);
- if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
+ if ((now.tv_sec - inode_get_atime_sec(inode) <= osb->s_atime_quantum))
return 0;
else
return 1;
@@ -275,9 +277,9 @@ int ocfs2_update_inode_atime(struct inode *inode,
* have i_rwsem to guard against concurrent changes to other
* inode fields.
*/
- inode->i_atime = current_time(inode);
- di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
- di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+ inode_set_atime_to_ts(inode, current_time(inode));
+ di->i_atime = cpu_to_le64(inode_get_atime_sec(inode));
+ di->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, bh);
@@ -296,7 +298,7 @@ int ocfs2_set_inode_size(handle_t *handle,
i_size_write(inode, new_i_size);
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
if (status < 0) {
@@ -417,12 +419,12 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
}
i_size_write(inode, new_i_size);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
di = (struct ocfs2_dinode *) fe_bh->b_data;
di->i_size = cpu_to_le64(new_i_size);
- di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, fe_bh);
@@ -821,9 +823,9 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
i_size_write(inode, abs_to);
inode->i_blocks = ocfs2_inode_sector_count(inode);
di->i_size = cpu_to_le64((u64)i_size_read(inode));
- inode->i_mtime = inode_set_ctime_current(inode);
- di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
di->i_mtime_nsec = di->i_ctime_nsec;
if (handle) {
ocfs2_journal_dirty(handle, di_bh);
@@ -2040,7 +2042,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
goto out_inode_unlock;
}
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
if (ret < 0)
mlog_errno(ret);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index e8771600b930..999111bfc271 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -302,10 +302,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
inode->i_blocks = ocfs2_inode_sector_count(inode);
inode->i_mapping->a_ops = &ocfs2_aops;
}
- inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
- inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
- inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
- inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
+ inode_set_atime(inode, le64_to_cpu(fe->i_atime),
+ le32_to_cpu(fe->i_atime_nsec));
+ inode_set_mtime(inode, le64_to_cpu(fe->i_mtime),
+ le32_to_cpu(fe->i_mtime_nsec));
inode_set_ctime(inode, le64_to_cpu(fe->i_ctime),
le32_to_cpu(fe->i_ctime_nsec));
@@ -1312,12 +1312,12 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
fe->i_uid = cpu_to_le32(i_uid_read(inode));
fe->i_gid = cpu_to_le32(i_gid_read(inode));
fe->i_mode = cpu_to_le16(inode->i_mode);
- fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
- fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
- fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
- fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
- fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ fe->i_atime = cpu_to_le64(inode_get_atime_sec(inode));
+ fe->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
+ fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+ fe->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode));
+ fe->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
ocfs2_journal_dirty(handle, bh);
ocfs2_update_inode_fsync_trans(handle, inode, 1);
@@ -1348,10 +1348,10 @@ void ocfs2_refresh_inode(struct inode *inode,
inode->i_blocks = 0;
else
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
- inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
- inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
- inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
+ inode_set_atime(inode, le64_to_cpu(fe->i_atime),
+ le32_to_cpu(fe->i_atime_nsec));
+ inode_set_mtime(inode, le64_to_cpu(fe->i_mtime),
+ le32_to_cpu(fe->i_mtime_nsec));
inode_set_ctime(inode, le64_to_cpu(fe->i_ctime),
le32_to_cpu(fe->i_ctime_nsec));
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index ce215565d061..604fea3a26ff 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -90,7 +90,7 @@ enum ocfs2_replay_state {
struct ocfs2_replay_map {
unsigned int rm_slots;
enum ocfs2_replay_state rm_state;
- unsigned char rm_replay_slots[];
+ unsigned char rm_replay_slots[] __counted_by(rm_slots);
};
static void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
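__counted_by(rm_slots) ties the flexible array to its length field, so fortified accesses to rm_replay_slots are bounds-checked against rm_slots and the counter has to be populated no later than the first array access. A minimal allocation sketch under that assumption, following the usual struct_size() pattern (num_slots and the error handling are illustrative):

	struct ocfs2_replay_map *map;

	map = kzalloc(struct_size(map, rm_replay_slots, num_slots), GFP_KERNEL);
	if (!map)
		return -ENOMEM;
	map->rm_slots = num_slots;	/* set the counter before using the array */

The same reasoning applies to the si_slots annotation added to fs/ocfs2/slot_map.c further down.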
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 05d67968a3a9..1f9ed117e78b 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -951,8 +951,8 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
di = (struct ocfs2_dinode *)di_bh->b_data;
inode_set_ctime_current(inode);
- di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 5cd6d7771cea..814733ba2f4b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -795,8 +795,8 @@ static int ocfs2_link(struct dentry *old_dentry,
inc_nlink(inode);
inode_set_ctime_current(inode);
ocfs2_set_links_count(fe, inode->i_nlink);
- fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_journal_dirty(handle, fe_bh);
err = ocfs2_add_entry(handle, dentry, inode,
@@ -995,7 +995,7 @@ static int ocfs2_unlink(struct inode *dir,
ocfs2_set_links_count(fe, inode->i_nlink);
ocfs2_journal_dirty(handle, fe_bh);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
if (S_ISDIR(inode->i_mode))
drop_nlink(dir);
@@ -1550,8 +1550,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
if (status >= 0) {
old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
- old_di->i_ctime = cpu_to_le64(inode_get_ctime(old_inode).tv_sec);
- old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(old_inode).tv_nsec);
+ old_di->i_ctime = cpu_to_le64(inode_get_ctime_sec(old_inode));
+ old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(old_inode));
ocfs2_journal_dirty(handle, old_inode_bh);
} else
mlog_errno(status);
@@ -1592,11 +1592,15 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
drop_nlink(new_inode);
inode_set_ctime_current(new_inode);
}
- old_dir->i_mtime = inode_set_ctime_current(old_dir);
+ inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
if (update_dot_dot) {
status = ocfs2_update_entry(old_inode, handle,
&old_inode_dot_dot_res, new_dir);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
drop_nlink(old_dir);
if (new_inode) {
drop_nlink(new_inode);
@@ -1614,8 +1618,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
if (old_dir != new_dir) {
/* Keep the same times on both directories.*/
- new_dir->i_mtime = inode_set_ctime_to_ts(new_dir,
- inode_get_ctime(old_dir));
+ inode_set_mtime_to_ts(new_dir,
+ inode_set_ctime_to_ts(new_dir, inode_get_ctime(old_dir)));
/*
* This will also pick up the i_nlink change from the
@@ -1636,6 +1640,10 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
INODE_CACHE(old_dir),
old_dir_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
ocfs2_set_links_count(fe, old_dir->i_nlink);
ocfs2_journal_dirty(handle, old_dir_bh);
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index dfaae1e52412..e09842fc9d4d 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -1240,6 +1240,10 @@ int ocfs2_create_local_dquot(struct dquot *dquot)
&od->dq_local_phys_blk,
&pcount,
NULL);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
/* Initialize dquot structure on disk */
status = ocfs2_local_write_dquot(dquot);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 25c8ec3c8c3a..3f80a56d0d60 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3751,8 +3751,8 @@ static int ocfs2_change_ctime(struct inode *inode,
}
inode_set_ctime_current(inode);
- di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_journal_dirty(handle, di_bh);
@@ -4075,10 +4075,10 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
*/
inode_set_ctime_current(t_inode);
- di->i_ctime = cpu_to_le64(inode_get_ctime(t_inode).tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(t_inode).tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime_sec(t_inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(t_inode));
- t_inode->i_mtime = s_inode->i_mtime;
+ inode_set_mtime_to_ts(t_inode, inode_get_mtime(s_inode));
di->i_mtime = s_di->i_mtime;
di->i_mtime_nsec = s_di->i_mtime_nsec;
}
@@ -4456,7 +4456,7 @@ int ocfs2_reflink_update_dest(struct inode *dest,
if (newlen > i_size_read(dest))
i_size_write(dest, newlen);
spin_unlock(&OCFS2_I(dest)->ip_lock);
- dest->i_mtime = inode_set_ctime_current(dest);
+ inode_set_mtime_to_ts(dest, inode_set_ctime_current(dest));
ret = ocfs2_mark_inode_dirty(handle, dest, d_bh);
if (ret) {
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index da7718cef735..e544c704b583 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -37,7 +37,7 @@ struct ocfs2_slot_info {
unsigned int si_blocks;
struct buffer_head **si_bh;
unsigned int si_num_slots;
- struct ocfs2_slot si_slots[];
+ struct ocfs2_slot si_slots[] __counted_by(si_num_slots);
};
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 6510ad783c91..3b81213ed7b8 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -87,14 +87,14 @@ static struct ocfs2_xattr_def_value_root def_xv = {
.xv.xr_list.l_count = cpu_to_le16(1),
};
-const struct xattr_handler *ocfs2_xattr_handlers[] = {
+const struct xattr_handler * const ocfs2_xattr_handlers[] = {
&ocfs2_xattr_user_handler,
&ocfs2_xattr_trusted_handler,
&ocfs2_xattr_security_handler,
NULL
};
-static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
+static const struct xattr_handler * const ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
[OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
[OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
[OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default,
@@ -3422,8 +3422,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
}
inode_set_ctime_current(inode);
- di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_journal_dirty(ctxt->handle, xis->inode_bh);
}
out:
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 00308b57f64f..65e9aa743919 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -30,7 +30,7 @@ struct ocfs2_security_xattr_info {
extern const struct xattr_handler ocfs2_xattr_user_handler;
extern const struct xattr_handler ocfs2_xattr_trusted_handler;
extern const struct xattr_handler ocfs2_xattr_security_handler;
-extern const struct xattr_handler *ocfs2_xattr_handlers[];
+extern const struct xattr_handler * const ocfs2_xattr_handlers[];
ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 2f8c1882f45c..d6cd81163030 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -51,7 +51,7 @@ struct inode *omfs_new_inode(struct inode *dir, umode_t mode)
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
inode->i_mapping->a_ops = &omfs_aops;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
switch (mode & S_IFMT) {
case S_IFDIR:
inode->i_op = &omfs_dir_inops;
@@ -134,8 +134,8 @@ static int __omfs_write_inode(struct inode *inode, int wait)
oi->i_head.h_magic = OMFS_IMAGIC;
oi->i_size = cpu_to_be64(inode->i_size);
- ctime = inode_get_ctime(inode).tv_sec * 1000LL +
- ((inode_get_ctime(inode).tv_nsec + 999)/1000);
+ ctime = inode_get_ctime_sec(inode) * 1000LL +
+ ((inode_get_ctime_nsec(inode) + 999)/1000);
oi->i_ctime = cpu_to_be64(ctime);
omfs_update_checksums(oi);
@@ -230,11 +230,9 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino)
ctime = be64_to_cpu(oi->i_ctime);
nsecs = do_div(ctime, 1000) * 1000L;
- inode->i_atime.tv_sec = ctime;
- inode->i_mtime.tv_sec = ctime;
+ inode_set_atime(inode, ctime, nsecs);
+ inode_set_mtime(inode, ctime, nsecs);
inode_set_ctime(inode, ctime, nsecs);
- inode->i_atime.tv_nsec = nsecs;
- inode->i_mtime.tv_nsec = nsecs;
inode->i_mapping->a_ops = &omfs_aops;
diff --git a/fs/open.c b/fs/open.c
index 98f6601fbac6..3494a9cd8046 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -870,6 +870,30 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
return ksys_fchown(fd, user, group);
}
+static inline int file_get_write_access(struct file *f)
+{
+ int error;
+
+ error = get_write_access(f->f_inode);
+ if (unlikely(error))
+ return error;
+ error = mnt_get_write_access(f->f_path.mnt);
+ if (unlikely(error))
+ goto cleanup_inode;
+ if (unlikely(f->f_mode & FMODE_BACKING)) {
+ error = mnt_get_write_access(backing_file_user_path(f)->mnt);
+ if (unlikely(error))
+ goto cleanup_mnt;
+ }
+ return 0;
+
+cleanup_mnt:
+ mnt_put_write_access(f->f_path.mnt);
+cleanup_inode:
+ put_write_access(f->f_inode);
+ return error;
+}
+
static int do_dentry_open(struct file *f,
struct inode *inode,
int (*open)(struct inode *, struct file *))
@@ -892,14 +916,9 @@ static int do_dentry_open(struct file *f,
if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
i_readcount_inc(inode);
} else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
- error = get_write_access(inode);
+ error = file_get_write_access(f);
if (unlikely(error))
goto cleanup_file;
- error = __mnt_want_write(f->f_path.mnt);
- if (unlikely(error)) {
- put_write_access(inode);
- goto cleanup_file;
- }
f->f_mode |= FMODE_WRITER;
}
@@ -1069,8 +1088,6 @@ struct file *dentry_open(const struct path *path, int flags,
int error;
struct file *f;
- validate_creds(cred);
-
/* We must always pass in a valid mount pointer. */
BUG_ON(!path->mnt);
@@ -1109,7 +1126,6 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode,
struct file *f;
int error;
- validate_creds(cred);
f = alloc_empty_file(flags, cred);
if (IS_ERR(f))
return f;
@@ -1163,20 +1179,19 @@ EXPORT_SYMBOL_GPL(kernel_file_open);
/**
* backing_file_open - open a backing file for kernel internal use
- * @path: path of the file to open
+ * @user_path: path that the user requested to open
* @flags: open flags
* @real_path: path of the backing file
* @cred: credentials for open
*
* Open a backing file for a stackable filesystem (e.g., overlayfs).
- * @path may be on the stackable filesystem and backing inode on the
- * underlying filesystem. In this case, we want to be able to return
- * the @real_path of the backing inode. This is done by embedding the
- * returned file into a container structure that also stores the path of
- * the backing inode on the underlying filesystem, which can be
- * retrieved using backing_file_real_path().
+ * @user_path may be on the stackable filesystem and @real_path on the
+ * underlying filesystem. In this case, we want to be able to return the
+ * @user_path of the stackable filesystem. This is done by embedding the
+ * returned file into a container structure that also stores the stacked
+ * file's path, which can be retrieved using backing_file_user_path().
*/
-struct file *backing_file_open(const struct path *path, int flags,
+struct file *backing_file_open(const struct path *user_path, int flags,
const struct path *real_path,
const struct cred *cred)
{
@@ -1187,9 +1202,9 @@ struct file *backing_file_open(const struct path *path, int flags,
if (IS_ERR(f))
return f;
- f->f_path = *path;
- path_get(real_path);
- *backing_file_real_path(f) = *real_path;
+ path_get(user_path);
+ *backing_file_user_path(f) = *user_path;
+ f->f_path = *real_path;
error = do_dentry_open(f, d_inode(real_path->dentry), NULL);
if (error) {
fput(f);
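After this change a backing file (FMODE_BACKING) keeps the underlying path in f->f_path and the path the user actually opened in the side container, which is what lets file_get_write_access() above take write access on the user-visible mount as well. A minimal sketch of how a caller would recover the user-facing path, using only the helpers shown in these hunks (the function name is illustrative):

static struct path *example_user_visible_path(struct file *f)
{
	if (unlikely(f->f_mode & FMODE_BACKING))
		return backing_file_user_path(f);
	return &f->f_path;
}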
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index b2457cb97fa0..c4b65a6d41cc 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -237,7 +237,7 @@ found:
if (IS_ERR(inode))
return ERR_CAST(inode);
if (inode->i_state & I_NEW) {
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
ent_oi = OP_I(inode);
ent_oi->type = ent_type;
ent_oi->u = ent_data;
@@ -387,7 +387,7 @@ static int openprom_fill_super(struct super_block *s, struct fs_context *fc)
goto out_no_root;
}
- root_inode->i_mtime = root_inode->i_atime = inode_set_ctime_current(root_inode);
+ simple_inode_init_ts(root_inode);
root_inode->i_op = &openprom_inode_operations;
root_inode->i_fop = &openprom_operations;
root_inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index b711654ca18a..926d9c0a428a 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -103,7 +103,7 @@ enum orangefs_vfs_op_states {
#define ORANGEFS_CACHE_CREATE_FLAGS 0
#endif
-extern const struct xattr_handler *orangefs_xattr_handlers[];
+extern const struct xattr_handler * const orangefs_xattr_handlers[];
extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu);
extern int orangefs_set_acl(struct mnt_idmap *idmap,
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 0a9fcfdf552f..0fdceb00ca07 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -155,14 +155,14 @@ static inline void copy_attributes_from_inode(struct inode *inode,
if (orangefs_inode->attr_valid & ATTR_ATIME) {
attrs->mask |= ORANGEFS_ATTR_SYS_ATIME;
if (orangefs_inode->attr_valid & ATTR_ATIME_SET) {
- attrs->atime = (time64_t)inode->i_atime.tv_sec;
+ attrs->atime = (time64_t) inode_get_atime_sec(inode);
attrs->mask |= ORANGEFS_ATTR_SYS_ATIME_SET;
}
}
if (orangefs_inode->attr_valid & ATTR_MTIME) {
attrs->mask |= ORANGEFS_ATTR_SYS_MTIME;
if (orangefs_inode->attr_valid & ATTR_MTIME_SET) {
- attrs->mtime = (time64_t)inode->i_mtime.tv_sec;
+ attrs->mtime = (time64_t) inode_get_mtime_sec(inode);
attrs->mask |= ORANGEFS_ATTR_SYS_MTIME_SET;
}
}
@@ -357,15 +357,15 @@ again2:
downcall.resp.getattr.attributes.owner);
inode->i_gid = make_kgid(&init_user_ns, new_op->
downcall.resp.getattr.attributes.group);
- inode->i_atime.tv_sec = (time64_t)new_op->
- downcall.resp.getattr.attributes.atime;
- inode->i_mtime.tv_sec = (time64_t)new_op->
- downcall.resp.getattr.attributes.mtime;
+ inode_set_atime(inode,
+ (time64_t)new_op->downcall.resp.getattr.attributes.atime,
+ 0);
+ inode_set_mtime(inode,
+ (time64_t)new_op->downcall.resp.getattr.attributes.mtime,
+ 0);
inode_set_ctime(inode,
(time64_t)new_op->downcall.resp.getattr.attributes.ctime,
0);
- inode->i_atime.tv_nsec = 0;
- inode->i_mtime.tv_nsec = 0;
/* special case: mark the root inode as sticky */
inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) |
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index 68b62689a63e..74ef75586f38 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -554,7 +554,7 @@ static const struct xattr_handler orangefs_xattr_default_handler = {
.set = orangefs_xattr_set_default,
};
-const struct xattr_handler *orangefs_xattr_handlers[] = {
+const struct xattr_handler * const orangefs_xattr_handlers[] = {
&orangefs_xattr_default_handler,
NULL
};
diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile
index 4e173d56b11f..5648954f8588 100644
--- a/fs/overlayfs/Makefile
+++ b/fs/overlayfs/Makefile
@@ -6,4 +6,4 @@
obj-$(CONFIG_OVERLAY_FS) += overlay.o
overlay-objs := super.o namei.o util.o inode.o file.o dir.o readdir.o \
- copy_up.o export.o params.o
+ copy_up.o export.o params.o xattrs.o
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index ada3fcc9c6d5..8bea66c97316 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -252,7 +252,9 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
return PTR_ERR(old_file);
/* Try to use clone_file_range to clone up within the same fs */
+ ovl_start_write(dentry);
cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
+ ovl_end_write(dentry);
if (cloned == len)
goto out_fput;
/* Couldn't clone, so now we try to copy the data */
@@ -287,8 +289,12 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
* it may not recognize all kind of holes and sometimes
* only skips partial of hole area. However, it will be
* enough for most of the use cases.
+ *
+ * We do not hold upper sb_writers throughout the loop to avert
+ * lockdep warning with llseek of lower file in nested overlay:
+ * - upper sb_writers
+ * -- lower ovl_inode_lock (ovl_llseek)
*/
-
if (skip_hole && data_pos < old_pos) {
data_pos = vfs_llseek(old_file, old_pos, SEEK_DATA);
if (data_pos > old_pos) {
@@ -303,9 +309,11 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
}
}
+ ovl_start_write(dentry);
bytes = do_splice_direct(old_file, &old_pos,
new_file, &new_pos,
this_len, SPLICE_F_MOVE);
+ ovl_end_write(dentry);
if (bytes <= 0) {
error = bytes;
break;
@@ -426,29 +434,29 @@ out_err:
return ERR_PTR(err);
}
-int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower,
- struct dentry *upper)
+struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin)
{
- const struct ovl_fh *fh = NULL;
- int err;
-
/*
* When lower layer doesn't support export operations store a 'null' fh,
 * so we can use the overlay.origin xattr to distinguish between a copy
* up and a pure upper inode.
*/
- if (ovl_can_decode_fh(lower->d_sb)) {
- fh = ovl_encode_real_fh(ofs, lower, false);
- if (IS_ERR(fh))
- return PTR_ERR(fh);
- }
+ if (!ovl_can_decode_fh(origin->d_sb))
+ return NULL;
+
+ return ovl_encode_real_fh(ofs, origin, false);
+}
+
+int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh,
+ struct dentry *upper)
+{
+ int err;
/*
* Do not fail when upper doesn't support xattrs.
*/
err = ovl_check_setxattr(ofs, upper, OVL_XATTR_ORIGIN, fh->buf,
fh ? fh->fb.len : 0, 0);
- kfree(fh);
/* Ignore -EPERM from setting "user.*" on symlink/special */
return err == -EPERM ? 0 : err;
@@ -476,7 +484,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper,
*
* Caller must hold i_mutex on indexdir.
*/
-static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
+static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh,
struct dentry *upper)
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
@@ -502,7 +510,7 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
if (WARN_ON(ovl_test_flag(OVL_INDEX, d_inode(dentry))))
return -EIO;
- err = ovl_get_index_name(ofs, origin, &name);
+ err = ovl_get_index_name_fh(fh, &name);
if (err)
return err;
@@ -541,6 +549,7 @@ struct ovl_copy_up_ctx {
struct dentry *destdir;
struct qstr destname;
struct dentry *workdir;
+ const struct ovl_fh *origin_fh;
bool origin;
bool indexed;
bool metacopy;
@@ -555,14 +564,16 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
struct inode *udir = d_inode(upperdir);
+ ovl_start_write(c->dentry);
+
/* Mark parent "impure" because it may now contain non-pure upper */
err = ovl_set_impure(c->parent, upperdir);
if (err)
- return err;
+ goto out;
err = ovl_set_nlink_lower(c->dentry);
if (err)
- return err;
+ goto out;
inode_lock_nested(udir, I_MUTEX_PARENT);
upper = ovl_lookup_upper(ofs, c->dentry->d_name.name, upperdir,
@@ -581,10 +592,12 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
}
inode_unlock(udir);
if (err)
- return err;
+ goto out;
err = ovl_set_nlink_upper(c->dentry);
+out:
+ ovl_end_write(c->dentry);
return err;
}
@@ -637,7 +650,7 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp)
* hard link.
*/
if (c->origin) {
- err = ovl_set_origin(ofs, c->lowerpath.dentry, temp);
+ err = ovl_set_origin_fh(ofs, c->origin_fh, temp);
if (err)
return err;
}
@@ -719,21 +732,19 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
.link = c->link
};
- /* workdir and destdir could be the same when copying up to indexdir */
- err = -EIO;
- if (lock_rename(c->workdir, c->destdir) != NULL)
- goto unlock;
-
err = ovl_prep_cu_creds(c->dentry, &cc);
if (err)
- goto unlock;
+ return err;
+ ovl_start_write(c->dentry);
+ inode_lock(wdir);
temp = ovl_create_temp(ofs, c->workdir, &cattr);
+ inode_unlock(wdir);
+ ovl_end_write(c->dentry);
ovl_revert_cu_creds(&cc);
- err = PTR_ERR(temp);
if (IS_ERR(temp))
- goto unlock;
+ return PTR_ERR(temp);
/*
* Copy up data first and then xattrs. Writing data after
@@ -741,15 +752,29 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
*/
path.dentry = temp;
err = ovl_copy_up_data(c, &path);
- if (err)
+ /*
+ * We cannot hold lock_rename() throughout this helper, because of
+ * lock ordering with sb_writers, which shouldn't be held when calling
+ * ovl_copy_up_data(), so lock workdir and destdir and make sure that
+ * temp wasn't moved before copy up completion or cleanup.
+ */
+ ovl_start_write(c->dentry);
+ if (lock_rename(c->workdir, c->destdir) != NULL ||
+ temp->d_parent != c->workdir) {
+ /* temp or workdir moved underneath us? abort without cleanup */
+ dput(temp);
+ err = -EIO;
+ goto unlock;
+ } else if (err) {
goto cleanup;
+ }
err = ovl_copy_up_metadata(c, temp);
if (err)
goto cleanup;
if (S_ISDIR(c->stat.mode) && c->indexed) {
- err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp);
+ err = ovl_create_index(c->dentry, c->origin_fh, temp);
if (err)
goto cleanup;
}
@@ -779,6 +804,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
ovl_set_flag(OVL_WHITEOUTS, inode);
unlock:
unlock_rename(c->workdir, c->destdir);
+ ovl_end_write(c->dentry);
return err;
@@ -802,9 +828,10 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
if (err)
return err;
+ ovl_start_write(c->dentry);
tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode);
+ ovl_end_write(c->dentry);
ovl_revert_cu_creds(&cc);
-
if (IS_ERR(tmpfile))
return PTR_ERR(tmpfile);
@@ -815,9 +842,11 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
goto out_fput;
}
+ ovl_start_write(c->dentry);
+
err = ovl_copy_up_metadata(c, temp);
if (err)
- goto out_fput;
+ goto out;
inode_lock_nested(udir, I_MUTEX_PARENT);
@@ -831,7 +860,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
inode_unlock(udir);
if (err)
- goto out_fput;
+ goto out;
if (c->metacopy_digest)
ovl_set_flag(OVL_HAS_DIGEST, d_inode(c->dentry));
@@ -843,6 +872,8 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
ovl_set_upperdata(d_inode(c->dentry));
ovl_inode_update(d_inode(c->dentry), dget(temp));
+out:
+ ovl_end_write(c->dentry);
out_fput:
fput(tmpfile);
return err;
@@ -861,6 +892,8 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
{
int err;
struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
+ struct dentry *origin = c->lowerpath.dentry;
+ struct ovl_fh *fh = NULL;
bool to_index = false;
/*
@@ -877,25 +910,35 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
to_index = true;
}
- if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index)
+ if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index) {
+ fh = ovl_get_origin_fh(ofs, origin);
+ if (IS_ERR(fh))
+ return PTR_ERR(fh);
+
+ /* origin_fh may be NULL */
+ c->origin_fh = fh;
c->origin = true;
+ }
if (to_index) {
c->destdir = ovl_indexdir(c->dentry->d_sb);
- err = ovl_get_index_name(ofs, c->lowerpath.dentry, &c->destname);
+ err = ovl_get_index_name(ofs, origin, &c->destname);
if (err)
- return err;
+ goto out_free_fh;
} else if (WARN_ON(!c->parent)) {
/* Disconnected dentry must be copied up to index dir */
- return -EIO;
+ err = -EIO;
+ goto out_free_fh;
} else {
/*
* Mark parent "impure" because it may now contain non-pure
* upper
*/
+ ovl_start_write(c->dentry);
err = ovl_set_impure(c->parent, c->destdir);
+ ovl_end_write(c->dentry);
if (err)
- return err;
+ goto out_free_fh;
}
/* Should we copyup with O_TMPFILE or with workdir? */
@@ -909,6 +952,7 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
if (c->indexed)
ovl_set_flag(OVL_INDEX, d_inode(c->dentry));
+ ovl_start_write(c->dentry);
if (to_index) {
/* Initialize nlink for copy up of disconnected dentry */
err = ovl_set_nlink_upper(c->dentry);
@@ -923,10 +967,13 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
ovl_dentry_set_upper_alias(c->dentry);
ovl_dentry_update_reval(c->dentry, ovl_dentry_upper(c->dentry));
}
+ ovl_end_write(c->dentry);
out:
if (to_index)
kfree(c->destname.name);
+out_free_fh:
+ kfree(fh);
return err;
}
@@ -1011,15 +1058,16 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
* Writing to upper file will clear security.capability xattr. We
* don't want that to happen for normal copy-up operation.
*/
+ ovl_start_write(c->dentry);
if (capability) {
err = ovl_do_setxattr(ofs, upperpath.dentry, XATTR_NAME_CAPS,
capability, cap_size, 0);
- if (err)
- goto out_free;
}
-
-
- err = ovl_removexattr(ofs, upperpath.dentry, OVL_XATTR_METACOPY);
+ if (!err) {
+ err = ovl_removexattr(ofs, upperpath.dentry,
+ OVL_XATTR_METACOPY);
+ }
+ ovl_end_write(c->dentry);
if (err)
goto out_free;
@@ -1170,17 +1218,10 @@ static bool ovl_open_need_copy_up(struct dentry *dentry, int flags)
int ovl_maybe_copy_up(struct dentry *dentry, int flags)
{
- int err = 0;
-
- if (ovl_open_need_copy_up(dentry, flags)) {
- err = ovl_want_write(dentry);
- if (!err) {
- err = ovl_copy_up_flags(dentry, flags);
- ovl_drop_write(dentry);
- }
- }
+ if (!ovl_open_need_copy_up(dentry, flags))
+ return 0;
- return err;
+ return ovl_copy_up_flags(dentry, flags);
}
int ovl_copy_up_with_data(struct dentry *dentry)
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 033fc0458a3d..aab3f5d93556 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -477,7 +477,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
goto out_unlock;
err = -ESTALE;
- if (d_is_negative(upper) || !IS_WHITEOUT(d_inode(upper)))
+ if (d_is_negative(upper) || !ovl_upper_is_whiteout(ofs, upper))
goto out_dput;
newdentry = ovl_create_temp(ofs, workdir, cattr);
@@ -559,10 +559,6 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
struct cred *override_cred;
struct dentry *parent = dentry->d_parent;
- err = ovl_copy_up(parent);
- if (err)
- return err;
-
old_cred = ovl_override_creds(dentry->d_sb);
/*
@@ -626,6 +622,10 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
.link = link,
};
+ err = ovl_copy_up(dentry->d_parent);
+ if (err)
+ return err;
+
err = ovl_want_write(dentry);
if (err)
goto out;
@@ -700,28 +700,24 @@ static int ovl_link(struct dentry *old, struct inode *newdir,
int err;
struct inode *inode;
- err = ovl_want_write(old);
+ err = ovl_copy_up(old);
if (err)
goto out;
- err = ovl_copy_up(old);
+ err = ovl_copy_up(new->d_parent);
if (err)
- goto out_drop_write;
+ goto out;
- err = ovl_copy_up(new->d_parent);
+ err = ovl_nlink_start(old);
if (err)
- goto out_drop_write;
+ goto out;
if (ovl_is_metacopy_dentry(old)) {
err = ovl_set_link_redirect(old);
if (err)
- goto out_drop_write;
+ goto out_nlink_end;
}
- err = ovl_nlink_start(old);
- if (err)
- goto out_drop_write;
-
inode = d_inode(old);
ihold(inode);
@@ -731,9 +727,8 @@ static int ovl_link(struct dentry *old, struct inode *newdir,
if (err)
iput(inode);
+out_nlink_end:
ovl_nlink_end(old);
-out_drop_write:
- ovl_drop_write(old);
out:
return err;
}
@@ -891,17 +886,13 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
goto out;
}
- err = ovl_want_write(dentry);
- if (err)
- goto out;
-
err = ovl_copy_up(dentry->d_parent);
if (err)
- goto out_drop_write;
+ goto out;
err = ovl_nlink_start(dentry);
if (err)
- goto out_drop_write;
+ goto out;
old_cred = ovl_override_creds(dentry->d_sb);
if (!lower_positive)
@@ -926,8 +917,6 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
if (ovl_dentry_upper(dentry))
ovl_copyattr(d_inode(dentry));
-out_drop_write:
- ovl_drop_write(dentry);
out:
ovl_cache_free(&list);
return err;
@@ -1131,29 +1120,32 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
}
}
- err = ovl_want_write(old);
- if (err)
- goto out;
-
err = ovl_copy_up(old);
if (err)
- goto out_drop_write;
+ goto out;
err = ovl_copy_up(new->d_parent);
if (err)
- goto out_drop_write;
+ goto out;
if (!overwrite) {
err = ovl_copy_up(new);
if (err)
- goto out_drop_write;
+ goto out;
} else if (d_inode(new)) {
err = ovl_nlink_start(new);
if (err)
- goto out_drop_write;
+ goto out;
update_nlink = true;
}
+ if (!update_nlink) {
+ /* ovl_nlink_start() took ovl_want_write() */
+ err = ovl_want_write(old);
+ if (err)
+ goto out;
+ }
+
old_cred = ovl_override_creds(old->d_sb);
if (!list_empty(&list)) {
@@ -1219,7 +1211,7 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
}
} else {
if (!d_is_negative(newdentry)) {
- if (!new_opaque || !ovl_is_whiteout(newdentry))
+ if (!new_opaque || !ovl_upper_is_whiteout(ofs, newdentry))
goto out_dput;
} else {
if (flags & RENAME_EXCHANGE)
@@ -1286,8 +1278,8 @@ out_revert_creds:
revert_creds(old_cred);
if (update_nlink)
ovl_nlink_end(new);
-out_drop_write:
- ovl_drop_write(old);
+ else
+ ovl_drop_write(old);
out:
dput(opaquedir);
ovl_cache_free(&list);
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 26b782c53910..7e16bbcad95e 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -23,12 +23,7 @@ static int ovl_encode_maybe_copy_up(struct dentry *dentry)
if (ovl_dentry_upper(dentry))
return 0;
- err = ovl_want_write(dentry);
- if (!err) {
- err = ovl_copy_up(dentry);
- ovl_drop_write(dentry);
- }
-
+ err = ovl_copy_up(dentry);
if (err) {
pr_warn_ratelimited("failed to copy up on encode (%pd2, err=%i)\n",
dentry, err);
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 8be4dc050d1e..131621daeb13 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -15,10 +15,15 @@
#include <linux/fs.h>
#include "overlayfs.h"
+#include "../internal.h" /* for sb_init_dio_done_wq */
+
struct ovl_aio_req {
struct kiocb iocb;
refcount_t ref;
struct kiocb *orig_iocb;
+ /* used for aio completion */
+ struct work_struct work;
+ long res;
};
static struct kmem_cache *ovl_aio_request_cachep;
@@ -235,10 +240,17 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
return ret;
}
+static void ovl_file_modified(struct file *file)
+{
+ /* Update size/mtime */
+ ovl_copyattr(file_inode(file));
+}
+
static void ovl_file_accessed(struct file *file)
{
struct inode *inode, *upperinode;
struct timespec64 ctime, uctime;
+ struct timespec64 mtime, umtime;
if (file->f_flags & O_NOATIME)
return;
@@ -251,29 +263,23 @@ static void ovl_file_accessed(struct file *file)
ctime = inode_get_ctime(inode);
uctime = inode_get_ctime(upperinode);
- if ((!timespec64_equal(&inode->i_mtime, &upperinode->i_mtime) ||
- !timespec64_equal(&ctime, &uctime))) {
- inode->i_mtime = upperinode->i_mtime;
+ mtime = inode_get_mtime(inode);
+ umtime = inode_get_mtime(upperinode);
+ if ((!timespec64_equal(&mtime, &umtime)) ||
+ !timespec64_equal(&ctime, &uctime)) {
+ inode_set_mtime_to_ts(inode, inode_get_mtime(upperinode));
inode_set_ctime_to_ts(inode, uctime);
}
touch_atime(&file->f_path);
}
-static rwf_t ovl_iocb_to_rwf(int ifl)
+#define OVL_IOCB_MASK \
+ (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND)
+
+static rwf_t iocb_to_rw_flags(int flags)
{
- rwf_t flags = 0;
-
- if (ifl & IOCB_NOWAIT)
- flags |= RWF_NOWAIT;
- if (ifl & IOCB_HIPRI)
- flags |= RWF_HIPRI;
- if (ifl & IOCB_DSYNC)
- flags |= RWF_DSYNC;
- if (ifl & IOCB_SYNC)
- flags |= RWF_SYNC;
-
- return flags;
+ return (__force rwf_t)(flags & OVL_IOCB_MASK);
}
static inline void ovl_aio_put(struct ovl_aio_req *aio_req)
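iocb_to_rw_flags() masks the kiocb flags and casts the result straight to rwf_t; that presumably works because, in this kernel series, IOCB_NOWAIT/HIPRI/DSYNC/SYNC/APPEND are defined from the corresponding RWF_* values, so no per-flag translation is needed. A hedged sketch of a build-time check documenting that assumption (not something the patch adds):

static_assert(IOCB_NOWAIT == (__force int) RWF_NOWAIT);
static_assert(IOCB_HIPRI  == (__force int) RWF_HIPRI);
static_assert(IOCB_DSYNC  == (__force int) RWF_DSYNC);
static_assert(IOCB_SYNC   == (__force int) RWF_SYNC);
static_assert(IOCB_APPEND == (__force int) RWF_APPEND);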
@@ -290,10 +296,8 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
struct kiocb *orig_iocb = aio_req->orig_iocb;
if (iocb->ki_flags & IOCB_WRITE) {
- struct inode *inode = file_inode(orig_iocb->ki_filp);
-
kiocb_end_write(iocb);
- ovl_copyattr(inode);
+ ovl_file_modified(orig_iocb->ki_filp);
}
orig_iocb->ki_pos = iocb->ki_pos;
@@ -310,6 +314,37 @@ static void ovl_aio_rw_complete(struct kiocb *iocb, long res)
orig_iocb->ki_complete(orig_iocb, res);
}
+static void ovl_aio_complete_work(struct work_struct *work)
+{
+ struct ovl_aio_req *aio_req = container_of(work,
+ struct ovl_aio_req, work);
+
+ ovl_aio_rw_complete(&aio_req->iocb, aio_req->res);
+}
+
+static void ovl_aio_queue_completion(struct kiocb *iocb, long res)
+{
+ struct ovl_aio_req *aio_req = container_of(iocb,
+ struct ovl_aio_req, iocb);
+ struct kiocb *orig_iocb = aio_req->orig_iocb;
+
+ /*
+ * Punt to a work queue to serialize updates of mtime/size.
+ */
+ aio_req->res = res;
+ INIT_WORK(&aio_req->work, ovl_aio_complete_work);
+ queue_work(file_inode(orig_iocb->ki_filp)->i_sb->s_dio_done_wq,
+ &aio_req->work);
+}
+
+static int ovl_init_aio_done_wq(struct super_block *sb)
+{
+ if (sb->s_dio_done_wq)
+ return 0;
+
+ return sb_init_dio_done_wq(sb);
+}
+
static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
@@ -331,8 +366,9 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
old_cred = ovl_override_creds(file_inode(file)->i_sb);
if (is_sync_kiocb(iocb)) {
- ret = vfs_iter_read(real.file, iter, &iocb->ki_pos,
- ovl_iocb_to_rwf(iocb->ki_flags));
+ rwf_t rwf = iocb_to_rw_flags(iocb->ki_flags);
+
+ ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, rwf);
} else {
struct ovl_aio_req *aio_req;
@@ -398,15 +434,20 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
old_cred = ovl_override_creds(file_inode(file)->i_sb);
if (is_sync_kiocb(iocb)) {
+ rwf_t rwf = iocb_to_rw_flags(ifl);
+
file_start_write(real.file);
- ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
- ovl_iocb_to_rwf(ifl));
+ ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, rwf);
file_end_write(real.file);
/* Update size */
- ovl_copyattr(inode);
+ ovl_file_modified(file);
} else {
struct ovl_aio_req *aio_req;
+ ret = ovl_init_aio_done_wq(inode->i_sb);
+ if (ret)
+ goto out;
+
ret = -ENOMEM;
aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
if (!aio_req)
@@ -415,7 +456,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
aio_req->orig_iocb = iocb;
kiocb_clone(&aio_req->iocb, iocb, get_file(real.file));
aio_req->iocb.ki_flags = ifl;
- aio_req->iocb.ki_complete = ovl_aio_rw_complete;
+ aio_req->iocb.ki_complete = ovl_aio_queue_completion;
refcount_set(&aio_req->ref, 2);
kiocb_start_write(&aio_req->iocb);
ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
@@ -489,7 +530,7 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
file_end_write(real.file);
/* Update size */
- ovl_copyattr(inode);
+ ovl_file_modified(out);
revert_creds(old_cred);
fdput(real);
@@ -570,7 +611,7 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
revert_creds(old_cred);
/* Update size */
- ovl_copyattr(inode);
+ ovl_file_modified(file);
fdput(real);
@@ -654,7 +695,7 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
revert_creds(old_cred);
/* Update size */
- ovl_copyattr(inode_out);
+ ovl_file_modified(file_out);
fdput(real_in);
fdput(real_out);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 83ef66644c21..c63b31a460be 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -32,10 +32,6 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (err)
return err;
- err = ovl_want_write(dentry);
- if (err)
- goto out;
-
if (attr->ia_valid & ATTR_SIZE) {
/* Truncate should trigger data copy up as well */
full_copy_up = true;
@@ -54,7 +50,7 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
winode = d_inode(upperdentry);
err = get_write_access(winode);
if (err)
- goto out_drop_write;
+ goto out;
}
if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
@@ -78,6 +74,10 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
*/
attr->ia_valid &= ~ATTR_OPEN;
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out_put_write;
+
inode_lock(upperdentry->d_inode);
old_cred = ovl_override_creds(dentry->d_sb);
err = ovl_do_notify_change(ofs, upperdentry, attr);
@@ -85,12 +85,12 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (!err)
ovl_copyattr(dentry->d_inode);
inode_unlock(upperdentry->d_inode);
+ ovl_drop_write(dentry);
+out_put_write:
if (winode)
put_write_access(winode);
}
-out_drop_write:
- ovl_drop_write(dentry);
out:
return err;
}
@@ -171,7 +171,7 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
type = ovl_path_real(dentry, &realpath);
old_cred = ovl_override_creds(dentry->d_sb);
- err = vfs_getattr(&realpath, stat, request_mask, flags);
+ err = ovl_do_getattr(&realpath, stat, request_mask, flags);
if (err)
goto out;
@@ -196,8 +196,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
(!is_dir ? STATX_NLINK : 0);
ovl_path_lower(dentry, &realpath);
- err = vfs_getattr(&realpath, &lowerstat,
- lowermask, flags);
+ err = ovl_do_getattr(&realpath, &lowerstat, lowermask,
+ flags);
if (err)
goto out;
@@ -249,8 +249,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
ovl_path_lowerdata(dentry, &realpath);
if (realpath.dentry) {
- err = vfs_getattr(&realpath, &lowerdatastat,
- lowermask, flags);
+ err = ovl_do_getattr(&realpath, &lowerdatastat,
+ lowermask, flags);
if (err)
goto out;
} else {
@@ -339,130 +339,6 @@ static const char *ovl_get_link(struct dentry *dentry,
return p;
}
-bool ovl_is_private_xattr(struct super_block *sb, const char *name)
-{
- struct ovl_fs *ofs = OVL_FS(sb);
-
- if (ofs->config.userxattr)
- return strncmp(name, OVL_XATTR_USER_PREFIX,
- sizeof(OVL_XATTR_USER_PREFIX) - 1) == 0;
- else
- return strncmp(name, OVL_XATTR_TRUSTED_PREFIX,
- sizeof(OVL_XATTR_TRUSTED_PREFIX) - 1) == 0;
-}
-
-int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
-{
- int err;
- struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
- struct dentry *upperdentry = ovl_i_dentry_upper(inode);
- struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
- struct path realpath;
- const struct cred *old_cred;
-
- err = ovl_want_write(dentry);
- if (err)
- goto out;
-
- if (!value && !upperdentry) {
- ovl_path_lower(dentry, &realpath);
- old_cred = ovl_override_creds(dentry->d_sb);
- err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0);
- revert_creds(old_cred);
- if (err < 0)
- goto out_drop_write;
- }
-
- if (!upperdentry) {
- err = ovl_copy_up(dentry);
- if (err)
- goto out_drop_write;
-
- realdentry = ovl_dentry_upper(dentry);
- }
-
- old_cred = ovl_override_creds(dentry->d_sb);
- if (value) {
- err = ovl_do_setxattr(ofs, realdentry, name, value, size,
- flags);
- } else {
- WARN_ON(flags != XATTR_REPLACE);
- err = ovl_do_removexattr(ofs, realdentry, name);
- }
- revert_creds(old_cred);
-
- /* copy c/mtime */
- ovl_copyattr(inode);
-
-out_drop_write:
- ovl_drop_write(dentry);
-out:
- return err;
-}
-
-int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
- void *value, size_t size)
-{
- ssize_t res;
- const struct cred *old_cred;
- struct path realpath;
-
- ovl_i_path_real(inode, &realpath);
- old_cred = ovl_override_creds(dentry->d_sb);
- res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size);
- revert_creds(old_cred);
- return res;
-}
-
-static bool ovl_can_list(struct super_block *sb, const char *s)
-{
- /* Never list private (.overlay) */
- if (ovl_is_private_xattr(sb, s))
- return false;
-
- /* List all non-trusted xattrs */
- if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0)
- return true;
-
- /* list other trusted for superuser only */
- return ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
-}
-
-ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
-{
- struct dentry *realdentry = ovl_dentry_real(dentry);
- ssize_t res;
- size_t len;
- char *s;
- const struct cred *old_cred;
-
- old_cred = ovl_override_creds(dentry->d_sb);
- res = vfs_listxattr(realdentry, list, size);
- revert_creds(old_cred);
- if (res <= 0 || size == 0)
- return res;
-
- /* filter out private xattrs */
- for (s = list, len = res; len;) {
- size_t slen = strnlen(s, len) + 1;
-
- /* underlying fs providing us with an broken xattr list? */
- if (WARN_ON(slen > len))
- return -EIO;
-
- len -= slen;
- if (!ovl_can_list(dentry->d_sb, s)) {
- res -= slen;
- memmove(s, s + slen, len);
- } else {
- s += slen;
- }
- }
-
- return res;
-}
-
#ifdef CONFIG_FS_POSIX_ACL
/*
* Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone
@@ -611,10 +487,6 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
struct dentry *upperdentry = ovl_dentry_upper(dentry);
struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
- err = ovl_want_write(dentry);
- if (err)
- return err;
-
/*
* If ACL is to be removed from a lower file, check if it exists in
* the first place before copying it up.
@@ -630,7 +502,7 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
revert_creds(old_cred);
if (IS_ERR(real_acl)) {
err = PTR_ERR(real_acl);
- goto out_drop_write;
+ goto out;
}
posix_acl_release(real_acl);
}
@@ -638,23 +510,26 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
if (!upperdentry) {
err = ovl_copy_up(dentry);
if (err)
- goto out_drop_write;
+ goto out;
realdentry = ovl_dentry_upper(dentry);
}
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
old_cred = ovl_override_creds(dentry->d_sb);
if (acl)
err = ovl_do_set_acl(ofs, realdentry, acl_name, acl);
else
err = ovl_do_remove_acl(ofs, realdentry, acl_name);
revert_creds(old_cred);
+ ovl_drop_write(dentry);
/* copy c/mtime */
ovl_copyattr(inode);
-
-out_drop_write:
- ovl_drop_write(dentry);
+out:
return err;
}
@@ -704,7 +579,8 @@ int ovl_update_time(struct inode *inode, int flags)
if (upperpath.dentry) {
touch_atime(&upperpath);
- inode->i_atime = d_inode(upperpath.dentry)->i_atime;
+ inode_set_atime_to_ts(inode,
+ inode_get_atime(d_inode(upperpath.dentry)));
}
}
return 0;
@@ -777,14 +653,14 @@ int ovl_fileattr_set(struct mnt_idmap *idmap,
unsigned int flags;
int err;
- err = ovl_want_write(dentry);
- if (err)
- goto out;
-
err = ovl_copy_up(dentry);
if (!err) {
ovl_path_real(dentry, &upperpath);
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
old_cred = ovl_override_creds(inode->i_sb);
/*
* Store immutable/append-only flags in xattr and clear them
@@ -797,6 +673,7 @@ int ovl_fileattr_set(struct mnt_idmap *idmap,
if (!err)
err = ovl_real_fileattr_set(&upperpath, fa);
revert_creds(old_cred);
+ ovl_drop_write(dentry);
/*
* Merge real inode flags with inode flags read from
@@ -811,7 +688,6 @@ int ovl_fileattr_set(struct mnt_idmap *idmap,
/* Update ctime */
ovl_copyattr(inode);
}
- ovl_drop_write(dentry);
out:
return err;
}
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 80391c687c2a..03bc8d5dfa31 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -251,7 +251,10 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
err = -EREMOTE;
goto out_err;
}
- if (ovl_is_whiteout(this)) {
+
+ path.dentry = this;
+ path.mnt = d->mnt;
+ if (ovl_path_is_whiteout(OVL_FS(d->sb), &path)) {
d->stop = d->opaque = true;
goto put_and_out;
}
@@ -264,8 +267,6 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
goto put_and_out;
}
- path.dentry = this;
- path.mnt = d->mnt;
if (!d_can_lookup(this)) {
if (d->is_dir || !last_element) {
d->stop = true;
@@ -438,7 +439,7 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
else if (IS_ERR(origin))
return PTR_ERR(origin);
- if (upperdentry && !ovl_is_whiteout(upperdentry) &&
+ if (upperdentry && !ovl_upper_is_whiteout(ofs, upperdentry) &&
inode_wrong_type(d_inode(upperdentry), d_inode(origin)->i_mode))
goto invalid;
@@ -507,6 +508,19 @@ static int ovl_verify_fh(struct ovl_fs *ofs, struct dentry *dentry,
return err;
}
+int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox, const struct ovl_fh *fh,
+ bool is_upper, bool set)
+{
+ int err;
+
+ err = ovl_verify_fh(ofs, dentry, ox, fh);
+ if (set && err == -ENODATA)
+ err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len);
+
+ return err;
+}
+
/*
* Verify that @real dentry matches the file handle stored in xattr @name.
*
@@ -515,9 +529,9 @@ static int ovl_verify_fh(struct ovl_fs *ofs, struct dentry *dentry,
*
* Return 0 on match, -ESTALE on mismatch, -ENODATA on no xattr, < 0 on error.
*/
-int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
- enum ovl_xattr ox, struct dentry *real, bool is_upper,
- bool set)
+int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox, struct dentry *real,
+ bool is_upper, bool set)
{
struct inode *inode;
struct ovl_fh *fh;
@@ -530,9 +544,7 @@ int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
goto fail;
}
- err = ovl_verify_fh(ofs, dentry, ox, fh);
- if (set && err == -ENODATA)
- err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len);
+ err = ovl_verify_set_fh(ofs, dentry, ox, fh, is_upper, set);
if (err)
goto fail;
@@ -548,6 +560,7 @@ fail:
goto out;
}
+
/* Get upper dentry from index */
struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index,
bool connected)
@@ -684,7 +697,7 @@ orphan:
goto out;
}
-static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name)
+int ovl_get_index_name_fh(const struct ovl_fh *fh, struct qstr *name)
{
char *n, *s;
@@ -873,20 +886,27 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry,
struct dentry *lower, struct dentry *upper)
{
+ const struct ovl_fh *fh;
int err;
if (ovl_check_origin_xattr(ofs, upper))
return 0;
+ fh = ovl_get_origin_fh(ofs, lower);
+ if (IS_ERR(fh))
+ return PTR_ERR(fh);
+
err = ovl_want_write(dentry);
if (err)
- return err;
+ goto out;
- err = ovl_set_origin(ofs, lower, upper);
+ err = ovl_set_origin_fh(ofs, fh, upper);
if (!err)
err = ovl_set_impure(dentry->d_parent, upper->d_parent);
ovl_drop_write(dentry);
+out:
+ kfree(fh);
return err;
}
@@ -1383,7 +1403,11 @@ bool ovl_lower_positive(struct dentry *dentry)
break;
}
} else {
- positive = !ovl_is_whiteout(this);
+ struct path path = {
+ .dentry = this,
+ .mnt = parentpath->layer->mnt,
+ };
+ positive = !ovl_path_is_whiteout(OVL_FS(dentry->d_sb), &path);
done = true;
dput(this);
}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 9817b2dcb132..05c3dd597fa8 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -28,7 +28,16 @@ enum ovl_path_type {
#define OVL_XATTR_NAMESPACE "overlay."
#define OVL_XATTR_TRUSTED_PREFIX XATTR_TRUSTED_PREFIX OVL_XATTR_NAMESPACE
+#define OVL_XATTR_TRUSTED_PREFIX_LEN (sizeof(OVL_XATTR_TRUSTED_PREFIX) - 1)
#define OVL_XATTR_USER_PREFIX XATTR_USER_PREFIX OVL_XATTR_NAMESPACE
+#define OVL_XATTR_USER_PREFIX_LEN (sizeof(OVL_XATTR_USER_PREFIX) - 1)
+
+#define OVL_XATTR_ESCAPE_PREFIX OVL_XATTR_NAMESPACE
+#define OVL_XATTR_ESCAPE_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_PREFIX) - 1)
+#define OVL_XATTR_ESCAPE_TRUSTED_PREFIX OVL_XATTR_TRUSTED_PREFIX OVL_XATTR_ESCAPE_PREFIX
+#define OVL_XATTR_ESCAPE_TRUSTED_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_TRUSTED_PREFIX) - 1)
+#define OVL_XATTR_ESCAPE_USER_PREFIX OVL_XATTR_USER_PREFIX OVL_XATTR_ESCAPE_PREFIX
+#define OVL_XATTR_ESCAPE_USER_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_USER_PREFIX) - 1)
enum ovl_xattr {
OVL_XATTR_OPAQUE,
@@ -40,6 +49,8 @@ enum ovl_xattr {
OVL_XATTR_UUID,
OVL_XATTR_METACOPY,
OVL_XATTR_PROTATTR,
+ OVL_XATTR_XWHITEOUT,
+ OVL_XATTR_XWHITEOUTS,
};
enum ovl_inode_flag {
@@ -397,7 +408,19 @@ static inline bool ovl_open_flags_need_copy_up(int flags)
return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC));
}
+static inline int ovl_do_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags)
+{
+ if (flags & AT_GETATTR_NOSEC)
+ return vfs_getattr_nosec(path, stat, request_mask, flags);
+ return vfs_getattr(path, stat, request_mask, flags);
+}
+
/* util.c */
+int ovl_get_write_access(struct dentry *dentry);
+void ovl_put_write_access(struct dentry *dentry);
+void ovl_start_write(struct dentry *dentry);
+void ovl_end_write(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
struct dentry *ovl_workdir(struct dentry *dentry);
@@ -460,6 +483,7 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry);
void ovl_dir_modified(struct dentry *dentry, bool impurity);
u64 ovl_inode_version_get(struct inode *inode);
bool ovl_is_whiteout(struct dentry *dentry);
+bool ovl_path_is_whiteout(struct ovl_fs *ofs, const struct path *path);
struct file *ovl_path_open(const struct path *path, int flags);
int ovl_copy_up_start(struct dentry *dentry, int flags);
void ovl_copy_up_end(struct dentry *dentry);
@@ -467,9 +491,21 @@ bool ovl_already_copied_up(struct dentry *dentry, int flags);
bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
enum ovl_xattr ox);
bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path);
+bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path);
+bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path);
bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs,
const struct path *upperpath);
+static inline bool ovl_upper_is_whiteout(struct ovl_fs *ofs,
+ struct dentry *upperdentry)
+{
+ struct path upperpath = {
+ .dentry = upperdentry,
+ .mnt = ovl_upper_mnt(ofs),
+ };
+ return ovl_path_is_whiteout(ofs, &upperpath);
+}
+
static inline bool ovl_check_origin_xattr(struct ovl_fs *ofs,
struct dentry *upperdentry)
{
@@ -624,11 +660,15 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
struct dentry *upperdentry, struct ovl_path **stackp);
int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
- enum ovl_xattr ox, struct dentry *real, bool is_upper,
- bool set);
+ enum ovl_xattr ox, const struct ovl_fh *fh,
+ bool is_upper, bool set);
+int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox, struct dentry *real,
+ bool is_upper, bool set);
struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index,
bool connected);
int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index);
+int ovl_get_index_name_fh(const struct ovl_fh *fh, struct qstr *name);
int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
struct qstr *name);
struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh);
@@ -640,17 +680,24 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
bool ovl_lower_positive(struct dentry *dentry);
+static inline int ovl_verify_origin_fh(struct ovl_fs *ofs, struct dentry *upper,
+ const struct ovl_fh *fh, bool set)
+{
+ return ovl_verify_set_fh(ofs, upper, OVL_XATTR_ORIGIN, fh, false, set);
+}
+
static inline int ovl_verify_origin(struct ovl_fs *ofs, struct dentry *upper,
struct dentry *origin, bool set)
{
- return ovl_verify_set_fh(ofs, upper, OVL_XATTR_ORIGIN, origin,
- false, set);
+ return ovl_verify_origin_xattr(ofs, upper, OVL_XATTR_ORIGIN, origin,
+ false, set);
}
static inline int ovl_verify_upper(struct ovl_fs *ofs, struct dentry *index,
struct dentry *upper, bool set)
{
- return ovl_verify_set_fh(ofs, index, OVL_XATTR_UPPER, upper, true, set);
+ return ovl_verify_origin_xattr(ofs, index, OVL_XATTR_UPPER, upper,
+ true, set);
}
/* readdir.c */
@@ -684,17 +731,8 @@ int ovl_set_nlink_lower(struct dentry *dentry);
unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry,
struct dentry *upperdentry,
unsigned int fallback);
-int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
- struct iattr *attr);
-int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
- struct kstat *stat, u32 request_mask, unsigned int flags);
int ovl_permission(struct mnt_idmap *idmap, struct inode *inode,
int mask);
-int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
- const void *value, size_t size, int flags);
-int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
- void *value, size_t size);
-ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap,
@@ -815,8 +853,9 @@ int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentr
int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat);
struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
bool is_upper);
-int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower,
- struct dentry *upper);
+struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin);
+int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh,
+ struct dentry *upper);
/* export.c */
extern const struct export_operations ovl_export_operations;
@@ -830,3 +869,12 @@ static inline bool ovl_force_readonly(struct ovl_fs *ofs)
{
return (!ovl_upper_mnt(ofs) || !ofs->workdir);
}
+
+/* xattr.c */
+
+const struct xattr_handler * const *ovl_xattr_handlers(struct ovl_fs *ofs);
+int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr);
+int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask, unsigned int flags);
+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index f6ff23fd101c..3fe2dde1598f 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -43,8 +43,10 @@ module_param_named(metacopy, ovl_metacopy_def, bool, 0644);
MODULE_PARM_DESC(metacopy,
"Default to on or off for the metadata only copy up feature");
-enum {
+enum ovl_opt {
Opt_lowerdir,
+ Opt_lowerdir_add,
+ Opt_datadir_add,
Opt_upperdir,
Opt_workdir,
Opt_default_permissions,
@@ -140,8 +142,11 @@ static int ovl_verity_mode_def(void)
#define fsparam_string_empty(NAME, OPT) \
__fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL)
+
const struct fs_parameter_spec ovl_parameter_spec[] = {
fsparam_string_empty("lowerdir", Opt_lowerdir),
+ fsparam_string("lowerdir+", Opt_lowerdir_add),
+ fsparam_string("datadir+", Opt_datadir_add),
fsparam_string("upperdir", Opt_upperdir),
fsparam_string("workdir", Opt_workdir),
fsparam_flag("default_permissions", Opt_default_permissions),
@@ -238,19 +243,8 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path)
pr_err("failed to resolve '%s': %i\n", name, err);
goto out;
}
- err = -EINVAL;
- if (ovl_dentry_weird(path->dentry)) {
- pr_err("filesystem on '%s' not supported\n", name);
- goto out_put;
- }
- if (!d_is_dir(path->dentry)) {
- pr_err("'%s' not a directory\n", name);
- goto out_put;
- }
return 0;
-out_put:
- path_put_init(path);
out:
return err;
}
@@ -268,7 +262,7 @@ static void ovl_unescape(char *s)
}
}
-static int ovl_mount_dir(const char *name, struct path *path, bool upper)
+static int ovl_mount_dir(const char *name, struct path *path)
{
int err = -ENOMEM;
char *tmp = kstrdup(name, GFP_KERNEL);
@@ -276,68 +270,147 @@ static int ovl_mount_dir(const char *name, struct path *path, bool upper)
if (tmp) {
ovl_unescape(tmp);
err = ovl_mount_dir_noesc(tmp, path);
-
- if (!err && upper && path->dentry->d_flags & DCACHE_OP_REAL) {
- pr_err("filesystem on '%s' not supported as upperdir\n",
- tmp);
- path_put_init(path);
- err = -EINVAL;
- }
kfree(tmp);
}
return err;
}
-static int ovl_parse_param_upperdir(const char *name, struct fs_context *fc,
- bool workdir)
+static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path,
+ enum ovl_opt layer, const char *name, bool upper)
{
- int err;
- struct ovl_fs *ofs = fc->s_fs_info;
- struct ovl_config *config = &ofs->config;
struct ovl_fs_context *ctx = fc->fs_private;
- struct path path;
- char *dup;
- err = ovl_mount_dir(name, &path, true);
- if (err)
- return err;
+ if (ovl_dentry_weird(path->dentry))
+ return invalfc(fc, "filesystem on %s not supported", name);
+
+ if (!d_is_dir(path->dentry))
+ return invalfc(fc, "%s is not a directory", name);
+
/*
* Check whether upper path is read-only here to report failures
* early. Don't forget to recheck when the superblock is created
* as the mount attributes could change.
*/
- if (__mnt_is_readonly(path.mnt)) {
- path_put(&path);
- return -EINVAL;
+ if (upper) {
+ if (path->dentry->d_flags & DCACHE_OP_REAL)
+ return invalfc(fc, "filesystem on %s not supported as upperdir", name);
+ if (__mnt_is_readonly(path->mnt))
+ return invalfc(fc, "filesystem on %s is read-only", name);
+ } else {
+ if (ctx->lowerdir_all && layer != Opt_lowerdir)
+ return invalfc(fc, "lowerdir+ and datadir+ cannot follow lowerdir");
+ if (ctx->nr_data && layer == Opt_lowerdir_add)
+ return invalfc(fc, "regular lower layers cannot follow data layers");
+ if (ctx->nr == OVL_MAX_STACK)
+ return invalfc(fc, "too many lower directories, limit is %d",
+ OVL_MAX_STACK);
}
+ return 0;
+}
- dup = kstrdup(name, GFP_KERNEL);
- if (!dup) {
- path_put(&path);
+static int ovl_ctx_realloc_lower(struct fs_context *fc)
+{
+ struct ovl_fs_context *ctx = fc->fs_private;
+ struct ovl_fs_context_layer *l;
+ size_t nr;
+
+ if (ctx->nr < ctx->capacity)
+ return 0;
+
+ nr = min_t(size_t, max(4096 / sizeof(*l), ctx->capacity * 2),
+ OVL_MAX_STACK);
+ l = krealloc_array(ctx->lower, nr, sizeof(*l), GFP_KERNEL_ACCOUNT);
+ if (!l)
return -ENOMEM;
+
+ ctx->lower = l;
+ ctx->capacity = nr;
+ return 0;
+}
+
+static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer,
+ struct path *path, char **pname)
+{
+ struct ovl_fs *ofs = fc->s_fs_info;
+ struct ovl_config *config = &ofs->config;
+ struct ovl_fs_context *ctx = fc->fs_private;
+ struct ovl_fs_context_layer *l;
+
+ switch (layer) {
+ case Opt_workdir:
+ swap(config->workdir, *pname);
+ swap(ctx->work, *path);
+ break;
+ case Opt_upperdir:
+ swap(config->upperdir, *pname);
+ swap(ctx->upper, *path);
+ break;
+ case Opt_datadir_add:
+ ctx->nr_data++;
+ fallthrough;
+ case Opt_lowerdir_add:
+ WARN_ON(ctx->nr >= ctx->capacity);
+ l = &ctx->lower[ctx->nr++];
+ memset(l, 0, sizeof(*l));
+ swap(l->name, *pname);
+ swap(l->path, *path);
+ break;
+ default:
+ WARN_ON(1);
}
+}
- if (workdir) {
- kfree(config->workdir);
- config->workdir = dup;
- path_put(&ctx->work);
- ctx->work = path;
- } else {
- kfree(config->upperdir);
- config->upperdir = dup;
- path_put(&ctx->upper);
- ctx->upper = path;
+static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param,
+ enum ovl_opt layer)
+{
+ char *name = kstrdup(param->string, GFP_KERNEL);
+ bool upper = (layer == Opt_upperdir || layer == Opt_workdir);
+ struct path path;
+ int err;
+
+ if (!name)
+ return -ENOMEM;
+
+ if (upper)
+ err = ovl_mount_dir(name, &path);
+ else
+ err = ovl_mount_dir_noesc(name, &path);
+ if (err)
+ goto out_free;
+
+ err = ovl_mount_dir_check(fc, &path, layer, name, upper);
+ if (err)
+ goto out_put;
+
+ if (!upper) {
+ err = ovl_ctx_realloc_lower(fc);
+ if (err)
+ goto out_put;
}
- return 0;
+
+ /* Store the user provided path string in ctx to show in mountinfo */
+ ovl_add_layer(fc, layer, &path, &name);
+
+out_put:
+ path_put(&path);
+out_free:
+ kfree(name);
+ return err;
}
-static void ovl_parse_param_drop_lowerdir(struct ovl_fs_context *ctx)
+static void ovl_reset_lowerdirs(struct ovl_fs_context *ctx)
{
- for (size_t nr = 0; nr < ctx->nr; nr++) {
- path_put(&ctx->lower[nr].path);
- kfree(ctx->lower[nr].name);
- ctx->lower[nr].name = NULL;
+ struct ovl_fs_context_layer *l = ctx->lower;
+
+ // Reset old user provided lowerdir string
+ kfree(ctx->lowerdir_all);
+ ctx->lowerdir_all = NULL;
+
+ for (size_t nr = 0; nr < ctx->nr; nr++, l++) {
+ path_put(&l->path);
+ kfree(l->name);
+ l->name = NULL;
}
ctx->nr = 0;
ctx->nr_data = 0;
@@ -346,7 +419,7 @@ static void ovl_parse_param_drop_lowerdir(struct ovl_fs_context *ctx)
/*
* Parse lowerdir= mount option:
*
- * (1) lowerdir=/lower1:/lower2:/lower3::/data1::/data2
+ * e.g.: lowerdir=/lower1:/lower2:/lower3::/data1::/data2
* Set "/lower1", "/lower2", and "/lower3" as lower layers and
* "/data1" and "/data2" as data lower layers. Any existing lower
* layers are replaced.
@@ -356,9 +429,9 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
int err;
struct ovl_fs_context *ctx = fc->fs_private;
struct ovl_fs_context_layer *l;
- char *dup = NULL, *dup_iter;
- ssize_t nr_lower = 0, nr = 0, nr_data = 0;
- bool append = false, data_layer = false;
+ char *dup = NULL, *iter;
+ ssize_t nr_lower, nr;
+ bool data_layer = false;
/*
* Ensure we're backwards compatible with mount(2)
@@ -366,16 +439,21 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
*/
/* drop all existing lower layers */
- if (!*name) {
- ovl_parse_param_drop_lowerdir(ctx);
+ ovl_reset_lowerdirs(ctx);
+
+ if (!*name)
return 0;
- }
if (*name == ':') {
pr_err("cannot append lower layer");
return -EINVAL;
}
+ // Store user provided lowerdir string to show in mount options
+ ctx->lowerdir_all = kstrdup(name, GFP_KERNEL);
+ if (!ctx->lowerdir_all)
+ return -ENOMEM;
+
dup = kstrdup(name, GFP_KERNEL);
if (!dup)
return -ENOMEM;
@@ -385,36 +463,11 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
if (nr_lower < 0)
goto out_err;
- if ((nr_lower > OVL_MAX_STACK) ||
- (append && (size_add(ctx->nr, nr_lower) > OVL_MAX_STACK))) {
+ if (nr_lower > OVL_MAX_STACK) {
pr_err("too many lower directories, limit is %d\n", OVL_MAX_STACK);
goto out_err;
}
- if (!append)
- ovl_parse_param_drop_lowerdir(ctx);
-
- /*
- * (1) append
- *
- * We want nr <= nr_lower <= capacity We know nr > 0 and nr <=
- * capacity. If nr == 0 this wouldn't be append. If nr +
- * nr_lower is <= capacity then nr <= nr_lower <= capacity
- * already holds. If nr + nr_lower exceeds capacity, we realloc.
- *
- * (2) replace
- *
- * Ensure we're backwards compatible with mount(2) which allows
- * "lowerdir=/a:/b:/c,lowerdir=/d:/e:/f" causing the last
- * specified lowerdir mount option to win.
- *
- * We want nr <= nr_lower <= capacity We know either (i) nr == 0
- * or (ii) nr > 0. We also know nr_lower > 0. The capacity
- * could've been changed multiple times already so we only know
- * nr <= capacity. If nr + nr_lower > capacity we realloc,
- * otherwise nr <= nr_lower <= capacity holds already.
- */
- nr_lower += ctx->nr;
if (nr_lower > ctx->capacity) {
err = -ENOMEM;
l = krealloc_array(ctx->lower, nr_lower, sizeof(*ctx->lower),
@@ -426,59 +479,40 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
ctx->capacity = nr_lower;
}
- /*
- * (3) By (1) and (2) we know nr <= nr_lower <= capacity.
- * (4) If ctx->nr == 0 => replace
- * We have verified above that the lowerdir mount option
- * isn't an append, i.e., the lowerdir mount option
- * doesn't start with ":" or "::".
- * (4.1) The lowerdir mount options only contains regular lower
- * layers ":".
- * => Nothing to verify.
- * (4.2) The lowerdir mount options contains regular ":" and
- * data "::" layers.
- * => We need to verify that data lower layers "::" aren't
- * followed by regular ":" lower layers
- * (5) If ctx->nr > 0 => append
- * We know that there's at least one regular layer
- * otherwise we would've failed when parsing the previous
- * lowerdir mount option.
- * (5.1) The lowerdir mount option is a regular layer ":" append
- * => We need to verify that no data layers have been
- * specified before.
- * (5.2) The lowerdir mount option is a data layer "::" append
- * We know that there's at least one regular layer or
- * other data layers. => There's nothing to verify.
- */
- dup_iter = dup;
- for (nr = ctx->nr; nr < nr_lower; nr++) {
- l = &ctx->lower[nr];
+ iter = dup;
+ l = ctx->lower;
+ for (nr = 0; nr < nr_lower; nr++, l++) {
+ ctx->nr++;
memset(l, 0, sizeof(*l));
- err = ovl_mount_dir(dup_iter, &l->path, false);
+ err = ovl_mount_dir(iter, &l->path);
+ if (err)
+ goto out_put;
+
+ err = ovl_mount_dir_check(fc, &l->path, Opt_lowerdir, iter, false);
if (err)
goto out_put;
err = -ENOMEM;
- l->name = kstrdup(dup_iter, GFP_KERNEL_ACCOUNT);
+ l->name = kstrdup(iter, GFP_KERNEL_ACCOUNT);
if (!l->name)
goto out_put;
if (data_layer)
- nr_data++;
+ ctx->nr_data++;
/* Calling strchr() again would overrun. */
- if ((nr + 1) == nr_lower)
+ if (ctx->nr == nr_lower)
break;
err = -EINVAL;
- dup_iter = strchr(dup_iter, '\0') + 1;
- if (*dup_iter) {
+ iter = strchr(iter, '\0') + 1;
+ if (*iter) {
/*
* This is a regular layer so we require that
* there are no data layers.
*/
- if ((ctx->nr_data + nr_data) > 0) {
+ if (ctx->nr_data > 0) {
pr_err("regular lower layers cannot follow data lower layers");
goto out_put;
}
@@ -489,29 +523,13 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
/* This is a data lower layer. */
data_layer = true;
- dup_iter++;
+ iter++;
}
- ctx->nr = nr_lower;
- ctx->nr_data += nr_data;
kfree(dup);
return 0;
out_put:
- /*
- * We know nr >= ctx->nr < nr_lower. If we failed somewhere
- * we want to undo until nr == ctx->nr. This is correct for
- * both ctx->nr == 0 and ctx->nr > 0.
- */
- for (; nr >= ctx->nr; nr--) {
- l = &ctx->lower[nr];
- kfree(l->name);
- l->name = NULL;
- path_put(&l->path);
-
- /* don't overflow */
- if (nr == 0)
- break;
- }
+ ovl_reset_lowerdirs(ctx);
out_err:
kfree(dup);
@@ -556,11 +574,11 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_lowerdir:
err = ovl_parse_param_lowerdir(param->string, fc);
break;
+ case Opt_lowerdir_add:
+ case Opt_datadir_add:
case Opt_upperdir:
- fallthrough;
case Opt_workdir:
- err = ovl_parse_param_upperdir(param->string, fc,
- (Opt_workdir == opt));
+ err = ovl_parse_layer(fc, param, opt);
break;
case Opt_default_permissions:
config->default_permissions = true;
@@ -617,7 +635,7 @@ static int ovl_get_tree(struct fs_context *fc)
static inline void ovl_fs_context_free(struct ovl_fs_context *ctx)
{
- ovl_parse_param_drop_lowerdir(ctx);
+ ovl_reset_lowerdirs(ctx);
path_put(&ctx->upper);
path_put(&ctx->work);
kfree(ctx->lower);
@@ -933,23 +951,28 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry)
{
struct super_block *sb = dentry->d_sb;
struct ovl_fs *ofs = OVL_FS(sb);
- size_t nr, nr_merged_lower = ofs->numlayer - ofs->numdatalayer;
+ size_t nr, nr_merged_lower, nr_lower = 0;
+ char **lowerdirs = ofs->config.lowerdirs;
/*
- * lowerdirs[] starts from offset 1, then
- * >= 0 regular lower layers prefixed with : and
- * >= 0 data-only lower layers prefixed with ::
- *
-	 * we need to escape comma and space like seq_show_option() does and
- * we also need to escape the colon separator from lowerdir paths.
+	 * lowerdirs[0] holds the colon-separated list that the user provided
+	 * with the lowerdir mount option.
+ * lowerdirs[1..numlayer] hold the lowerdir paths that were added
+ * using the lowerdir+ and datadir+ mount options.
+ * For now, we do not allow mixing the legacy lowerdir mount option
+ * with the new lowerdir+ and datadir+ mount options.
*/
- seq_puts(m, ",lowerdir=");
- for (nr = 1; nr < ofs->numlayer; nr++) {
- if (nr > 1)
- seq_putc(m, ':');
- if (nr >= nr_merged_lower)
- seq_putc(m, ':');
- seq_escape(m, ofs->config.lowerdirs[nr], ":, \t\n\\");
+ if (lowerdirs[0]) {
+ seq_show_option(m, "lowerdir", lowerdirs[0]);
+ } else {
+ nr_lower = ofs->numlayer;
+ nr_merged_lower = nr_lower - ofs->numdatalayer;
+ }
+ for (nr = 1; nr < nr_lower; nr++) {
+ if (nr < nr_merged_lower)
+ seq_show_option(m, "lowerdir+", lowerdirs[nr]);
+ else
+ seq_show_option(m, "datadir+", lowerdirs[nr]);
}
if (ofs->config.upperdir) {
seq_show_option(m, "upperdir", ofs->config.upperdir);
diff --git a/fs/overlayfs/params.h b/fs/overlayfs/params.h
index 8750da68ab2a..c96d93982021 100644
--- a/fs/overlayfs/params.h
+++ b/fs/overlayfs/params.h
@@ -32,6 +32,7 @@ struct ovl_fs_context {
size_t nr_data;
struct ovl_opt_set set;
struct ovl_fs_context_layer *lower;
+ char *lowerdir_all; /* user provided lowerdir string */
};
int ovl_init_fs_context(struct fs_context *fc);
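
The new lowerdir+/datadir+ options are passed once per layer. As an illustrative userspace sketch (not part of this patch), this is how they could be used through the new mount API; paths and the mountpoint are made up, a libc exposing fsopen()/fsconfig()/fsmount()/move_mount() (glibc >= 2.36) is assumed, and error checking of the per-option fsconfig() calls is omitted for brevity:

/*
 * Illustrative only: stack layers with lowerdir+/datadir+ via fsconfig().
 */
#define _GNU_SOURCE
#include <sys/mount.h>
#include <fcntl.h>
#include <err.h>

int main(void)
{
	int fsfd, mntfd;

	fsfd = fsopen("overlay", FSOPEN_CLOEXEC);
	if (fsfd < 0)
		err(1, "fsopen");

	/* each lowerdir+ instance appends one regular lower layer, in order */
	fsconfig(fsfd, FSCONFIG_SET_STRING, "lowerdir+", "/lower1", 0);
	fsconfig(fsfd, FSCONFIG_SET_STRING, "lowerdir+", "/lower2", 0);
	/* datadir+ appends a data-only layer; data-only layers need metacopy */
	fsconfig(fsfd, FSCONFIG_SET_STRING, "metacopy", "on", 0);
	fsconfig(fsfd, FSCONFIG_SET_STRING, "datadir+", "/data1", 0);
	fsconfig(fsfd, FSCONFIG_SET_STRING, "upperdir", "/upper", 0);
	fsconfig(fsfd, FSCONFIG_SET_STRING, "workdir", "/work", 0);
	if (fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0)
		err(1, "fsconfig create");

	mntfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
	if (mntfd < 0)
		err(1, "fsmount");
	if (move_mount(mntfd, "", AT_FDCWD, "/mnt/merged",
		       MOVE_MOUNT_F_EMPTY_PATH) < 0)
		err(1, "move_mount");
	return 0;
}
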
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index de39e067ae65..a490fc47c3e7 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -25,6 +25,7 @@ struct ovl_cache_entry {
struct ovl_cache_entry *next_maybe_whiteout;
bool is_upper;
bool is_whiteout;
+ bool check_xwhiteout;
char name[];
};
@@ -47,6 +48,7 @@ struct ovl_readdir_data {
int err;
bool is_upper;
bool d_type_supported;
+ bool in_xwhiteouts_dir;
};
struct ovl_dir_file {
@@ -162,6 +164,8 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
p->ino = 0;
p->is_upper = rdd->is_upper;
p->is_whiteout = false;
+ /* Defer check for overlay.whiteout to ovl_iterate() */
+ p->check_xwhiteout = rdd->in_xwhiteouts_dir && d_type == DT_REG;
if (d_type == DT_CHR) {
p->next_maybe_whiteout = rdd->first_maybe_whiteout;
@@ -301,6 +305,8 @@ static inline int ovl_dir_read(const struct path *realpath,
if (IS_ERR(realfile))
return PTR_ERR(realfile);
+ rdd->in_xwhiteouts_dir = rdd->dentry &&
+ ovl_path_check_xwhiteouts_xattr(OVL_FS(rdd->dentry->d_sb), realpath);
rdd->first_maybe_whiteout = NULL;
rdd->ctx.pos = 0;
do {
@@ -447,7 +453,7 @@ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid,
}
/*
- * Set d_ino for upper entries. Non-upper entries should always report
+ * Set d_ino for upper entries if needed. Non-upper entries should always report
* the uppermost real inode ino and should not call this function.
*
* When not all layer are on same fs, report real ino also for upper.
@@ -455,8 +461,11 @@ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid,
* When all layers are on the same fs, and upper has a reference to
* copy up origin, call vfs_getattr() on the overlay entry to make
* sure that d_ino will be consistent with st_ino from stat(2).
+ *
+ * Also checks for the overlay.whiteout xattr by doing a full lookup, which
+ * returns a negative dentry for such entries.
*/
-static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry *p)
+static int ovl_cache_update(const struct path *path, struct ovl_cache_entry *p, bool update_ino)
{
struct dentry *dir = path->dentry;
@@ -467,7 +476,7 @@ static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry
int xinobits = ovl_xino_bits(ofs);
int err = 0;
- if (!ovl_same_dev(ofs))
+ if (!ovl_same_dev(ofs) && !p->check_xwhiteout)
goto out;
if (p->name[0] == '.') {
@@ -481,6 +490,7 @@ static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry
goto get;
}
}
+ /* This checks also for xwhiteouts */
this = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len);
if (IS_ERR_OR_NULL(this) || !this->d_inode) {
/* Mark a stale entry */
@@ -494,6 +504,9 @@ static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry
}
get:
+ if (!ovl_same_dev(ofs) || !update_ino)
+ goto out;
+
type = ovl_path_type(this);
if (OVL_TYPE_ORIGIN(type)) {
struct kstat stat;
@@ -572,7 +585,7 @@ static int ovl_dir_read_impure(const struct path *path, struct list_head *list,
list_for_each_entry_safe(p, n, list, l_node) {
if (strcmp(p->name, ".") != 0 &&
strcmp(p->name, "..") != 0) {
- err = ovl_cache_update_ino(path, p);
+ err = ovl_cache_update(path, p, true);
if (err)
return err;
}
@@ -778,13 +791,13 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
while (od->cursor != &od->cache->entries) {
p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
if (!p->is_whiteout) {
- if (!p->ino) {
- err = ovl_cache_update_ino(&file->f_path, p);
+ if (!p->ino || p->check_xwhiteout) {
+ err = ovl_cache_update(&file->f_path, p, !p->ino);
if (err)
goto out;
}
}
- /* ovl_cache_update_ino() sets is_whiteout on stale entry */
+ /* ovl_cache_update() sets is_whiteout on stale entry */
if (!p->is_whiteout) {
if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
break;
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 3fa2416264a4..a0967bb25003 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -34,14 +34,22 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
struct dentry *real = NULL, *lower;
int err;
- /* It's an overlay file */
+ /*
+ * vfs is only expected to call d_real() with NULL from d_real_inode()
+ * and with overlay inode from file_dentry() on an overlay file.
+ *
+ * TODO: remove @inode argument from d_real() API, remove code in this
+ * function that deals with non-NULL @inode and remove d_real() call
+ * from file_dentry().
+ */
if (inode && d_inode(dentry) == inode)
return dentry;
+ else if (inode)
+ goto bug;
if (!d_is_reg(dentry)) {
- if (!inode || inode == d_inode(dentry))
- return dentry;
- goto bug;
+ /* d_real_inode() is only relevant for regular files */
+ return dentry;
}
real = ovl_dentry_upper(dentry);
@@ -437,68 +445,6 @@ static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir)
return ok;
}
-static int ovl_own_xattr_get(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *inode,
- const char *name, void *buffer, size_t size)
-{
- return -EOPNOTSUPP;
-}
-
-static int ovl_own_xattr_set(const struct xattr_handler *handler,
- struct mnt_idmap *idmap,
- struct dentry *dentry, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- return -EOPNOTSUPP;
-}
-
-static int ovl_other_xattr_get(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *inode,
- const char *name, void *buffer, size_t size)
-{
- return ovl_xattr_get(dentry, inode, name, buffer, size);
-}
-
-static int ovl_other_xattr_set(const struct xattr_handler *handler,
- struct mnt_idmap *idmap,
- struct dentry *dentry, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- return ovl_xattr_set(dentry, inode, name, value, size, flags);
-}
-
-static const struct xattr_handler ovl_own_trusted_xattr_handler = {
- .prefix = OVL_XATTR_TRUSTED_PREFIX,
- .get = ovl_own_xattr_get,
- .set = ovl_own_xattr_set,
-};
-
-static const struct xattr_handler ovl_own_user_xattr_handler = {
- .prefix = OVL_XATTR_USER_PREFIX,
- .get = ovl_own_xattr_get,
- .set = ovl_own_xattr_set,
-};
-
-static const struct xattr_handler ovl_other_xattr_handler = {
- .prefix = "", /* catch all */
- .get = ovl_other_xattr_get,
- .set = ovl_other_xattr_set,
-};
-
-static const struct xattr_handler *ovl_trusted_xattr_handlers[] = {
- &ovl_own_trusted_xattr_handler,
- &ovl_other_xattr_handler,
- NULL
-};
-
-static const struct xattr_handler *ovl_user_xattr_handlers[] = {
- &ovl_own_user_xattr_handler,
- &ovl_other_xattr_handler,
- NULL
-};
-
static int ovl_setup_trap(struct super_block *sb, struct dentry *dir,
struct inode **ptrap, const char *name)
{
@@ -639,7 +585,7 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs)
if (IS_ERR(whiteout))
goto cleanup_temp;
- err = ovl_is_whiteout(whiteout);
+ err = ovl_upper_is_whiteout(ofs, whiteout);
/* Best effort cleanup of whiteout and temp file */
if (err)
@@ -879,15 +825,20 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
{
struct vfsmount *mnt = ovl_upper_mnt(ofs);
struct dentry *indexdir;
+ struct dentry *origin = ovl_lowerstack(oe)->dentry;
+ const struct ovl_fh *fh;
int err;
+ fh = ovl_get_origin_fh(ofs, origin);
+ if (IS_ERR(fh))
+ return PTR_ERR(fh);
+
err = mnt_want_write(mnt);
if (err)
- return err;
+ goto out_free_fh;
/* Verify lower root is upper root origin */
- err = ovl_verify_origin(ofs, upperpath->dentry,
- ovl_lowerstack(oe)->dentry, true);
+ err = ovl_verify_origin_fh(ofs, upperpath->dentry, fh, true);
if (err) {
pr_err("failed to verify upper root origin\n");
goto out;
@@ -919,9 +870,10 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
* directory entries.
*/
if (ovl_check_origin_xattr(ofs, ofs->indexdir)) {
- err = ovl_verify_set_fh(ofs, ofs->indexdir,
- OVL_XATTR_ORIGIN,
- upperpath->dentry, true, false);
+ err = ovl_verify_origin_xattr(ofs, ofs->indexdir,
+ OVL_XATTR_ORIGIN,
+ upperpath->dentry, true,
+ false);
if (err)
pr_err("failed to verify index dir 'origin' xattr\n");
}
@@ -939,6 +891,8 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
out:
mnt_drop_write(mnt);
+out_free_fh:
+ kfree(fh);
return err;
}
@@ -1374,8 +1328,11 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
ofs->layers = layers;
/*
* Layer 0 is reserved for upper even if there's no upper.
- * For consistency, config.lowerdirs[0] is NULL.
+	 * config.lowerdirs[0] stores the user-provided colon-separated
+	 * lowerdir string.
*/
+ ofs->config.lowerdirs[0] = ctx->lowerdir_all;
+ ctx->lowerdir_all = NULL;
ofs->numlayer = 1;
sb->s_stack_depth = 0;
@@ -1485,11 +1442,18 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
sb->s_magic = OVERLAYFS_SUPER_MAGIC;
- sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers :
- ovl_trusted_xattr_handlers;
+ sb->s_xattr = ovl_xattr_handlers(ofs);
sb->s_fs_info = ofs;
+#ifdef CONFIG_FS_POSIX_ACL
sb->s_flags |= SB_POSIXACL;
- sb->s_iflags |= SB_I_SKIP_SYNC | SB_I_IMA_UNVERIFIABLE_SIGNATURE;
+#endif
+ sb->s_iflags |= SB_I_SKIP_SYNC;
+ /*
+ * Ensure that umask handling is done by the filesystems used
+	 * for the upper layer instead of overlayfs as that would
+ * lead to unexpected results.
+ */
+ sb->s_iflags |= SB_I_NOUMASK;
err = -ENOMEM;
root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe);
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 89e0d60d35b6..c3f020ca13a8 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -17,12 +17,38 @@
#include <linux/ratelimit.h>
#include "overlayfs.h"
+/* Get write access to upper mnt - may fail if upper sb was remounted ro */
+int ovl_get_write_access(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ return mnt_get_write_access(ovl_upper_mnt(ofs));
+}
+
+/* Get write access to upper sb - may block if upper sb is frozen */
+void ovl_start_write(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ sb_start_write(ovl_upper_mnt(ofs)->mnt_sb);
+}
+
int ovl_want_write(struct dentry *dentry)
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
return mnt_want_write(ovl_upper_mnt(ofs));
}
+void ovl_put_write_access(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ mnt_put_write_access(ovl_upper_mnt(ofs));
+}
+
+void ovl_end_write(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ sb_end_write(ovl_upper_mnt(ofs)->mnt_sb);
+}
+
void ovl_drop_write(struct dentry *dentry)
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
@@ -55,7 +81,7 @@ int ovl_can_decode_fh(struct super_block *sb)
if (!capable(CAP_DAC_READ_SEARCH))
return 0;
- if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry)
+ if (!exportfs_can_decode_fh(sb->s_export_op))
return 0;
return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN;
@@ -575,6 +601,16 @@ bool ovl_is_whiteout(struct dentry *dentry)
return inode && IS_WHITEOUT(inode);
}
+/*
+ * Use this over ovl_is_whiteout for upper and lower files, as it also
+ * handles overlay.whiteout xattr whiteout files.
+ */
+bool ovl_path_is_whiteout(struct ovl_fs *ofs, const struct path *path)
+{
+ return ovl_is_whiteout(path->dentry) ||
+ ovl_path_check_xwhiteout_xattr(ofs, path);
+}
+
struct file *ovl_path_open(const struct path *path, int flags)
{
struct inode *inode = d_inode(path->dentry);
@@ -644,22 +680,36 @@ bool ovl_already_copied_up(struct dentry *dentry, int flags)
return false;
}
+/*
+ * The copy up "transaction" keeps an elevated mnt write count on upper mnt,
+ * but leaves taking freeze protection on upper sb to lower level helpers.
+ */
int ovl_copy_up_start(struct dentry *dentry, int flags)
{
struct inode *inode = d_inode(dentry);
int err;
err = ovl_inode_lock_interruptible(inode);
- if (!err && ovl_already_copied_up_locked(dentry, flags)) {
+ if (err)
+ return err;
+
+ if (ovl_already_copied_up_locked(dentry, flags))
err = 1; /* Already copied up */
- ovl_inode_unlock(inode);
- }
+ else
+ err = ovl_get_write_access(dentry);
+ if (err)
+ goto out_unlock;
+
+ return 0;
+out_unlock:
+ ovl_inode_unlock(inode);
return err;
}
void ovl_copy_up_end(struct dentry *dentry)
{
+ ovl_put_write_access(dentry);
ovl_inode_unlock(d_inode(dentry));
}
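
A minimal sketch of how a caller is expected to pair the helpers above: ovl_copy_up_start() holds the elevated mnt write count, freeze protection is taken around the actual upper-fs work, and ovl_copy_up_end() drops both the write access and the inode lock. example_do_copy_up() is a made-up placeholder, and the real ovl_copy_up_one() is structured differently (freeze protection is taken by lower-level helpers, as the comment above notes):

/* Illustrative pairing only, not taken from this patch. */
static int example_copy_up(struct dentry *dentry, int flags)
{
	int err = ovl_copy_up_start(dentry, flags);

	/* > 0 means another task already copied this dentry up */
	if (err)
		return err > 0 ? 0 : err;

	ovl_start_write(dentry);		/* freeze protection on upper sb */
	err = example_do_copy_up(dentry);	/* hypothetical worker */
	ovl_end_write(dentry);

	ovl_copy_up_end(dentry);	/* drops write access and inode lock */
	return err;
}
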
@@ -676,6 +726,32 @@ bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path)
return false;
}
+bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path)
+{
+ struct dentry *dentry = path->dentry;
+ int res;
+
+	/* overlay.whiteout must be a zero-size regular file */
+ if (!d_is_reg(dentry) || i_size_read(d_inode(dentry)) != 0)
+ return false;
+
+ res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUT, NULL, 0);
+ return res >= 0;
+}
+
+bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path)
+{
+ struct dentry *dentry = path->dentry;
+ int res;
+
+	/* overlay.whiteouts must be a directory */
+ if (!d_is_dir(dentry))
+ return false;
+
+ res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUTS, NULL, 0);
+ return res >= 0;
+}
+
/*
* Load persistent uuid from xattr into s_uuid if found, or store a new
* random generated value in s_uuid and in xattr.
@@ -760,6 +836,8 @@ bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
#define OVL_XATTR_UUID_POSTFIX "uuid"
#define OVL_XATTR_METACOPY_POSTFIX "metacopy"
#define OVL_XATTR_PROTATTR_POSTFIX "protattr"
+#define OVL_XATTR_XWHITEOUT_POSTFIX "whiteout"
+#define OVL_XATTR_XWHITEOUTS_POSTFIX "whiteouts"
#define OVL_XATTR_TAB_ENTRY(x) \
[x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \
@@ -775,6 +853,8 @@ const char *const ovl_xattr_table[][2] = {
OVL_XATTR_TAB_ENTRY(OVL_XATTR_UUID),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUT),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUTS),
};
int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry,
@@ -898,7 +978,7 @@ int ovl_set_protattr(struct inode *inode, struct dentry *upper,
return 0;
}
-/**
+/*
* Caller must hold a reference to inode to prevent it from being freed while
* it is marked inuse.
*/
@@ -973,12 +1053,18 @@ static void ovl_cleanup_index(struct dentry *dentry)
struct dentry *index = NULL;
struct inode *inode;
struct qstr name = { };
+ bool got_write = false;
int err;
err = ovl_get_index_name(ofs, lowerdentry, &name);
if (err)
goto fail;
+ err = ovl_want_write(dentry);
+ if (err)
+ goto fail;
+
+ got_write = true;
inode = d_inode(upperdentry);
if (!S_ISDIR(inode->i_mode) && inode->i_nlink != 1) {
pr_warn_ratelimited("cleanup linked index (%pd2, ino=%lu, nlink=%u)\n",
@@ -1016,6 +1102,8 @@ static void ovl_cleanup_index(struct dentry *dentry)
goto fail;
out:
+ if (got_write)
+ ovl_drop_write(dentry);
kfree(name.name);
dput(index);
return;
@@ -1062,8 +1150,12 @@ int ovl_nlink_start(struct dentry *dentry)
if (err)
return err;
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out_unlock;
+
if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, inode))
- goto out;
+ return 0;
old_cred = ovl_override_creds(dentry->d_sb);
/*
@@ -1074,10 +1166,15 @@ int ovl_nlink_start(struct dentry *dentry)
*/
err = ovl_set_nlink_upper(dentry);
revert_creds(old_cred);
-
-out:
if (err)
- ovl_inode_unlock(inode);
+ goto out_drop_write;
+
+ return 0;
+
+out_drop_write:
+ ovl_drop_write(dentry);
+out_unlock:
+ ovl_inode_unlock(inode);
return err;
}
@@ -1086,6 +1183,8 @@ void ovl_nlink_end(struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
+ ovl_drop_write(dentry);
+
if (ovl_test_flag(OVL_INDEX, inode) && inode->i_nlink == 0) {
const struct cred *old_cred;
@@ -1403,14 +1502,16 @@ void ovl_copyattr(struct inode *inode)
realinode = ovl_i_path_real(inode, &realpath);
real_idmap = mnt_idmap(realpath.mnt);
+ spin_lock(&inode->i_lock);
vfsuid = i_uid_into_vfsuid(real_idmap, realinode);
vfsgid = i_gid_into_vfsgid(real_idmap, realinode);
inode->i_uid = vfsuid_into_kuid(vfsuid);
inode->i_gid = vfsgid_into_kgid(vfsgid);
inode->i_mode = realinode->i_mode;
- inode->i_atime = realinode->i_atime;
- inode->i_mtime = realinode->i_mtime;
+ inode_set_atime_to_ts(inode, inode_get_atime(realinode));
+ inode_set_mtime_to_ts(inode, inode_get_mtime(realinode));
inode_set_ctime_to_ts(inode, inode_get_ctime(realinode));
i_size_write(inode, i_size_read(realinode));
+ spin_unlock(&inode->i_lock);
}
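
For illustration only: given the checks above, a lower-layer entry can be prepared offline to act as an "xwhiteout" by making it a zero-size regular file carrying the overlay.whiteout xattr, inside a directory carrying overlay.whiteouts so readdir knows to look. Paths are made up; trusted.* xattrs require CAP_SYS_ADMIN (userxattr mounts use the user.overlay.* names instead). A minimal userspace sketch, not part of this series:

#define _GNU_SOURCE
#include <sys/xattr.h>
#include <fcntl.h>
#include <unistd.h>
#include <err.h>

int main(void)
{
	/* zero-size regular file, as ovl_path_check_xwhiteout_xattr() requires */
	int fd = open("lower1/dir/foo", O_CREAT | O_EXCL | O_WRONLY, 0644);
	if (fd < 0)
		err(1, "open");
	close(fd);

	/* only the presence of the xattr matters, the value is not inspected */
	if (setxattr("lower1/dir/foo", "trusted.overlay.whiteout", "y", 1, 0))
		err(1, "setxattr whiteout");
	/* mark the directory so readdir defers the per-entry xwhiteout check */
	if (setxattr("lower1/dir", "trusted.overlay.whiteouts", "y", 1, 0))
		err(1, "setxattr whiteouts");
	return 0;
}
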
diff --git a/fs/overlayfs/xattrs.c b/fs/overlayfs/xattrs.c
new file mode 100644
index 000000000000..383978e4663c
--- /dev/null
+++ b/fs/overlayfs/xattrs.c
@@ -0,0 +1,271 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/fs.h>
+#include <linux/xattr.h>
+#include "overlayfs.h"
+
+static bool ovl_is_escaped_xattr(struct super_block *sb, const char *name)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ if (ofs->config.userxattr)
+ return strncmp(name, OVL_XATTR_ESCAPE_USER_PREFIX,
+ OVL_XATTR_ESCAPE_USER_PREFIX_LEN) == 0;
+ else
+ return strncmp(name, OVL_XATTR_ESCAPE_TRUSTED_PREFIX,
+ OVL_XATTR_ESCAPE_TRUSTED_PREFIX_LEN - 1) == 0;
+}
+
+static bool ovl_is_own_xattr(struct super_block *sb, const char *name)
+{
+ struct ovl_fs *ofs = OVL_FS(sb);
+
+ if (ofs->config.userxattr)
+ return strncmp(name, OVL_XATTR_USER_PREFIX,
+ OVL_XATTR_USER_PREFIX_LEN) == 0;
+ else
+ return strncmp(name, OVL_XATTR_TRUSTED_PREFIX,
+ OVL_XATTR_TRUSTED_PREFIX_LEN) == 0;
+}
+
+bool ovl_is_private_xattr(struct super_block *sb, const char *name)
+{
+ return ovl_is_own_xattr(sb, name) && !ovl_is_escaped_xattr(sb, name);
+}
+
+static int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ int err;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ struct dentry *upperdentry = ovl_i_dentry_upper(inode);
+ struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
+ struct path realpath;
+ const struct cred *old_cred;
+
+ if (!value && !upperdentry) {
+ ovl_path_lower(dentry, &realpath);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0);
+ revert_creds(old_cred);
+ if (err < 0)
+ goto out;
+ }
+
+ if (!upperdentry) {
+ err = ovl_copy_up(dentry);
+ if (err)
+ goto out;
+
+ realdentry = ovl_dentry_upper(dentry);
+ }
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ if (value) {
+ err = ovl_do_setxattr(ofs, realdentry, name, value, size,
+ flags);
+ } else {
+ WARN_ON(flags != XATTR_REPLACE);
+ err = ovl_do_removexattr(ofs, realdentry, name);
+ }
+ revert_creds(old_cred);
+ ovl_drop_write(dentry);
+
+ /* copy c/mtime */
+ ovl_copyattr(inode);
+out:
+ return err;
+}
+
+static int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
+ void *value, size_t size)
+{
+ ssize_t res;
+ const struct cred *old_cred;
+ struct path realpath;
+
+ ovl_i_path_real(inode, &realpath);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size);
+ revert_creds(old_cred);
+ return res;
+}
+
+static bool ovl_can_list(struct super_block *sb, const char *s)
+{
+ /* Never list private (.overlay) */
+ if (ovl_is_private_xattr(sb, s))
+ return false;
+
+ /* List all non-trusted xattrs */
+ if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0)
+ return true;
+
+ /* list other trusted for superuser only */
+ return ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
+}
+
+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+ struct dentry *realdentry = ovl_dentry_real(dentry);
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ ssize_t res;
+ size_t len;
+ char *s;
+ const struct cred *old_cred;
+ size_t prefix_len, name_len;
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ res = vfs_listxattr(realdentry, list, size);
+ revert_creds(old_cred);
+ if (res <= 0 || size == 0)
+ return res;
+
+ prefix_len = ofs->config.userxattr ?
+ OVL_XATTR_USER_PREFIX_LEN : OVL_XATTR_TRUSTED_PREFIX_LEN;
+
+ /* filter out private xattrs */
+ for (s = list, len = res; len;) {
+ size_t slen = strnlen(s, len) + 1;
+
+		/* underlying fs providing us with a broken xattr list? */
+ if (WARN_ON(slen > len))
+ return -EIO;
+
+ len -= slen;
+ if (!ovl_can_list(dentry->d_sb, s)) {
+ res -= slen;
+ memmove(s, s + slen, len);
+ } else if (ovl_is_escaped_xattr(dentry->d_sb, s)) {
+ res -= OVL_XATTR_ESCAPE_PREFIX_LEN;
+ name_len = slen - prefix_len - OVL_XATTR_ESCAPE_PREFIX_LEN;
+ s += prefix_len;
+ memmove(s, s + OVL_XATTR_ESCAPE_PREFIX_LEN, name_len + len);
+ s += name_len;
+ } else {
+ s += slen;
+ }
+ }
+
+ return res;
+}
+
+static char *ovl_xattr_escape_name(const char *prefix, const char *name)
+{
+ size_t prefix_len = strlen(prefix);
+ size_t name_len = strlen(name);
+ size_t escaped_len;
+ char *escaped, *s;
+
+ escaped_len = prefix_len + OVL_XATTR_ESCAPE_PREFIX_LEN + name_len;
+ if (escaped_len > XATTR_NAME_MAX)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ escaped = kmalloc(escaped_len + 1, GFP_KERNEL);
+ if (escaped == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ s = escaped;
+ memcpy(s, prefix, prefix_len);
+ s += prefix_len;
+ memcpy(s, OVL_XATTR_ESCAPE_PREFIX, OVL_XATTR_ESCAPE_PREFIX_LEN);
+ s += OVL_XATTR_ESCAPE_PREFIX_LEN;
+ memcpy(s, name, name_len + 1);
+
+ return escaped;
+}
+
+static int ovl_own_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ char *escaped;
+ int r;
+
+ escaped = ovl_xattr_escape_name(handler->prefix, name);
+ if (IS_ERR(escaped))
+ return PTR_ERR(escaped);
+
+ r = ovl_xattr_get(dentry, inode, escaped, buffer, size);
+
+ kfree(escaped);
+
+ return r;
+}
+
+static int ovl_own_xattr_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ char *escaped;
+ int r;
+
+ escaped = ovl_xattr_escape_name(handler->prefix, name);
+ if (IS_ERR(escaped))
+ return PTR_ERR(escaped);
+
+ r = ovl_xattr_set(dentry, inode, escaped, value, size, flags);
+
+ kfree(escaped);
+
+ return r;
+}
+
+static int ovl_other_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ return ovl_xattr_get(dentry, inode, name, buffer, size);
+}
+
+static int ovl_other_xattr_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ return ovl_xattr_set(dentry, inode, name, value, size, flags);
+}
+
+static const struct xattr_handler ovl_own_trusted_xattr_handler = {
+ .prefix = OVL_XATTR_TRUSTED_PREFIX,
+ .get = ovl_own_xattr_get,
+ .set = ovl_own_xattr_set,
+};
+
+static const struct xattr_handler ovl_own_user_xattr_handler = {
+ .prefix = OVL_XATTR_USER_PREFIX,
+ .get = ovl_own_xattr_get,
+ .set = ovl_own_xattr_set,
+};
+
+static const struct xattr_handler ovl_other_xattr_handler = {
+ .prefix = "", /* catch all */
+ .get = ovl_other_xattr_get,
+ .set = ovl_other_xattr_set,
+};
+
+static const struct xattr_handler * const ovl_trusted_xattr_handlers[] = {
+ &ovl_own_trusted_xattr_handler,
+ &ovl_other_xattr_handler,
+ NULL
+};
+
+static const struct xattr_handler * const ovl_user_xattr_handlers[] = {
+ &ovl_own_user_xattr_handler,
+ &ovl_other_xattr_handler,
+ NULL
+};
+
+const struct xattr_handler * const *ovl_xattr_handlers(struct ovl_fs *ofs)
+{
+ return ofs->config.userxattr ? ovl_user_xattr_handlers :
+ ovl_trusted_xattr_handlers;
+}
+
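
A standalone userspace sketch of the name mapping that ovl_xattr_escape_name() implements: the kernel xattr handler framework strips the matched prefix, so a request for "trusted.overlay.foo" reaches ovl_own_xattr_get/set as prefix "trusted.overlay." plus name "foo", and is stored on the real layer under the escaped name. This demo only mirrors the string logic and is not taken from the patch:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define OVL_ESCAPE "overlay."

/* mirror of ovl_xattr_escape_name(): insert "overlay." after the prefix */
static char *escape_name(const char *prefix, const char *name)
{
	size_t len = strlen(prefix) + strlen(OVL_ESCAPE) + strlen(name);
	char *out = malloc(len + 1);

	if (!out)
		return NULL;
	sprintf(out, "%s%s%s", prefix, OVL_ESCAPE, name);
	return out;
}

int main(void)
{
	char *stored = escape_name("trusted.overlay.", "foo");

	/* prints: trusted.overlay.foo -> trusted.overlay.overlay.foo */
	printf("trusted.overlay.foo -> %s\n", stored ? stored : "(oom)");
	free(stored);
	return 0;
}

ovl_listxattr() above performs the inverse step, dropping the "overlay." escape from listed names so the overlay user again sees "trusted.overlay.foo".
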
diff --git a/fs/pipe.c b/fs/pipe.c
index 139190165a1c..804a7d789452 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -227,6 +227,36 @@ static inline bool pipe_readable(const struct pipe_inode_info *pipe)
return !pipe_empty(head, tail) || !writers;
}
+static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf,
+ unsigned int tail)
+{
+ pipe_buf_release(pipe, buf);
+
+ /*
+	 * If the pipe has a watch_queue, we need the additional protection
+	 * of the spinlock, because notifications are posted under this
+	 * spinlock alone, without holding the pipe mutex.
+ */
+ if (pipe_has_watch_queue(pipe)) {
+ spin_lock_irq(&pipe->rd_wait.lock);
+#ifdef CONFIG_WATCH_QUEUE
+ if (buf->flags & PIPE_BUF_FLAG_LOSS)
+ pipe->note_loss = true;
+#endif
+ pipe->tail = ++tail;
+ spin_unlock_irq(&pipe->rd_wait.lock);
+ return tail;
+ }
+
+ /*
+ * Without a watch_queue, we can simply increment the tail
+ * without the spinlock - the mutex is enough.
+ */
+ pipe->tail = ++tail;
+ return tail;
+}
+
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
@@ -320,17 +350,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
buf->len = 0;
}
- if (!buf->len) {
- pipe_buf_release(pipe, buf);
- spin_lock_irq(&pipe->rd_wait.lock);
-#ifdef CONFIG_WATCH_QUEUE
- if (buf->flags & PIPE_BUF_FLAG_LOSS)
- pipe->note_loss = true;
-#endif
- tail++;
- pipe->tail = tail;
- spin_unlock_irq(&pipe->rd_wait.lock);
- }
+ if (!buf->len)
+ tail = pipe_update_tail(pipe, buf, tail);
total_len -= chars;
if (!total_len)
break; /* common path: read succeeded */
@@ -437,12 +458,10 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
-#ifdef CONFIG_WATCH_QUEUE
- if (pipe->watch_queue) {
+ if (pipe_has_watch_queue(pipe)) {
ret = -EXDEV;
goto out;
}
-#endif
/*
* If it wasn't empty we try to merge new data into
@@ -507,16 +526,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
* it, either the reader will consume it or it'll still
* be there for the next write.
*/
- spin_lock_irq(&pipe->rd_wait.lock);
-
- head = pipe->head;
- if (pipe_full(head, pipe->tail, pipe->max_usage)) {
- spin_unlock_irq(&pipe->rd_wait.lock);
- continue;
- }
-
pipe->head = head + 1;
- spin_unlock_irq(&pipe->rd_wait.lock);
/* Insert it into the buffer array */
buf = &pipe->bufs[head & mask];
@@ -854,7 +864,7 @@ void free_pipe_info(struct pipe_inode_info *pipe)
kfree(pipe);
}
-static struct vfsmount *pipe_mnt __read_mostly;
+static struct vfsmount *pipe_mnt __ro_after_init;
/*
* pipefs_dname() is called from d_path().
@@ -898,7 +908,7 @@ static struct inode * get_pipe_inode(void)
inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
return inode;
@@ -1324,10 +1334,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg)
unsigned int nr_slots, size;
long ret = 0;
-#ifdef CONFIG_WATCH_QUEUE
- if (pipe->watch_queue)
+ if (pipe_has_watch_queue(pipe))
return -EBUSY;
-#endif
size = round_pipe_size(arg);
nr_slots = size >> PAGE_SHIFT;
@@ -1379,10 +1387,8 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
if (file->f_op != &pipefifo_fops || !pipe)
return NULL;
-#ifdef CONFIG_WATCH_QUEUE
- if (for_splice && pipe->watch_queue)
+ if (for_splice && pipe_has_watch_queue(pipe))
return NULL;
-#endif
return pipe;
}
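
pipe_has_watch_queue() itself is added in include/linux/pipe_fs_i.h and is not visible in this hunk; presumably it just folds the CONFIG_WATCH_QUEUE ifdef that the removed lines carried into one place, along these lines:

/* Sketch of the helper used above; the real definition is not in this diff. */
static inline bool pipe_has_watch_queue(const struct pipe_inode_info *pipe)
{
#ifdef CONFIG_WATCH_QUEUE
	return pipe->watch_queue != NULL;
#else
	return false;
#endif
}
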
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 2c2efbe685d8..ff08a8957552 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -536,12 +536,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
/* add up live thread stats at the group level */
if (whole) {
- struct task_struct *t = task;
- do {
+ struct task_struct *t;
+
+ __for_each_thread(sig, t) {
min_flt += t->min_flt;
maj_flt += t->maj_flt;
gtime += task_gtime(t);
- } while_each_thread(task, t);
+ }
min_flt += sig->min_flt;
maj_flt += sig->maj_flt;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ffd54617c354..dd31e3b6bf77 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1153,11 +1153,10 @@ err_unlock:
static ssize_t oom_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int oom_adj;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count)) {
@@ -1213,11 +1212,10 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int oom_score_adj;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count)) {
@@ -1358,13 +1356,13 @@ static ssize_t proc_fault_inject_write(struct file * file,
const char __user * buf, size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int make_it_fail;
int rv;
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- memset(buffer, 0, sizeof(buffer));
+
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
@@ -1509,11 +1507,10 @@ sched_autogroup_write(struct file *file, const char __user *buf,
{
struct inode *inode = file_inode(file);
struct task_struct *p;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
int nice;
int err;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
@@ -1666,10 +1663,9 @@ static ssize_t comm_write(struct file *file, const char __user *buf,
{
struct inode *inode = file_inode(file);
struct task_struct *p;
- char buffer[TASK_COMM_LEN];
+ char buffer[TASK_COMM_LEN] = {};
const size_t maxlen = sizeof(buffer) - 1;
- memset(buffer, 0, sizeof(buffer));
if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
return -EFAULT;
@@ -1902,7 +1898,7 @@ struct inode *proc_pid_make_inode(struct super_block *sb,
ei = PROC_I(inode);
inode->i_mode = mode;
inode->i_ino = get_next_ino();
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_op = &proc_def_inode_operations;
/*
@@ -2218,7 +2214,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
rc = -ENOENT;
vma = find_exact_vma(mm, vm_start, vm_end);
if (vma && vma->vm_file) {
- *path = vma->vm_file->f_path;
+ *path = *file_user_path(vma->vm_file);
path_get(path);
rc = 0;
}
@@ -2976,8 +2972,7 @@ static const struct file_operations proc_coredump_filter_operations = {
#ifdef CONFIG_TASK_IO_ACCOUNTING
static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
{
- struct task_io_accounting acct = task->ioac;
- unsigned long flags;
+ struct task_io_accounting acct;
int result;
result = down_read_killable(&task->signal->exec_update_lock);
@@ -2989,15 +2984,28 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
goto out_unlock;
}
- if (whole && lock_task_sighand(task, &flags)) {
- struct task_struct *t = task;
+ if (whole) {
+ struct signal_struct *sig = task->signal;
+ struct task_struct *t;
+ unsigned int seq = 1;
+ unsigned long flags;
+
+ rcu_read_lock();
+ do {
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
+ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
- task_io_accounting_add(&acct, &task->signal->ioac);
- while_each_thread(task, t)
- task_io_accounting_add(&acct, &t->ioac);
+ acct = sig->ioac;
+ __for_each_thread(sig, t)
+ task_io_accounting_add(&acct, &t->ioac);
- unlock_task_sighand(task, &flags);
+ } while (need_seqretry(&sig->stats_lock, seq));
+ done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
+ rcu_read_unlock();
+ } else {
+ acct = task->ioac;
}
+
seq_printf(m,
"rchar: %llu\n"
"wchar: %llu\n"
@@ -3818,7 +3826,7 @@ static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
for_each_thread(task, pos) {
if (!nr--)
goto found;
- };
+ }
fail:
pos = NULL;
goto out;
@@ -3840,10 +3848,8 @@ static struct task_struct *next_tid(struct task_struct *start)
struct task_struct *pos = NULL;
rcu_read_lock();
if (pid_alive(start)) {
- pos = next_thread(start);
- if (thread_group_leader(pos))
- pos = NULL;
- else
+ pos = __next_thread(start);
+ if (pos)
get_task_struct(pos);
}
rcu_read_unlock();
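The do_io_accounting() hunk above uses the lockless-first seqlock reader pattern around sig->stats_lock. A minimal sketch of that pattern, assuming only the stock <linux/seqlock.h> helpers and existing signal_struct fields; this function is illustrative and not part of the patch:

/*
 * Sketch: seq starts odd; the first pass bumps it to an even value and
 * reads without the lock. If a writer raced, need_seqretry() asks for a
 * retry, and the second pass (odd seq) takes stats_lock so the loop
 * cannot spin forever.
 */
static unsigned long read_group_min_flt(struct signal_struct *sig)
{
	unsigned long flags, flt;
	int seq = 1;

	do {
		seq++;	/* 2 => lockless first pass, odd => locked retry */
		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
		flt = sig->min_flt;	/* any stats_lock-protected field */
	} while (need_seqretry(&sig->stats_lock, seq));
	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);

	return flt;
}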
diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
index 2e244ada1f97..902b326e1e56 100644
--- a/fs/proc/bootconfig.c
+++ b/fs/proc/bootconfig.c
@@ -62,6 +62,12 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size)
break;
dst += ret;
}
+ if (ret >= 0 && boot_command_line[0]) {
+ ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n",
+ boot_command_line);
+ if (ret > 0)
+ dst += ret;
+ }
}
out:
kfree(key);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6276b3938842..6e72e5ad42bc 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -113,10 +113,12 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
struct file *file;
rcu_read_lock();
- file = task_lookup_fd_rcu(task, fd);
- if (file)
- *mode = file->f_mode;
+ file = task_lookup_fdget_rcu(task, fd);
rcu_read_unlock();
+ if (file) {
+ *mode = file->f_mode;
+ fput(file);
+ }
return !!file;
}
@@ -259,12 +261,13 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
char name[10 + 1];
unsigned int len;
- f = task_lookup_next_fd_rcu(p, &fd);
+ f = task_lookup_next_fdget_rcu(p, &fd);
ctx->pos = fd + 2LL;
if (!f)
break;
data.mode = f->f_mode;
rcu_read_unlock();
+ fput(f);
data.fd = fd;
len = snprintf(name, sizeof(name), "%u", fd);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 532dc9d240f7..b33e490e3fd9 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -110,18 +110,15 @@ void __init proc_init_kmemcache(void)
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
{
- struct inode *inode;
- struct proc_inode *ei;
struct hlist_node *node;
struct super_block *old_sb = NULL;
rcu_read_lock();
- for (;;) {
+ while ((node = hlist_first_rcu(inodes))) {
+ struct proc_inode *ei = hlist_entry(node, struct proc_inode, sibling_inodes);
struct super_block *sb;
- node = hlist_first_rcu(inodes);
- if (!node)
- break;
- ei = hlist_entry(node, struct proc_inode, sibling_inodes);
+ struct inode *inode;
+
spin_lock(lock);
hlist_del_init_rcu(&ei->sibling_inodes);
spin_unlock(lock);
@@ -660,7 +657,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
inode->i_private = de->data;
inode->i_ino = de->low_ino;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
PROC_I(inode)->pde = de;
if (is_empty_pde(de)) {
make_empty_dir_inode(inode);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 23fc24d16b31..6422e569b080 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -546,7 +546,8 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
* and explicitly excluded physical ranges.
*/
if (!page || PageOffline(page) ||
- is_page_hwpoison(page) || !pfn_is_ram(pfn)) {
+ is_page_hwpoison(page) || !pfn_is_ram(pfn) ||
+ pfn_is_unaccepted_memory(pfn)) {
if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 4d3493579458..c6e7ebc63756 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -58,7 +58,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
if (file) {
seq_pad(m, ' ');
- seq_file_path(m, file, "");
+ seq_path(m, file_user_path(file), "");
}
seq_putc(m, '\n');
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index c88854df0b62..8064ea76f80b 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -465,7 +465,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
head->count++;
spin_unlock(&sysctl_lock);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = table->mode;
if (!S_ISDIR(table->mode)) {
inode->i_mode |= S_IFREG;
@@ -1576,7 +1576,6 @@ static const struct sysctl_alias sysctl_aliases[] = {
{"hung_task_panic", "kernel.hung_task_panic" },
{"numa_zonelist_order", "vm.numa_zonelist_order" },
{"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" },
- {"softlockup_panic", "kernel.softlockup_panic" },
{ }
};
@@ -1592,6 +1591,13 @@ static const char *sysctl_find_alias(char *param)
return NULL;
}
+bool sysctl_is_alias(char *param)
+{
+ const char *alias = sysctl_find_alias(param);
+
+ return alias != NULL;
+}
+
/* Set sysctl value passed on kernel command line. */
static int process_sysctl_arg(char *param, char *val,
const char *unused, void *arg)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 9191248f2dac..b55dbc70287b 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -188,7 +188,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
/* procfs dentries and inodes don't require IO to create */
- s->s_shrink.seeks = 0;
+ s->s_shrink->seeks = 0;
pde_get(&proc_root);
root_inode = proc_get_inode(s, &proc_root);
diff --git a/fs/proc/self.c b/fs/proc/self.c
index ecc4da8d265e..b46fbfd22681 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -46,7 +46,7 @@ int proc_setup_self(struct super_block *s)
struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = self_inum;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3dd5be96691b..435b61054b5b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -20,6 +20,8 @@
#include <linux/shmem_fs.h>
#include <linux/uaccess.h>
#include <linux/pkeys.h>
+#include <linux/minmax.h>
+#include <linux/overflow.h>
#include <asm/elf.h>
#include <asm/tlb.h>
@@ -296,7 +298,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
if (anon_name)
seq_printf(m, "[anon_shmem:%s]", anon_name->name);
else
- seq_file_path(m, file, "\n");
+ seq_path(m, file_user_path(file), "\n");
goto done;
}
@@ -849,9 +851,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
static int show_smap(struct seq_file *m, void *v)
{
struct vm_area_struct *vma = v;
- struct mem_size_stats mss;
-
- memset(&mss, 0, sizeof(mss));
+ struct mem_size_stats mss = {};
smap_gather_stats(vma, &mss, 0);
@@ -877,7 +877,7 @@ static int show_smap(struct seq_file *m, void *v)
static int show_smaps_rollup(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
- struct mem_size_stats mss;
+ struct mem_size_stats mss = {};
struct mm_struct *mm = priv->mm;
struct vm_area_struct *vma;
unsigned long vma_start = 0, last_vma_end = 0;
@@ -893,8 +893,6 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
goto out_put_task;
}
- memset(&mss, 0, sizeof(mss));
-
ret = mmap_read_lock_killable(mm);
if (ret)
goto out_put_mm;
@@ -1246,14 +1244,13 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF];
+ char buffer[PROC_NUMBUF] = {};
struct mm_struct *mm;
struct vm_area_struct *vma;
enum clear_refs_types type;
int itype;
int rv;
- memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
@@ -1761,11 +1758,753 @@ static int pagemap_release(struct inode *inode, struct file *file)
return 0;
}
+#define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \
+ PAGE_IS_FILE | PAGE_IS_PRESENT | \
+ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \
+ PAGE_IS_HUGE)
+#define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
+
+struct pagemap_scan_private {
+ struct pm_scan_arg arg;
+ unsigned long masks_of_interest, cur_vma_category;
+ struct page_region *vec_buf;
+ unsigned long vec_buf_len, vec_buf_index, found_pages;
+ struct page_region __user *vec_out;
+};
+
+static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
+ struct vm_area_struct *vma,
+ unsigned long addr, pte_t pte)
+{
+ unsigned long categories = 0;
+
+ if (pte_present(pte)) {
+ struct page *page;
+
+ categories |= PAGE_IS_PRESENT;
+ if (!pte_uffd_wp(pte))
+ categories |= PAGE_IS_WRITTEN;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ page = vm_normal_page(vma, addr, pte);
+ if (page && !PageAnon(page))
+ categories |= PAGE_IS_FILE;
+ }
+
+ if (is_zero_pfn(pte_pfn(pte)))
+ categories |= PAGE_IS_PFNZERO;
+ } else if (is_swap_pte(pte)) {
+ swp_entry_t swp;
+
+ categories |= PAGE_IS_SWAPPED;
+ if (!pte_swp_uffd_wp_any(pte))
+ categories |= PAGE_IS_WRITTEN;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ swp = pte_to_swp_entry(pte);
+ if (is_pfn_swap_entry(swp) &&
+ !PageAnon(pfn_swap_entry_to_page(swp)))
+ categories |= PAGE_IS_FILE;
+ }
+ }
+
+ return categories;
+}
+
+static void make_uffd_wp_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte)
+{
+ pte_t ptent = ptep_get(pte);
+
+ if (pte_present(ptent)) {
+ pte_t old_pte;
+
+ old_pte = ptep_modify_prot_start(vma, addr, pte);
+ ptent = pte_mkuffd_wp(ptent);
+ ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
+ } else if (is_swap_pte(ptent)) {
+ ptent = pte_swp_mkuffd_wp(ptent);
+ set_pte_at(vma->vm_mm, addr, pte, ptent);
+ } else {
+ set_pte_at(vma->vm_mm, addr, pte,
+ make_pte_marker(PTE_MARKER_UFFD_WP));
+ }
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
+ struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd)
+{
+ unsigned long categories = PAGE_IS_HUGE;
+
+ if (pmd_present(pmd)) {
+ struct page *page;
+
+ categories |= PAGE_IS_PRESENT;
+ if (!pmd_uffd_wp(pmd))
+ categories |= PAGE_IS_WRITTEN;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ page = vm_normal_page_pmd(vma, addr, pmd);
+ if (page && !PageAnon(page))
+ categories |= PAGE_IS_FILE;
+ }
+
+ if (is_zero_pfn(pmd_pfn(pmd)))
+ categories |= PAGE_IS_PFNZERO;
+ } else if (is_swap_pmd(pmd)) {
+ swp_entry_t swp;
+
+ categories |= PAGE_IS_SWAPPED;
+ if (!pmd_swp_uffd_wp(pmd))
+ categories |= PAGE_IS_WRITTEN;
+
+ if (p->masks_of_interest & PAGE_IS_FILE) {
+ swp = pmd_to_swp_entry(pmd);
+ if (is_pfn_swap_entry(swp) &&
+ !PageAnon(pfn_swap_entry_to_page(swp)))
+ categories |= PAGE_IS_FILE;
+ }
+ }
+
+ return categories;
+}
+
+static void make_uffd_wp_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t old, pmd = *pmdp;
+
+ if (pmd_present(pmd)) {
+ old = pmdp_invalidate_ad(vma, addr, pmdp);
+ pmd = pmd_mkuffd_wp(old);
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ } else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
+ pmd = pmd_swp_mkuffd_wp(pmd);
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ }
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_HUGETLB_PAGE
+static unsigned long pagemap_hugetlb_category(pte_t pte)
+{
+ unsigned long categories = PAGE_IS_HUGE;
+
+ /*
+	 * According to pagemap_hugetlb_range(), a file-backed HugeTLB
+	 * page cannot be swapped, so PAGE_IS_FILE is not checked for
+ * swapped pages.
+ */
+ if (pte_present(pte)) {
+ categories |= PAGE_IS_PRESENT;
+ if (!huge_pte_uffd_wp(pte))
+ categories |= PAGE_IS_WRITTEN;
+ if (!PageAnon(pte_page(pte)))
+ categories |= PAGE_IS_FILE;
+ if (is_zero_pfn(pte_pfn(pte)))
+ categories |= PAGE_IS_PFNZERO;
+ } else if (is_swap_pte(pte)) {
+ categories |= PAGE_IS_SWAPPED;
+ if (!pte_swp_uffd_wp_any(pte))
+ categories |= PAGE_IS_WRITTEN;
+ }
+
+ return categories;
+}
+
+static void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t ptent)
+{
+ unsigned long psize;
+
+ if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent))
+ return;
+
+ psize = huge_page_size(hstate_vma(vma));
+
+ if (is_hugetlb_entry_migration(ptent))
+ set_huge_pte_at(vma->vm_mm, addr, ptep,
+ pte_swp_mkuffd_wp(ptent), psize);
+ else if (!huge_pte_none(ptent))
+ huge_ptep_modify_prot_commit(vma, addr, ptep, ptent,
+ huge_pte_mkuffd_wp(ptent));
+ else
+ set_huge_pte_at(vma->vm_mm, addr, ptep,
+ make_pte_marker(PTE_MARKER_UFFD_WP), psize);
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
+ unsigned long addr, unsigned long end)
+{
+ struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
+
+ if (cur_buf->start != addr)
+ cur_buf->end = addr;
+ else
+ cur_buf->start = cur_buf->end = 0;
+
+ p->found_pages -= (end - addr) / PAGE_SIZE;
+}
+#endif
+
+static bool pagemap_scan_is_interesting_page(unsigned long categories,
+ const struct pagemap_scan_private *p)
+{
+ categories ^= p->arg.category_inverted;
+ if ((categories & p->arg.category_mask) != p->arg.category_mask)
+ return false;
+ if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask))
+ return false;
+
+ return true;
+}
+
+static bool pagemap_scan_is_interesting_vma(unsigned long categories,
+ const struct pagemap_scan_private *p)
+{
+ unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED;
+
+ categories ^= p->arg.category_inverted;
+ if ((categories & required) != required)
+ return false;
+
+ return true;
+}
+
+static int pagemap_scan_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long vma_category = 0;
+ bool wp_allowed = userfaultfd_wp_async(vma) &&
+ userfaultfd_wp_use_markers(vma);
+
+ if (!wp_allowed) {
+ /* User requested explicit failure over wp-async capability */
+ if (p->arg.flags & PM_SCAN_CHECK_WPASYNC)
+ return -EPERM;
+ /*
+ * User requires wr-protect, and allows silently skipping
+ * unsupported vmas.
+ */
+ if (p->arg.flags & PM_SCAN_WP_MATCHING)
+ return 1;
+ /*
+		 * Otherwise the request doesn't involve wr-protects at all;
+		 * fall through to the remaining checks and allow the vma walk.
+ */
+ }
+
+ if (vma->vm_flags & VM_PFNMAP)
+ return 1;
+
+ if (wp_allowed)
+ vma_category |= PAGE_IS_WPALLOWED;
+
+ if (!pagemap_scan_is_interesting_vma(vma_category, p))
+ return 1;
+
+ p->cur_vma_category = vma_category;
+
+ return 0;
+}
+
+static bool pagemap_scan_push_range(unsigned long categories,
+ struct pagemap_scan_private *p,
+ unsigned long addr, unsigned long end)
+{
+ struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
+
+ /*
+	 * When no output buffer is provided at all, the sentinel values won't
+	 * match here: `cur_buf->end` can only be non-zero once the current
+	 * entry actually holds a (non-empty) range.
+ */
+ if (addr == cur_buf->end && categories == cur_buf->categories) {
+ cur_buf->end = end;
+ return true;
+ }
+
+ if (cur_buf->end) {
+ if (p->vec_buf_index >= p->vec_buf_len - 1)
+ return false;
+
+ cur_buf = &p->vec_buf[++p->vec_buf_index];
+ }
+
+ cur_buf->start = addr;
+ cur_buf->end = end;
+ cur_buf->categories = categories;
+
+ return true;
+}
+
+static int pagemap_scan_output(unsigned long categories,
+ struct pagemap_scan_private *p,
+ unsigned long addr, unsigned long *end)
+{
+ unsigned long n_pages, total_pages;
+ int ret = 0;
+
+ if (!p->vec_buf)
+ return 0;
+
+ categories &= p->arg.return_mask;
+
+ n_pages = (*end - addr) / PAGE_SIZE;
+ if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
+ total_pages > p->arg.max_pages) {
+ size_t n_too_much = total_pages - p->arg.max_pages;
+ *end -= n_too_much * PAGE_SIZE;
+ n_pages -= n_too_much;
+ ret = -ENOSPC;
+ }
+
+ if (!pagemap_scan_push_range(categories, p, addr, *end)) {
+ *end = addr;
+ n_pages = 0;
+ ret = -ENOSPC;
+ }
+
+ p->found_pages += n_pages;
+ if (ret)
+ p->arg.walk_end = *end;
+
+ return ret;
+}
+
+static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
+ unsigned long end, struct mm_walk *walk)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long categories;
+ spinlock_t *ptl;
+ int ret = 0;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return -ENOENT;
+
+ categories = p->cur_vma_category |
+ pagemap_thp_category(p, vma, start, *pmd);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ goto out_unlock;
+
+ ret = pagemap_scan_output(categories, p, start, &end);
+ if (start == end)
+ goto out_unlock;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ goto out_unlock;
+ if (~categories & PAGE_IS_WRITTEN)
+ goto out_unlock;
+
+ /*
+ * Break huge page into small pages if the WP operation
+ * needs to be performed on a portion of the huge page.
+ */
+ if (end != start + HPAGE_SIZE) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, start);
+ pagemap_scan_backout_range(p, start, end);
+ /* Report as if there was no THP */
+ return -ENOENT;
+ }
+
+ make_uffd_wp_pmd(vma, start, pmd);
+ flush_tlb_range(vma, start, end);
+out_unlock:
+ spin_unlock(ptl);
+ return ret;
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+ return -ENOENT;
+#endif
+}
+
+static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long addr, flush_end = 0;
+ pte_t *pte, *start_pte;
+ spinlock_t *ptl;
+ int ret;
+
+ arch_enter_lazy_mmu_mode();
+
+ ret = pagemap_scan_thp_entry(pmd, start, end, walk);
+ if (ret != -ENOENT) {
+ arch_leave_lazy_mmu_mode();
+ return ret;
+ }
+
+ ret = 0;
+ start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+ if (!pte) {
+ arch_leave_lazy_mmu_mode();
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
+
+ if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
+ /* Fast path for performing exclusive WP */
+ for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
+ if (pte_uffd_wp(ptep_get(pte)))
+ continue;
+ make_uffd_wp_pte(vma, addr, pte);
+ if (!flush_end)
+ start = addr;
+ flush_end = addr + PAGE_SIZE;
+ }
+ goto flush_and_return;
+ }
+
+ if (!p->arg.category_anyof_mask && !p->arg.category_inverted &&
+ p->arg.category_mask == PAGE_IS_WRITTEN &&
+ p->arg.return_mask == PAGE_IS_WRITTEN) {
+ for (addr = start; addr < end; pte++, addr += PAGE_SIZE) {
+ unsigned long next = addr + PAGE_SIZE;
+
+ if (pte_uffd_wp(ptep_get(pte)))
+ continue;
+ ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN,
+ p, addr, &next);
+ if (next == addr)
+ break;
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ continue;
+ make_uffd_wp_pte(vma, addr, pte);
+ if (!flush_end)
+ start = addr;
+ flush_end = next;
+ }
+ goto flush_and_return;
+ }
+
+ for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
+ unsigned long categories = p->cur_vma_category |
+ pagemap_page_category(p, vma, addr, ptep_get(pte));
+ unsigned long next = addr + PAGE_SIZE;
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ continue;
+
+ ret = pagemap_scan_output(categories, p, addr, &next);
+ if (next == addr)
+ break;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ continue;
+ if (~categories & PAGE_IS_WRITTEN)
+ continue;
+
+ make_uffd_wp_pte(vma, addr, pte);
+ if (!flush_end)
+ start = addr;
+ flush_end = next;
+ }
+
+flush_and_return:
+ if (flush_end)
+ flush_tlb_range(vma, start, addr);
+
+ pte_unmap_unlock(start_pte, ptl);
+ arch_leave_lazy_mmu_mode();
+
+ cond_resched();
+ return ret;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
+ unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ unsigned long categories;
+ spinlock_t *ptl;
+ int ret = 0;
+ pte_t pte;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
+ /* Go the short route when not write-protecting pages. */
+
+ pte = huge_ptep_get(ptep);
+ categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ return 0;
+
+ return pagemap_scan_output(categories, p, start, &end);
+ }
+
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
+
+ pte = huge_ptep_get(ptep);
+ categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+
+ if (!pagemap_scan_is_interesting_page(categories, p))
+ goto out_unlock;
+
+ ret = pagemap_scan_output(categories, p, start, &end);
+ if (start == end)
+ goto out_unlock;
+
+ if (~categories & PAGE_IS_WRITTEN)
+ goto out_unlock;
+
+ if (end != start + HPAGE_SIZE) {
+ /* Partial HugeTLB page WP isn't possible. */
+ pagemap_scan_backout_range(p, start, end);
+ p->arg.walk_end = start;
+ ret = 0;
+ goto out_unlock;
+ }
+
+ make_uffd_wp_huge_pte(vma, start, ptep, pte);
+ flush_hugetlb_tlb_range(vma, start, end);
+
+out_unlock:
+ spin_unlock(ptl);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+
+ return ret;
+}
+#else
+#define pagemap_scan_hugetlb_entry NULL
+#endif
+
+static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
+ int depth, struct mm_walk *walk)
+{
+ struct pagemap_scan_private *p = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ int ret, err;
+
+ if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p))
+ return 0;
+
+ ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end);
+ if (addr == end)
+ return ret;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ return ret;
+
+ err = uffd_wp_range(vma, addr, end - addr, true);
+ if (err < 0)
+ ret = err;
+
+ return ret;
+}
+
+static const struct mm_walk_ops pagemap_scan_ops = {
+ .test_walk = pagemap_scan_test_walk,
+ .pmd_entry = pagemap_scan_pmd_entry,
+ .pte_hole = pagemap_scan_pte_hole,
+ .hugetlb_entry = pagemap_scan_hugetlb_entry,
+};
+
+static int pagemap_scan_get_args(struct pm_scan_arg *arg,
+ unsigned long uarg)
+{
+ if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
+ return -EFAULT;
+
+ if (arg->size != sizeof(struct pm_scan_arg))
+ return -EINVAL;
+
+ /* Validate requested features */
+ if (arg->flags & ~PM_SCAN_FLAGS)
+ return -EINVAL;
+ if ((arg->category_inverted | arg->category_mask |
+ arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES)
+ return -EINVAL;
+
+ arg->start = untagged_addr((unsigned long)arg->start);
+ arg->end = untagged_addr((unsigned long)arg->end);
+ arg->vec = untagged_addr((unsigned long)arg->vec);
+
+ /* Validate memory pointers */
+ if (!IS_ALIGNED(arg->start, PAGE_SIZE))
+ return -EINVAL;
+ if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start))
+ return -EFAULT;
+ if (!arg->vec && arg->vec_len)
+ return -EINVAL;
+ if (arg->vec && !access_ok((void __user *)(long)arg->vec,
+ arg->vec_len * sizeof(struct page_region)))
+ return -EFAULT;
+
+ /* Fixup default values */
+ arg->end = ALIGN(arg->end, PAGE_SIZE);
+ arg->walk_end = 0;
+ if (!arg->max_pages)
+ arg->max_pages = ULONG_MAX;
+
+ return 0;
+}
+
+static int pagemap_scan_writeback_args(struct pm_scan_arg *arg,
+ unsigned long uargl)
+{
+ struct pm_scan_arg __user *uarg = (void __user *)uargl;
+
+ if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
+{
+ if (!p->arg.vec_len)
+ return 0;
+
+ p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
+ p->arg.vec_len);
+ p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
+ GFP_KERNEL);
+ if (!p->vec_buf)
+ return -ENOMEM;
+
+ p->vec_buf->start = p->vec_buf->end = 0;
+ p->vec_out = (struct page_region __user *)(long)p->arg.vec;
+
+ return 0;
+}
+
+static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
+{
+ const struct page_region *buf = p->vec_buf;
+ long n = p->vec_buf_index;
+
+ if (!p->vec_buf)
+ return 0;
+
+ if (buf[n].end != buf[n].start)
+ n++;
+
+ if (!n)
+ return 0;
+
+ if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
+ return -EFAULT;
+
+ p->arg.vec_len -= n;
+ p->vec_out += n;
+
+ p->vec_buf_index = 0;
+ p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len);
+ p->vec_buf->start = p->vec_buf->end = 0;
+
+ return n;
+}
+
+static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
+{
+ struct mmu_notifier_range range;
+ struct pagemap_scan_private p = {0};
+ unsigned long walk_start;
+ size_t n_ranges_out = 0;
+ int ret;
+
+ ret = pagemap_scan_get_args(&p.arg, uarg);
+ if (ret)
+ return ret;
+
+ p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask |
+ p.arg.return_mask;
+ ret = pagemap_scan_init_bounce_buffer(&p);
+ if (ret)
+ return ret;
+
+ /* Protection change for the range is going to happen. */
+ if (p.arg.flags & PM_SCAN_WP_MATCHING) {
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
+ mm, p.arg.start, p.arg.end);
+ mmu_notifier_invalidate_range_start(&range);
+ }
+
+ for (walk_start = p.arg.start; walk_start < p.arg.end;
+ walk_start = p.arg.walk_end) {
+ long n_out;
+
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ break;
+ ret = walk_page_range(mm, walk_start, p.arg.end,
+ &pagemap_scan_ops, &p);
+ mmap_read_unlock(mm);
+
+ n_out = pagemap_scan_flush_buffer(&p);
+ if (n_out < 0)
+ ret = n_out;
+ else
+ n_ranges_out += n_out;
+
+ if (ret != -ENOSPC)
+ break;
+
+ if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages)
+ break;
+ }
+
+ /* ENOSPC signifies early stop (buffer full) from the walk. */
+ if (!ret || ret == -ENOSPC)
+ ret = n_ranges_out;
+
+ /* The walk_end isn't set when ret is zero */
+ if (!p.arg.walk_end)
+ p.arg.walk_end = p.arg.end;
+ if (pagemap_scan_writeback_args(&p.arg, uarg))
+ ret = -EFAULT;
+
+ if (p.arg.flags & PM_SCAN_WP_MATCHING)
+ mmu_notifier_invalidate_range_end(&range);
+
+ kfree(p.vec_buf);
+ return ret;
+}
+
+static long do_pagemap_cmd(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct mm_struct *mm = file->private_data;
+
+ switch (cmd) {
+ case PAGEMAP_SCAN:
+ return do_pagemap_scan(mm, arg);
+
+ default:
+ return -EINVAL;
+ }
+}
+
const struct file_operations proc_pagemap_operations = {
.llseek = mem_lseek, /* borrow this */
.read = pagemap_read,
.open = pagemap_open,
.release = pagemap_release,
+ .unlocked_ioctl = do_pagemap_cmd,
+ .compat_ioctl = do_pagemap_cmd,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */
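The ioctl wired up just above is driven from userspace via /proc/<pid>/pagemap. A hedged usage sketch, not part of this kernel patch, assuming the UAPI additions that accompany this series in <linux/fs.h> (struct pm_scan_arg, struct page_region, PAGE_IS_PRESENT, PAGEMAP_SCAN):

/*
 * List which pages of a private anonymous mapping are currently present.
 * Compile as ordinary userspace C against headers that carry the new
 * PAGEMAP_SCAN definitions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/fs.h>

int main(void)
{
	long psize = sysconf(_SC_PAGESIZE);
	char *buf = mmap(NULL, 16 * psize, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct page_region regions[16];
	struct pm_scan_arg arg = { .size = sizeof(arg) };
	int fd, n;

	if (buf == MAP_FAILED)
		return 1;
	buf[0] = 1;			/* fault in the first page */
	buf[5 * psize] = 1;		/* ...and the sixth */

	fd = open("/proc/self/pagemap", O_RDONLY);
	if (fd < 0)
		return 1;

	arg.start = (uintptr_t)buf;
	arg.end = (uintptr_t)buf + 16 * psize;
	arg.vec = (uintptr_t)regions;
	arg.vec_len = 16;
	arg.category_mask = PAGE_IS_PRESENT;	/* only resident pages */
	arg.return_mask = PAGE_IS_PRESENT;	/* max_pages == 0 means no limit */

	n = ioctl(fd, PAGEMAP_SCAN, &arg);	/* >= 0: number of regions filled */
	for (int i = 0; i < n; i++)
		printf("present: %llx-%llx\n",
		       (unsigned long long)regions[i].start,
		       (unsigned long long)regions[i].end);

	close(fd);
	return 0;
}

With two pages touched, this is expected to report two single-page regions. vec_len bounds how many regions one call can return, and walk_end tells the caller where to resume; internally the walk stops with -ENOSPC when the buffer fills, but the ioctl still returns the count of regions it managed to store.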
@@ -1945,8 +2684,9 @@ static int show_numa_map(struct seq_file *m, void *v)
struct numa_maps *md = &numa_priv->md;
struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
- struct mempolicy *pol;
char buffer[64];
+ struct mempolicy *pol;
+ pgoff_t ilx;
int nid;
if (!mm)
@@ -1955,7 +2695,7 @@ static int show_numa_map(struct seq_file *m, void *v)
/* Ensure we start with an empty set of numa_maps statistics. */
memset(md, 0, sizeof(*md));
- pol = __get_vma_policy(vma, vma->vm_start);
+ pol = __get_vma_policy(vma, vma->vm_start, &ilx);
if (pol) {
mpol_to_str(buffer, sizeof(buffer), pol);
mpol_cond_put(pol);
@@ -1967,7 +2707,7 @@ static int show_numa_map(struct seq_file *m, void *v)
if (file) {
seq_puts(m, " file=");
- seq_file_path(m, file, "\n\t= ");
+ seq_path(m, file_user_path(file), "\n\t= ");
} else if (vma_is_initial_heap(vma)) {
seq_puts(m, " heap");
} else if (vma_is_initial_stack(vma)) {
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 7cebd397cc26..bce674533000 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -157,7 +157,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
if (file) {
seq_pad(m, ' ');
- seq_file_path(m, file, "");
+ seq_path(m, file_user_path(file), "");
} else if (mm && vma_is_initial_stack(vma)) {
seq_pad(m, ' ');
seq_puts(m, "[stack]");
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index 63ac1f93289f..0e5050d6ab64 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -46,7 +46,7 @@ int proc_setup_thread_self(struct super_block *s)
struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = thread_self_inum;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 585360706b33..d41c20d1b5e8 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -223,7 +223,7 @@ static struct inode *pstore_get_inode(struct super_block *sb)
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
return inode;
}
@@ -390,7 +390,8 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
inode->i_private = private;
if (record->time.tv_sec)
- inode->i_mtime = inode_set_ctime_to_ts(inode, record->time);
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_to_ts(inode, record->time));
d_add(dentry, inode);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index e5bca9a004cc..03425928d2fb 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -464,6 +464,8 @@ out:
*/
int pstore_register(struct pstore_info *psi)
{
+ char *new_backend;
+
if (backend && strcmp(backend, psi->name)) {
pr_warn("backend '%s' already in use: ignoring '%s'\n",
backend, psi->name);
@@ -484,11 +486,16 @@ int pstore_register(struct pstore_info *psi)
return -EINVAL;
}
+ new_backend = kstrdup(psi->name, GFP_KERNEL);
+ if (!new_backend)
+ return -ENOMEM;
+
mutex_lock(&psinfo_lock);
if (psinfo) {
pr_warn("backend '%s' already loaded: ignoring '%s'\n",
psinfo->name, psi->name);
mutex_unlock(&psinfo_lock);
+ kfree(new_backend);
return -EBUSY;
}
@@ -521,7 +528,7 @@ int pstore_register(struct pstore_info *psi)
* Update the module parameter backend, so it is visible
* through /sys/module/pstore/parameters/backend
*/
- backend = kstrdup(psi->name, GFP_KERNEL);
+ backend = new_backend;
pr_info("Registered %s as persistent store backend\n", psi->name);
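The pstore_register() hunk above moves the kstrdup() of the backend name in front of psinfo_lock so that the contended path only has to kfree() the copy. A hedged, kernel-style sketch of the same allocate-outside-the-lock pattern in isolation; all names here are illustrative, not pstore's:

/*
 * Do the sleeping allocation before taking the lock; if someone else
 * already registered, the only cleanup needed is kfree().
 */
static DEFINE_MUTEX(reg_lock);
static char *registered_name;	/* protected by reg_lock */

static int register_name(const char *name)
{
	char *copy = kstrdup(name, GFP_KERNEL);

	if (!copy)
		return -ENOMEM;

	mutex_lock(&reg_lock);
	if (registered_name) {
		mutex_unlock(&reg_lock);
		kfree(copy);	/* lost the race, drop our copy */
		return -EBUSY;
	}
	registered_name = copy;
	mutex_unlock(&reg_lock);

	return 0;
}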
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index a7171f5532a1..6eb9bb369b57 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -301,10 +301,8 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
i_gid_write(inode, (gid_t)le16_to_cpu(raw_inode->di_gid));
set_nlink(inode, le16_to_cpu(raw_inode->di_nlink));
inode->i_size = le32_to_cpu(raw_inode->di_size);
- inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->di_mtime);
- inode->i_mtime.tv_nsec = 0;
- inode->i_atime.tv_sec = le32_to_cpu(raw_inode->di_atime);
- inode->i_atime.tv_nsec = 0;
+ inode_set_mtime(inode, le32_to_cpu(raw_inode->di_mtime), 0);
+ inode_set_atime(inode, le32_to_cpu(raw_inode->di_atime), 0);
inode_set_ctime(inode, le32_to_cpu(raw_inode->di_ctime), 0);
inode->i_blocks = le32_to_cpu(raw_inode->di_first_xtnt.xtnt_size);
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 21f90d519f1a..a286c545717f 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -558,10 +558,8 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
i_uid_write(inode, (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid));
i_gid_write(inode, (gid_t)fs32_to_cpu(sbi, raw_inode->di_gid));
inode->i_size = fs64_to_cpu(sbi, raw_inode->di_size);
- inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_mtime);
- inode->i_mtime.tv_nsec = 0;
- inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_atime);
- inode->i_atime.tv_nsec = 0;
+ inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->di_mtime), 0);
+ inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->di_atime), 0);
inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->di_ctime), 0);
/* calc blocks based on 512 byte blocksize */
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 31e897ad5e6a..58b5de081b57 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -803,12 +803,6 @@ dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS]));
}
-static struct shrinker dqcache_shrinker = {
- .count_objects = dqcache_shrink_count,
- .scan_objects = dqcache_shrink_scan,
- .seeks = DEFAULT_SEEKS,
-};
-
/*
* Safely release dquot and put reference to dquot.
*/
@@ -2351,6 +2345,20 @@ static int vfs_setup_quota_inode(struct inode *inode, int type)
if (sb_has_quota_loaded(sb, type))
return -EBUSY;
+ /*
+ * Quota files should never be encrypted. They should be thought of as
+ * filesystem metadata, not user data. New-style internal quota files
+ * cannot be encrypted by users anyway, but old-style external quota
+ * files could potentially be incorrectly created in an encrypted
+ * directory, hence this explicit check. Some reasons why encrypted
+ * quota files don't work include: (1) some filesystems that support
+ * encryption don't handle it in their quota_read and quota_write, and
+ * (2) cleaning up encrypted quota files at unmount would need special
+ * consideration, as quota files are cleaned up later than user files.
+ */
+ if (IS_ENCRYPTED(inode))
+ return -EINVAL;
+
dqopt->files[type] = igrab(inode);
if (!dqopt->files[type])
return -EIO;
@@ -2968,6 +2976,7 @@ static int __init dquot_init(void)
{
int i, ret;
unsigned long nr_hash, order;
+ struct shrinker *dqcache_shrinker;
printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);
@@ -3002,8 +3011,14 @@ static int __init dquot_init(void)
pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld,"
" %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order));
- if (register_shrinker(&dqcache_shrinker, "dquota-cache"))
- panic("Cannot register dquot shrinker");
+ dqcache_shrinker = shrinker_alloc(0, "dquota-cache");
+ if (!dqcache_shrinker)
+ panic("Cannot allocate dquot shrinker");
+
+ dqcache_shrinker->count_objects = dqcache_shrink_count;
+ dqcache_shrinker->scan_objects = dqcache_shrink_scan;
+
+ shrinker_register(dqcache_shrinker);
return 0;
}
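dquot_init() above is converted to the allocated-shrinker API. A brief sketch of the full lifecycle under that API, including the shrinker_free() teardown this hunk does not need to show; my_count/my_scan are placeholder callbacks:

static struct shrinker *my_shrinker;

static unsigned long my_count(struct shrinker *s, struct shrink_control *sc)
{
	return 0;		/* placeholder: report reclaimable object count */
}

static unsigned long my_scan(struct shrinker *s, struct shrink_control *sc)
{
	return SHRINK_STOP;	/* placeholder: free objects, return how many */
}

static int __init my_init(void)
{
	my_shrinker = shrinker_alloc(0, "my-cache");
	if (!my_shrinker)
		return -ENOMEM;

	my_shrinker->count_objects = my_count;
	my_shrinker->scan_objects = my_scan;
	/* the hunk above relies on shrinker_alloc() defaulting seeks to DEFAULT_SEEKS */

	shrinker_register(my_shrinker);
	return 0;
}

static void __exit my_exit(void)
{
	shrinker_free(my_shrinker);	/* unregisters and frees */
}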
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 18e8387cab41..4ac05a9e25bc 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -65,7 +65,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
inode->i_mapping->a_ops = &ram_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
mapping_set_unevictable(inode->i_mapping);
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
switch (mode & S_IFMT) {
default:
init_special_inode(inode, mode, dev);
@@ -105,7 +105,7 @@ ramfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
error = 0;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
}
return error;
}
@@ -138,7 +138,8 @@ static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
if (!error) {
d_instantiate(dentry, inode);
dget(dentry);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_current(dir));
} else
iput(inode);
}
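The timestamp conversions in the ramfs hunks above (and in the pstore, qnx4/qnx6, reiserfs and romfs hunks nearby) all switch from direct i_mtime/i_atime stores to accessor helpers. A short illustrative sketch of the forms this series uses, assuming the helpers as declared in <linux/fs.h>; "secs" is a placeholder on-disk value:

static void timestamp_forms(struct inode *inode, struct inode *dir, time64_t secs)
{
	/* New in-memory inode: atime = mtime = ctime = now, in one call. */
	simple_inode_init_ts(inode);

	/* Directory change: bump ctime and mirror it into mtime. */
	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));

	/* On-disk timestamps with second granularity (qnx4/reiserfs style). */
	inode_set_mtime(inode, secs, 0);
	inode_set_atime(inode, secs, 0);

	/* Reading back just the seconds when packing stat data. */
	pr_debug("mtime=%lld\n", (long long)inode_get_mtime_sec(inode));
}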
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 86e55d4bb10d..1d825459ee6e 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1257,11 +1257,9 @@ static void init_inode(struct inode *inode, struct treepath *path)
i_uid_write(inode, sd_v1_uid(sd));
i_gid_write(inode, sd_v1_gid(sd));
inode->i_size = sd_v1_size(sd);
- inode->i_atime.tv_sec = sd_v1_atime(sd);
- inode->i_mtime.tv_sec = sd_v1_mtime(sd);
+ inode_set_atime(inode, sd_v1_atime(sd), 0);
+ inode_set_mtime(inode, sd_v1_mtime(sd), 0);
inode_set_ctime(inode, sd_v1_ctime(sd), 0);
- inode->i_atime.tv_nsec = 0;
- inode->i_mtime.tv_nsec = 0;
inode->i_blocks = sd_v1_blocks(sd);
inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
@@ -1311,11 +1309,9 @@ static void init_inode(struct inode *inode, struct treepath *path)
i_uid_write(inode, sd_v2_uid(sd));
inode->i_size = sd_v2_size(sd);
i_gid_write(inode, sd_v2_gid(sd));
- inode->i_mtime.tv_sec = sd_v2_mtime(sd);
- inode->i_atime.tv_sec = sd_v2_atime(sd);
+ inode_set_mtime(inode, sd_v2_mtime(sd), 0);
+ inode_set_atime(inode, sd_v2_atime(sd), 0);
inode_set_ctime(inode, sd_v2_ctime(sd), 0);
- inode->i_mtime.tv_nsec = 0;
- inode->i_atime.tv_nsec = 0;
inode->i_blocks = sd_v2_blocks(sd);
rdev = sd_v2_rdev(sd);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
@@ -1370,9 +1366,9 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size)
set_sd_v2_uid(sd_v2, i_uid_read(inode));
set_sd_v2_size(sd_v2, size);
set_sd_v2_gid(sd_v2, i_gid_read(inode));
- set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec);
- set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec);
- set_sd_v2_ctime(sd_v2, inode_get_ctime(inode).tv_sec);
+ set_sd_v2_mtime(sd_v2, inode_get_mtime_sec(inode));
+ set_sd_v2_atime(sd_v2, inode_get_atime_sec(inode));
+ set_sd_v2_ctime(sd_v2, inode_get_ctime_sec(inode));
set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
@@ -1391,9 +1387,9 @@ static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
set_sd_v1_gid(sd_v1, i_gid_read(inode));
set_sd_v1_nlink(sd_v1, inode->i_nlink);
set_sd_v1_size(sd_v1, size);
- set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec);
- set_sd_v1_ctime(sd_v1, inode_get_ctime(inode).tv_sec);
- set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec);
+ set_sd_v1_atime(sd_v1, inode_get_atime_sec(inode));
+ set_sd_v1_ctime(sd_v1, inode_get_ctime_sec(inode));
+ set_sd_v1_mtime(sd_v1, inode_get_mtime_sec(inode));
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
@@ -1984,7 +1980,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
/* uid and gid must already be set by the caller for quota init */
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_size = i_size;
inode->i_blocks = 0;
inode->i_bytes = 0;
@@ -2507,10 +2503,10 @@ out:
* start/recovery path as __block_write_full_folio, along with special
* code to handle reiserfs tails.
*/
-static int reiserfs_write_full_page(struct page *page,
+static int reiserfs_write_full_folio(struct folio *folio,
struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
unsigned long end_index = inode->i_size >> PAGE_SHIFT;
int error = 0;
unsigned long block;
@@ -2518,7 +2514,7 @@ static int reiserfs_write_full_page(struct page *page,
struct buffer_head *head, *bh;
int partial = 0;
int nr = 0;
- int checked = PageChecked(page);
+ int checked = folio_test_checked(folio);
struct reiserfs_transaction_handle th;
struct super_block *s = inode->i_sb;
int bh_per_page = PAGE_SIZE / s->s_blocksize;
@@ -2526,47 +2522,46 @@ static int reiserfs_write_full_page(struct page *page,
/* no logging allowed when nonblocking or from PF_MEMALLOC */
if (checked && (current->flags & PF_MEMALLOC)) {
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
+ folio_redirty_for_writepage(wbc, folio);
+ folio_unlock(folio);
return 0;
}
/*
- * The page dirty bit is cleared before writepage is called, which
+ * The folio dirty bit is cleared before writepage is called, which
* means we have to tell create_empty_buffers to make dirty buffers
- * The page really should be up to date at this point, so tossing
+ * The folio really should be up to date at this point, so tossing
* in the BH_Uptodate is just a sanity check.
*/
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, s->s_blocksize,
+ head = folio_buffers(folio);
+ if (!head)
+ head = create_empty_buffers(folio, s->s_blocksize,
(1 << BH_Dirty) | (1 << BH_Uptodate));
- }
- head = page_buffers(page);
/*
- * last page in the file, zero out any contents past the
+ * last folio in the file, zero out any contents past the
* last byte in the file
*/
- if (page->index >= end_index) {
+ if (folio->index >= end_index) {
unsigned last_offset;
last_offset = inode->i_size & (PAGE_SIZE - 1);
- /* no file contents in this page */
- if (page->index >= end_index + 1 || !last_offset) {
- unlock_page(page);
+ /* no file contents in this folio */
+ if (folio->index >= end_index + 1 || !last_offset) {
+ folio_unlock(folio);
return 0;
}
- zero_user_segment(page, last_offset, PAGE_SIZE);
+ folio_zero_segment(folio, last_offset, folio_size(folio));
}
bh = head;
- block = page->index << (PAGE_SHIFT - s->s_blocksize_bits);
+ block = folio->index << (PAGE_SHIFT - s->s_blocksize_bits);
last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
/* first map all the buffers, logging any direct items we find */
do {
if (block > last_block) {
/*
* This can happen when the block size is less than
- * the page size. The corresponding bytes in the page
+ * the folio size. The corresponding bytes in the folio
* were zero filled above
*/
clear_buffer_dirty(bh);
@@ -2593,7 +2588,7 @@ static int reiserfs_write_full_page(struct page *page,
* blocks we're going to log
*/
if (checked) {
- ClearPageChecked(page);
+ folio_clear_checked(folio);
reiserfs_write_lock(s);
error = journal_begin(&th, s, bh_per_page + 1);
if (error) {
@@ -2602,7 +2597,7 @@ static int reiserfs_write_full_page(struct page *page,
}
reiserfs_update_inode_transaction(inode);
}
- /* now go through and lock any dirty buffers on the page */
+ /* now go through and lock any dirty buffers on the folio */
do {
get_bh(bh);
if (!buffer_mapped(bh))
@@ -2623,7 +2618,7 @@ static int reiserfs_write_full_page(struct page *page,
lock_buffer(bh);
} else {
if (!trylock_buffer(bh)) {
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
continue;
}
}
@@ -2640,13 +2635,13 @@ static int reiserfs_write_full_page(struct page *page,
if (error)
goto fail;
}
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
- unlock_page(page);
+ BUG_ON(folio_test_writeback(folio));
+ folio_start_writeback(folio);
+ folio_unlock(folio);
/*
- * since any buffer might be the only dirty buffer on the page,
- * the first submit_bh can bring the page out of writeback.
+ * since any buffer might be the only dirty buffer on the folio,
+ * the first submit_bh can bring the folio out of writeback.
* be careful with the buffers.
*/
do {
@@ -2663,10 +2658,10 @@ static int reiserfs_write_full_page(struct page *page,
done:
if (nr == 0) {
/*
- * if this page only had a direct item, it is very possible for
+ * if this folio only had a direct item, it is very possible for
* no io to be required without there being an error. Or,
* someone else could have locked them and sent them down the
- * pipe without locking the page
+ * pipe without locking the folio
*/
bh = head;
do {
@@ -2677,18 +2672,18 @@ done:
bh = bh->b_this_page;
} while (bh != head);
if (!partial)
- SetPageUptodate(page);
- end_page_writeback(page);
+ folio_mark_uptodate(folio);
+ folio_end_writeback(folio);
}
return error;
fail:
/*
* catches various errors, we need to make sure any valid dirty blocks
- * get to the media. The page is currently locked and not marked for
+ * get to the media. The folio is currently locked and not marked for
* writeback
*/
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
bh = head;
do {
get_bh(bh);
@@ -2698,16 +2693,16 @@ fail:
} else {
/*
* clear any dirty bits that might have come from
- * getting attached to a dirty page
+ * getting attached to a dirty folio
*/
clear_buffer_dirty(bh);
}
bh = bh->b_this_page;
} while (bh != head);
- SetPageError(page);
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
- unlock_page(page);
+ folio_set_error(folio);
+ BUG_ON(folio_test_writeback(folio));
+ folio_start_writeback(folio);
+ folio_unlock(folio);
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
@@ -2728,9 +2723,10 @@ static int reiserfs_read_folio(struct file *f, struct folio *folio)
static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct folio *folio = page_folio(page);
+ struct inode *inode = folio->mapping->host;
reiserfs_wait_on_write_block(inode->i_sb);
- return reiserfs_write_full_page(page, wbc);
+ return reiserfs_write_full_folio(folio, wbc);
}
static void reiserfs_truncate_failed_write(struct inode *inode)
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 015bfe4e4524..171c912af50f 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -90,8 +90,7 @@ static int flush_commit_list(struct super_block *s,
static int can_dirty(struct reiserfs_journal_cnode *cn);
static int journal_join(struct reiserfs_transaction_handle *th,
struct super_block *sb);
-static void release_journal_dev(struct super_block *super,
- struct reiserfs_journal *journal);
+static void release_journal_dev(struct reiserfs_journal *journal);
static void dirty_one_transaction(struct super_block *s,
struct reiserfs_journal_list *jl);
static void flush_async_commits(struct work_struct *work);
@@ -1893,7 +1892,7 @@ static void free_journal_ram(struct super_block *sb)
* j_header_bh is on the journal dev, make sure
* not to release the journal dev until we brelse j_header_bh
*/
- release_journal_dev(sb, journal);
+ release_journal_dev(journal);
vfree(journal);
}
@@ -2387,7 +2386,7 @@ static int journal_read(struct super_block *sb)
cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
reiserfs_info(sb, "checking transaction log (%pg)\n",
- journal->j_dev_bd);
+ journal->j_bdev_handle->bdev);
start = ktime_get_seconds();
/*
@@ -2448,7 +2447,7 @@ static int journal_read(struct super_block *sb)
* device and journal device to be the same
*/
d_bh =
- reiserfs_breada(journal->j_dev_bd, cur_dblock,
+ reiserfs_breada(journal->j_bdev_handle->bdev, cur_dblock,
sb->s_blocksize,
SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
SB_ONDISK_JOURNAL_SIZE(sb));
@@ -2587,17 +2586,11 @@ static void journal_list_init(struct super_block *sb)
SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb);
}
-static void release_journal_dev(struct super_block *super,
- struct reiserfs_journal *journal)
+static void release_journal_dev(struct reiserfs_journal *journal)
{
- if (journal->j_dev_bd != NULL) {
- void *holder = NULL;
-
- if (journal->j_dev_bd->bd_dev != super->s_dev)
- holder = journal;
-
- blkdev_put(journal->j_dev_bd, holder);
- journal->j_dev_bd = NULL;
+ if (journal->j_bdev_handle) {
+ bdev_release(journal->j_bdev_handle);
+ journal->j_bdev_handle = NULL;
}
}
@@ -2612,7 +2605,7 @@ static int journal_init_dev(struct super_block *super,
result = 0;
- journal->j_dev_bd = NULL;
+ journal->j_bdev_handle = NULL;
jdev = SB_ONDISK_JOURNAL_DEVICE(super) ?
new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
@@ -2623,36 +2616,37 @@ static int journal_init_dev(struct super_block *super,
if ((!jdev_name || !jdev_name[0])) {
if (jdev == super->s_dev)
holder = NULL;
- journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode, holder,
- NULL);
- if (IS_ERR(journal->j_dev_bd)) {
- result = PTR_ERR(journal->j_dev_bd);
- journal->j_dev_bd = NULL;
+ journal->j_bdev_handle = bdev_open_by_dev(jdev, blkdev_mode,
+ holder, NULL);
+ if (IS_ERR(journal->j_bdev_handle)) {
+ result = PTR_ERR(journal->j_bdev_handle);
+ journal->j_bdev_handle = NULL;
reiserfs_warning(super, "sh-458",
"cannot init journal device unknown-block(%u,%u): %i",
MAJOR(jdev), MINOR(jdev), result);
return result;
} else if (jdev != super->s_dev)
- set_blocksize(journal->j_dev_bd, super->s_blocksize);
+ set_blocksize(journal->j_bdev_handle->bdev,
+ super->s_blocksize);
return 0;
}
- journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, holder,
- NULL);
- if (IS_ERR(journal->j_dev_bd)) {
- result = PTR_ERR(journal->j_dev_bd);
- journal->j_dev_bd = NULL;
+ journal->j_bdev_handle = bdev_open_by_path(jdev_name, blkdev_mode,
+ holder, NULL);
+ if (IS_ERR(journal->j_bdev_handle)) {
+ result = PTR_ERR(journal->j_bdev_handle);
+ journal->j_bdev_handle = NULL;
reiserfs_warning(super, "sh-457",
"journal_init_dev: Cannot open '%s': %i",
jdev_name, result);
return result;
}
- set_blocksize(journal->j_dev_bd, super->s_blocksize);
+ set_blocksize(journal->j_bdev_handle->bdev, super->s_blocksize);
reiserfs_info(super,
"journal_init_dev: journal device: %pg\n",
- journal->j_dev_bd);
+ journal->j_bdev_handle->bdev);
return 0;
}
@@ -2810,7 +2804,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
"journal header magic %x (device %pg) does "
"not match to magic found in super block %x",
jh->jh_journal.jp_journal_magic,
- journal->j_dev_bd,
+ journal->j_bdev_handle->bdev,
sb_jp_journal_magic(rs));
brelse(bhjh);
goto free_and_return;
@@ -2834,7 +2828,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
reiserfs_info(sb, "journal params: device %pg, size %u, "
"journal first block %u, max trans len %u, max batch %u, "
"max commit age %u, max trans age %u\n",
- journal->j_dev_bd,
+ journal->j_bdev_handle->bdev,
SB_ONDISK_JOURNAL_SIZE(sb),
SB_ONDISK_JOURNAL_1st_BLOCK(sb),
journal->j_trans_max,
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 9c5704be2435..994d6e6995ab 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -572,7 +572,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
}
dir->i_size += paste_size;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
if (!S_ISDIR(inode->i_mode) && visible)
/* reiserfs_mkdir or reiserfs_rename will do that by itself */
reiserfs_update_sd(th, dir);
@@ -966,8 +966,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_nlink);
clear_nlink(inode);
- dir->i_mtime = inode_set_ctime_to_ts(dir,
- inode_set_ctime_current(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
reiserfs_update_sd(&th, inode);
DEC_DIR_INODE_NLINK(dir)
@@ -1075,7 +1075,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
reiserfs_update_sd(&th, inode);
dir->i_size -= (de.de_entrylen + DEH_SIZE);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
reiserfs_update_sd(&th, dir);
if (!savelink)
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 3dba8acf4e83..83cb9402e0f9 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -354,7 +354,7 @@ static int show_journal(struct seq_file *m, void *unused)
"prepare: \t%12lu\n"
"prepare_retry: \t%12lu\n",
DJP(jp_journal_1st_block),
- SB_JOURNAL(sb)->j_dev_bd,
+ SB_JOURNAL(sb)->j_bdev_handle->bdev,
DJP(jp_journal_dev),
DJP(jp_journal_size),
DJP(jp_journal_trans_max),
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 7d12b8c5b2fa..725667880e62 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -299,7 +299,7 @@ struct reiserfs_journal {
/* oldest journal block. start here for traverse */
struct reiserfs_journal_cnode *j_first;
- struct block_device *j_dev_bd;
+ struct bdev_handle *j_bdev_handle;
/* first block on s_dev of reserved area journal */
int j_1st_reserved_block;
@@ -1165,7 +1165,7 @@ static inline int bmap_would_wrap(unsigned bmap_nr)
return bmap_nr > ((1LL << 16) - 1);
}
-extern const struct xattr_handler *reiserfs_xattr_handlers[];
+extern const struct xattr_handler * const reiserfs_xattr_handlers[];
/*
* this says about version of key of all items (but stat data) the
@@ -2809,9 +2809,12 @@ struct reiserfs_journal_header {
#define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK])
/* We need these to make journal.c code more readable */
-#define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
-#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
-#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
+#define journal_find_get_block(s, block) __find_get_block(\
+ SB_JOURNAL(s)->j_bdev_handle->bdev, block, s->s_blocksize)
+#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_bdev_handle->bdev,\
+ block, s->s_blocksize)
+#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_bdev_handle->bdev,\
+ block, s->s_blocksize)
enum reiserfs_bh_state_bits {
BH_JDirty = BH_PrivateStart, /* buffer is in current transaction */
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 3676e02a0232..2138ee7d271d 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -2003,7 +2003,8 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
pathrelse(&s_search_path);
if (update_timestamps) {
- inode->i_mtime = current_time(inode);
+ inode_set_mtime_to_ts(inode,
+ current_time(inode));
inode_set_ctime_current(inode);
}
reiserfs_update_sd(th, inode);
@@ -2028,7 +2029,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
update_and_out:
if (update_timestamps) {
/* this is truncate, not file closing */
- inode->i_mtime = current_time(inode);
+ inode_set_mtime_to_ts(inode, current_time(inode));
inode_set_ctime_current(inode);
}
reiserfs_update_sd(th, inode);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 7eaf36b3de12..67b5510beded 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2587,7 +2587,7 @@ out:
return err;
if (inode->i_size < off + len - towrite)
i_size_write(inode, off + len - towrite);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
return len - towrite;
}
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 6000964c2b80..998035a6388e 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -780,7 +780,7 @@ static inline bool reiserfs_posix_acl_list(const char *name,
}
/* This is the implementation for the xattr plugin infrastructure */
-static inline bool reiserfs_xattr_list(const struct xattr_handler **handlers,
+static inline bool reiserfs_xattr_list(const struct xattr_handler * const *handlers,
const char *name, struct dentry *dentry)
{
if (handlers) {
@@ -911,7 +911,7 @@ static int create_privroot(struct dentry *dentry) { return 0; }
#endif
/* Actual operations that are exported to VFS-land */
-const struct xattr_handler *reiserfs_xattr_handlers[] = {
+const struct xattr_handler * const reiserfs_xattr_handlers[] = {
#ifdef CONFIG_REISERFS_FS_XATTR
&reiserfs_xattr_user_handler,
&reiserfs_xattr_trusted_handler,
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 5c35f6c76037..545ad44f96b8 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -322,7 +322,8 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
set_nlink(i, 1); /* Hard to decide.. */
i->i_size = be32_to_cpu(ri.size);
- i->i_mtime = i->i_atime = inode_set_ctime(i, 0, 0);
+ inode_set_mtime_to_ts(i,
+ inode_set_atime_to_ts(i, inode_set_ctime(i, 0, 0)));
/* set up mode and ops */
mode = romfs_modemap[nextfh & ROMFH_TYPE];
@@ -593,7 +594,7 @@ static void romfs_kill_sb(struct super_block *sb)
#ifdef CONFIG_ROMFS_ON_BLOCK
if (sb->s_bdev) {
sync_blockdev(sb->s_bdev);
- blkdev_put(sb->s_bdev, sb);
+ bdev_release(sb->s_bdev_handle);
}
#endif
}
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index fe1bf5b6e0cb..d64a306a414b 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -32,7 +32,7 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids,
* fully cached or it may be in the process of
* being deleted due to a lease break.
*/
- if (!cfid->has_lease) {
+ if (!cfid->time || !cfid->has_lease) {
spin_unlock(&cfids->cfid_list_lock);
return NULL;
}
@@ -193,9 +193,19 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
npath = path_no_prefix(cifs_sb, path);
if (IS_ERR(npath)) {
rc = PTR_ERR(npath);
- kfree(utf16_path);
- return rc;
+ goto out;
+ }
+
+ if (!npath[0]) {
+ dentry = dget(cifs_sb->root);
+ } else {
+ dentry = path_to_dentry(cifs_sb, npath);
+ if (IS_ERR(dentry)) {
+ rc = -ENOENT;
+ goto out;
+ }
}
+ cfid->dentry = dentry;
/*
* We do not hold the lock for the open because in case
@@ -249,6 +259,15 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
smb2_set_related(&rqst[1]);
+ /*
+ * Set @cfid->has_lease to true before sending out compounded request so
+ * its lease reference can be put in cached_dir_lease_break() due to a
+ * potential lease break right after the request is sent or while @cfid
+ * is still being cached. Concurrent processes won't be able to use it yet
+ * due to @cfid->time being zero.
+ */
+ cfid->has_lease = true;
+
rc = compound_send_recv(xid, ses, server,
flags, 2, rqst,
resp_buftype, rsp_iov);
@@ -263,6 +282,8 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
cfid->tcon = tcon;
cfid->is_open = true;
+ spin_lock(&cfids->cfid_list_lock);
+
o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
oparms.fid->persistent_fid = o_rsp->PersistentFileId;
oparms.fid->volatile_fid = o_rsp->VolatileFileId;
@@ -270,18 +291,32 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId);
#endif /* CIFS_DEBUG2 */
- if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE)
+
+ if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE) {
+ spin_unlock(&cfids->cfid_list_lock);
+ rc = -EINVAL;
+ goto oshr_free;
+ }
+
+ rc = smb2_parse_contexts(server, rsp_iov,
+ &oparms.fid->epoch,
+ oparms.fid->lease_key,
+ &oplock, NULL, NULL);
+ if (rc) {
+ spin_unlock(&cfids->cfid_list_lock);
goto oshr_free;
+ }
- smb2_parse_contexts(server, o_rsp,
- &oparms.fid->epoch,
- oparms.fid->lease_key, &oplock,
- NULL, NULL);
- if (!(oplock & SMB2_LEASE_READ_CACHING_HE))
+ rc = -EINVAL;
+ if (!(oplock & SMB2_LEASE_READ_CACHING_HE)) {
+ spin_unlock(&cfids->cfid_list_lock);
goto oshr_free;
+ }
qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
- if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info))
+ if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info)) {
+ spin_unlock(&cfids->cfid_list_lock);
goto oshr_free;
+ }
if (!smb2_validate_and_copy_iov(
le16_to_cpu(qi_rsp->OutputBufferOffset),
sizeof(struct smb2_file_all_info),
@@ -289,37 +324,24 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
(char *)&cfid->file_all_info))
cfid->file_all_info_is_valid = true;
- if (!npath[0])
- dentry = dget(cifs_sb->root);
- else {
- dentry = path_to_dentry(cifs_sb, npath);
- if (IS_ERR(dentry)) {
- rc = -ENOENT;
- goto oshr_free;
- }
- }
- spin_lock(&cfids->cfid_list_lock);
- cfid->dentry = dentry;
cfid->time = jiffies;
- cfid->has_lease = true;
spin_unlock(&cfids->cfid_list_lock);
+ /* At this point the directory handle is fully cached */
+ rc = 0;
oshr_free:
- kfree(utf16_path);
SMB2_open_free(&rqst[0]);
SMB2_query_info_free(&rqst[1]);
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
- spin_lock(&cfids->cfid_list_lock);
- if (!cfid->has_lease) {
- if (rc) {
- if (cfid->on_list) {
- list_del(&cfid->entry);
- cfid->on_list = false;
- cfids->num_entries--;
- }
- rc = -ENOENT;
- } else {
+ if (rc) {
+ spin_lock(&cfids->cfid_list_lock);
+ if (cfid->on_list) {
+ list_del(&cfid->entry);
+ cfid->on_list = false;
+ cfids->num_entries--;
+ }
+ if (cfid->has_lease) {
/*
* We are guaranteed to have two references at this
* point. One for the caller and one for a potential
@@ -327,25 +349,24 @@ oshr_free:
* will be closed when the caller closes the cached
* handle.
*/
+ cfid->has_lease = false;
spin_unlock(&cfids->cfid_list_lock);
kref_put(&cfid->refcount, smb2_close_cached_fid);
goto out;
}
+ spin_unlock(&cfids->cfid_list_lock);
}
- spin_unlock(&cfids->cfid_list_lock);
+out:
if (rc) {
if (cfid->is_open)
SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid,
cfid->fid.volatile_fid);
free_cached_dir(cfid);
- cfid = NULL;
- }
-out:
- if (rc == 0) {
+ } else {
*ret_cfid = cfid;
atomic_inc(&tcon->num_remote_opens);
}
-
+ kfree(utf16_path);
return rc;
}
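
Editor's note: the open_cached_dir() rework above publishes the cfid with has_lease set before the compound send, while lookups additionally require a non-zero cfid->time, set only once caching has fully succeeded. A simplified, generic sketch of that "publish early, gate readers on a ready stamp" idiom follows; struct example_obj and example_lookup() are invented for the sketch and are not the cifs code.

#include <linux/list.h>
#include <linux/kref.h>
#include <linux/spinlock.h>

struct example_obj {
	struct list_head entry;
	struct kref refcount;
	unsigned long ready_time;	/* 0 means "not usable yet" */
	bool has_lease;
};

static struct example_obj *example_lookup(struct list_head *head,
					  spinlock_t *lock)
{
	struct example_obj *obj;

	spin_lock(lock);
	list_for_each_entry(obj, head, entry) {
		/* Skip entries still being set up or being torn down. */
		if (!obj->ready_time || !obj->has_lease)
			continue;
		kref_get(&obj->refcount);
		spin_unlock(lock);
		return obj;
	}
	spin_unlock(lock);
	return NULL;
}
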
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index 76922fcc4bc6..60027f5aebe8 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -40,11 +40,13 @@ void cifs_dump_detail(void *buf, struct TCP_Server_Info *server)
#ifdef CONFIG_CIFS_DEBUG2
struct smb_hdr *smb = buf;
- cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d\n",
- smb->Command, smb->Status.CifsError,
- smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
- cifs_dbg(VFS, "smb buf %p len %u\n", smb,
- server->ops->calc_smb_size(smb));
+ cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d Wct: %d\n",
+ smb->Command, smb->Status.CifsError, smb->Flags,
+ smb->Flags2, smb->Mid, smb->Pid, smb->WordCount);
+ if (!server->ops->check_message(buf, server->total_read, server)) {
+ cifs_dbg(VFS, "smb buf %p len %u\n", smb,
+ server->ops->calc_smb_size(smb));
+ }
#endif /* CONFIG_CIFS_DEBUG2 */
}
@@ -136,6 +138,11 @@ cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan)
{
struct TCP_Server_Info *server = chan->server;
+ if (!server) {
+ seq_printf(m, "\n\n\t\tChannel: %d DISABLED", i+1);
+ return;
+ }
+
seq_printf(m, "\n\n\t\tChannel: %d ConnectionId: 0x%llx"
"\n\t\tNumber of credits: %d,%d,%d Dialect 0x%x"
"\n\t\tTCP status: %d Instance: %d"
@@ -279,6 +286,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
struct cifs_ses *ses;
struct cifs_tcon *tcon;
struct cifs_server_iface *iface;
+ size_t iface_weight = 0, iface_min_speed = 0;
+ struct cifs_server_iface *last_iface = NULL;
int c, i, j;
seq_puts(m,
@@ -427,6 +436,8 @@ skip_rdma:
if (server->nosharesock)
seq_printf(m, " nosharesock");
+ seq_printf(m, "\nServer capabilities: 0x%x", server->capabilities);
+
if (server->rdma)
seq_printf(m, "\nRDMA ");
seq_printf(m, "\nTCP status: %d Instance: %d"
@@ -452,6 +463,11 @@ skip_rdma:
seq_printf(m, "\n\n\tSessions: ");
i = 0;
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ spin_lock(&ses->ses_lock);
+ if (ses->ses_status == SES_EXITING) {
+ spin_unlock(&ses->ses_lock);
+ continue;
+ }
i++;
if ((ses->serverDomain == NULL) ||
(ses->serverOS == NULL) ||
@@ -472,6 +488,7 @@ skip_rdma:
ses->ses_count, ses->serverOS, ses->serverNOS,
ses->capabilities, ses->ses_status);
}
+ spin_unlock(&ses->ses_lock);
seq_printf(m, "\n\tSecurity type: %s ",
get_security_type_str(server->ops->select_sectype(server, ses->sectype)));
@@ -536,11 +553,25 @@ skip_rdma:
"\tLast updated: %lu seconds ago",
ses->iface_count,
(jiffies - ses->iface_last_update) / HZ);
+
+ last_iface = list_last_entry(&ses->iface_list,
+ struct cifs_server_iface,
+ iface_head);
+ iface_min_speed = last_iface->speed;
+
j = 0;
list_for_each_entry(iface, &ses->iface_list,
iface_head) {
seq_printf(m, "\n\t%d)", ++j);
cifs_dump_iface(m, iface);
+
+ iface_weight = iface->speed / iface_min_speed;
+ seq_printf(m, "\t\tWeight (cur,total): (%zu,%zu)"
+ "\n\t\tAllocated channels: %u\n",
+ iface->weight_fulfilled,
+ iface_weight,
+ iface->num_channels);
+
if (is_ses_using_iface(ses, iface))
seq_puts(m, "\t\t[CONNECTED]\n");
}
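
Editor's note: the debug output added above prints a per-interface weight computed as iface->speed / iface_min_speed, where the list's last entry is treated as the slowest interface. A short worked example with made-up link speeds:

/*
 * Illustrative values only:
 *
 *	iface speeds: 10 Gbps, 10 Gbps, 1 Gbps  ->  iface_min_speed = 1 Gbps
 *	weights:      10/1 = 10, 10/1 = 10, 1/1 = 1
 *
 * weight_fulfilled then counts how many of those weight "slots" already
 * have a channel bound to the interface, and num_channels is the total
 * channel count on it.
 */
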
@@ -738,14 +769,14 @@ static ssize_t name##_write(struct file *file, const char __user *buffer, \
size_t count, loff_t *ppos) \
{ \
int rc; \
- rc = kstrtoint_from_user(buffer, count, 10, & name); \
+ rc = kstrtoint_from_user(buffer, count, 10, &name); \
if (rc) \
return rc; \
return count; \
} \
static int name##_proc_show(struct seq_file *m, void *v) \
{ \
- seq_printf(m, "%d\n", name ); \
+ seq_printf(m, "%d\n", name); \
return 0; \
} \
static int name##_open(struct inode *inode, struct file *file) \
diff --git a/fs/smb/client/cifs_ioctl.h b/fs/smb/client/cifs_ioctl.h
index 332588e77c31..26327442e383 100644
--- a/fs/smb/client/cifs_ioctl.h
+++ b/fs/smb/client/cifs_ioctl.h
@@ -26,6 +26,11 @@ struct smb_mnt_fs_info {
__u64 cifs_posix_caps;
} __packed;
+struct smb_mnt_tcon_info {
+ __u32 tid;
+ __u64 session_id;
+} __packed;
+
struct smb_snapshot_array {
__u32 number_of_snapshots;
__u32 number_of_snapshots_returned;
@@ -108,6 +113,7 @@ struct smb3_notify_info {
#define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify)
#define CIFS_DUMP_FULL_KEY _IOWR(CIFS_IOCTL_MAGIC, 10, struct smb3_full_key_debug_info)
#define CIFS_IOC_NOTIFY_INFO _IOWR(CIFS_IOCTL_MAGIC, 11, struct smb3_notify_info)
+#define CIFS_IOC_GET_TCON_INFO _IOR(CIFS_IOCTL_MAGIC, 12, struct smb_mnt_tcon_info)
#define CIFS_IOC_SHUTDOWN _IOR('X', 125, __u32)
/*
diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c
index 6f3285f1dfee..af7849e5974f 100644
--- a/fs/smb/client/cifs_spnego.c
+++ b/fs/smb/client/cifs_spnego.c
@@ -64,8 +64,8 @@ struct key_type cifs_spnego_key_type = {
* strlen(";sec=ntlmsspi") */
#define MAX_MECH_STR_LEN 13
-/* strlen of "host=" */
-#define HOST_KEY_LEN 5
+/* strlen of ";host=" */
+#define HOST_KEY_LEN 6
/* strlen of ";ip4=" or ";ip6=" */
#define IP_KEY_LEN 5
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 22869cda1356..2131638f26d0 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -1191,36 +1191,108 @@ const char *cifs_get_link(struct dentry *dentry, struct inode *inode,
const struct inode_operations cifs_symlink_inode_ops = {
.get_link = cifs_get_link,
+ .setattr = cifs_setattr,
.permission = cifs_permission,
.listxattr = cifs_listxattr,
};
+/*
+ * Advance the EOF marker to after the source range.
+ */
+static int cifs_precopy_set_eof(struct inode *src_inode, struct cifsInodeInfo *src_cifsi,
+ struct cifs_tcon *src_tcon,
+ unsigned int xid, loff_t src_end)
+{
+ struct cifsFileInfo *writeable_srcfile;
+ int rc = -EINVAL;
+
+ writeable_srcfile = find_writable_file(src_cifsi, FIND_WR_FSUID_ONLY);
+ if (writeable_srcfile) {
+ if (src_tcon->ses->server->ops->set_file_size)
+ rc = src_tcon->ses->server->ops->set_file_size(
+ xid, src_tcon, writeable_srcfile,
+ src_inode->i_size, true /* no need to set sparse */);
+ else
+ rc = -ENOSYS;
+ cifsFileInfo_put(writeable_srcfile);
+ cifs_dbg(FYI, "SetFSize for copychunk rc = %d\n", rc);
+ }
+
+ if (rc < 0)
+ goto set_failed;
+
+ netfs_resize_file(&src_cifsi->netfs, src_end);
+ fscache_resize_cookie(cifs_inode_cookie(src_inode), src_end);
+ return 0;
+
+set_failed:
+ return filemap_write_and_wait(src_inode->i_mapping);
+}
+
+/*
+ * Flush out either the folio that overlaps the beginning of a range in which
+ * pos resides or the folio that overlaps the end of a range unless that folio
+ * is entirely within the range we're going to invalidate. We extend the flush
+ * bounds to encompass the folio.
+ */
+static int cifs_flush_folio(struct inode *inode, loff_t pos, loff_t *_fstart, loff_t *_fend,
+ bool first)
+{
+ struct folio *folio;
+ unsigned long long fpos, fend;
+ pgoff_t index = pos / PAGE_SIZE;
+ size_t size;
+ int rc = 0;
+
+ folio = filemap_get_folio(inode->i_mapping, index);
+ if (IS_ERR(folio))
+ return 0;
+
+ size = folio_size(folio);
+ fpos = folio_pos(folio);
+ fend = fpos + size - 1;
+ *_fstart = min_t(unsigned long long, *_fstart, fpos);
+ *_fend = max_t(unsigned long long, *_fend, fend);
+ if ((first && pos == fpos) || (!first && pos == fend))
+ goto out;
+
+ rc = filemap_write_and_wait_range(inode->i_mapping, fpos, fend);
+out:
+ folio_put(folio);
+ return rc;
+}
+
static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff, loff_t len,
unsigned int remap_flags)
{
struct inode *src_inode = file_inode(src_file);
struct inode *target_inode = file_inode(dst_file);
+ struct cifsInodeInfo *src_cifsi = CIFS_I(src_inode);
+ struct cifsInodeInfo *target_cifsi = CIFS_I(target_inode);
struct cifsFileInfo *smb_file_src = src_file->private_data;
- struct cifsFileInfo *smb_file_target;
- struct cifs_tcon *target_tcon;
+ struct cifsFileInfo *smb_file_target = dst_file->private_data;
+ struct cifs_tcon *target_tcon, *src_tcon;
+ unsigned long long destend, fstart, fend, new_size;
unsigned int xid;
int rc;
- if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+ if (remap_flags & REMAP_FILE_DEDUP)
+ return -EOPNOTSUPP;
+ if (remap_flags & ~REMAP_FILE_ADVISORY)
return -EINVAL;
cifs_dbg(FYI, "clone range\n");
xid = get_xid();
- if (!src_file->private_data || !dst_file->private_data) {
+ if (!smb_file_src || !smb_file_target) {
rc = -EBADF;
cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
goto out;
}
- smb_file_target = dst_file->private_data;
+ src_tcon = tlink_tcon(smb_file_src->tlink);
target_tcon = tlink_tcon(smb_file_target->tlink);
/*
@@ -1233,20 +1305,63 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
if (len == 0)
len = src_inode->i_size - off;
- cifs_dbg(FYI, "about to flush pages\n");
- /* should we flush first and last page first */
- truncate_inode_pages_range(&target_inode->i_data, destoff,
- PAGE_ALIGN(destoff + len)-1);
+ cifs_dbg(FYI, "clone range\n");
+
+ /* Flush the source buffer */
+ rc = filemap_write_and_wait_range(src_inode->i_mapping, off,
+ off + len - 1);
+ if (rc)
+ goto unlock;
+
+ /* The server-side copy will fail if the source crosses the EOF marker.
+ * Advance the EOF marker after the flush above to the end of the range
+ * if it's short of that.
+ */
+ if (src_cifsi->netfs.remote_i_size < off + len) {
+ rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len);
+ if (rc < 0)
+ goto unlock;
+ }
+
+ new_size = destoff + len;
+ destend = destoff + len - 1;
+
+ /* Flush the folios at either end of the destination range to prevent
+ * accidental loss of dirty data outside of the range.
+ */
+ fstart = destoff;
+ fend = destend;
- if (target_tcon->ses->server->ops->duplicate_extents)
+ rc = cifs_flush_folio(target_inode, destoff, &fstart, &fend, true);
+ if (rc)
+ goto unlock;
+ rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false);
+ if (rc)
+ goto unlock;
+
+ /* Discard all the folios that overlap the destination region. */
+ cifs_dbg(FYI, "about to discard pages %llx-%llx\n", fstart, fend);
+ truncate_inode_pages_range(&target_inode->i_data, fstart, fend);
+
+ fscache_invalidate(cifs_inode_cookie(target_inode), NULL,
+ i_size_read(target_inode), 0);
+
+ rc = -EOPNOTSUPP;
+ if (target_tcon->ses->server->ops->duplicate_extents) {
rc = target_tcon->ses->server->ops->duplicate_extents(xid,
smb_file_src, smb_file_target, off, len, destoff);
- else
- rc = -EOPNOTSUPP;
+ if (rc == 0 && new_size > i_size_read(target_inode)) {
+ truncate_setsize(target_inode, new_size);
+ netfs_resize_file(&target_cifsi->netfs, new_size);
+ fscache_resize_cookie(cifs_inode_cookie(target_inode),
+ new_size);
+ }
+ }
/* force revalidate of size and timestamps of target file now
that target is updated on the server */
CIFS_I(target_inode)->time = 0;
+unlock:
/* although unlocking in the reverse order from locking is not
strictly necessary here it is a little cleaner to be consistent */
unlock_two_nondirectories(src_inode, target_inode);
@@ -1262,10 +1377,12 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
{
struct inode *src_inode = file_inode(src_file);
struct inode *target_inode = file_inode(dst_file);
+ struct cifsInodeInfo *src_cifsi = CIFS_I(src_inode);
struct cifsFileInfo *smb_file_src;
struct cifsFileInfo *smb_file_target;
struct cifs_tcon *src_tcon;
struct cifs_tcon *target_tcon;
+ unsigned long long destend, fstart, fend;
ssize_t rc;
cifs_dbg(FYI, "copychunk range\n");
@@ -1305,13 +1422,41 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
if (rc)
goto unlock;
- /* should we flush first and last page first */
- truncate_inode_pages(&target_inode->i_data, 0);
+ /* The server-side copy will fail if the source crosses the EOF marker.
+ * Advance the EOF marker after the flush above to the end of the range
+ * if it's short of that.
+ */
+ if (src_cifsi->server_eof < off + len) {
+ rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len);
+ if (rc < 0)
+ goto unlock;
+ }
+
+ destend = destoff + len - 1;
+
+ /* Flush the folios at either end of the destination range to prevent
+ * accidental loss of dirty data outside of the range.
+ */
+ fstart = destoff;
+ fend = destend;
+
+ rc = cifs_flush_folio(target_inode, destoff, &fstart, &fend, true);
+ if (rc)
+ goto unlock;
+ rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false);
+ if (rc)
+ goto unlock;
+
+ /* Discard all the folios that overlap the destination region. */
+ truncate_inode_pages_range(&target_inode->i_data, fstart, fend);
rc = file_modified(dst_file);
- if (!rc)
+ if (!rc) {
rc = target_tcon->ses->server->ops->copychunk_range(xid,
smb_file_src, smb_file_target, off, len, destoff);
+ if (rc > 0 && destoff + rc > i_size_read(target_inode))
+ truncate_setsize(target_inode, destoff + rc);
+ }
file_accessed(src_file);
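
Editor's note: both copy paths above flush only the folios that straddle the ends of the destination range and then invalidate the folio-aligned window, instead of truncating the whole mapping. A worked example of the bound extension, assuming 4K folios and made-up offsets:

/*
 * Illustrative numbers only:
 *
 *	destoff = 0x1800, len = 0x2000   ->  destend = 0x37ff
 *	folio containing destoff: [0x1000, 0x1fff]  ->  fstart = 0x1000
 *	folio containing destend: [0x3000, 0x3fff]  ->  fend   = 0x3fff
 *
 * The two boundary folios are written back first because they may hold
 * dirty data from outside the copy range; truncate_inode_pages_range()
 * then drops pages for [fstart, fend], i.e. whole folios only.
 */
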
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 41daebd220ff..3adea10aa9da 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -127,7 +127,7 @@ extern int cifs_symlink(struct mnt_idmap *idmap, struct inode *inode,
struct dentry *direntry, const char *symname);
#ifdef CONFIG_CIFS_XATTR
-extern const struct xattr_handler *cifs_xattr_handlers[];
+extern const struct xattr_handler * const cifs_xattr_handlers[];
extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
#else
# define cifs_xattr_handlers NULL
@@ -152,6 +152,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 45
-#define CIFS_VERSION "2.45"
+#define SMB3_PRODUCT_BUILD 46
+#define CIFS_VERSION "2.46"
#endif /* _CIFSFS_H */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 02082621d8e0..5e32c79f03a7 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -191,7 +191,13 @@ struct cifs_open_info_data {
bool reparse_point;
bool symlink;
};
- __u32 reparse_tag;
+ struct {
+ __u32 tag;
+ union {
+ struct reparse_data_buffer *buf;
+ struct reparse_posix_data *posix;
+ };
+ } reparse;
char *symlink_target;
union {
struct smb2_file_all_info fi;
@@ -395,8 +401,7 @@ struct smb_version_operations {
struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
const char *full_path,
- char **target_path,
- struct kvec *rsp_iov);
+ char **target_path);
/* open a file for non-posix mounts */
int (*open)(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock,
void *buf);
@@ -527,7 +532,8 @@ struct smb_version_operations {
struct mid_q_entry **, char **, int *);
enum securityEnum (*select_sectype)(struct TCP_Server_Info *,
enum securityEnum);
- int (*next_header)(char *);
+ int (*next_header)(struct TCP_Server_Info *server, char *buf,
+ unsigned int *noff);
/* ioctl passthrough for query_info */
int (*ioctl_query_info)(const unsigned int xid,
struct cifs_tcon *tcon,
@@ -551,6 +557,9 @@ struct smb_version_operations {
bool (*is_status_io_timeout)(char *buf);
/* Check for STATUS_NETWORK_NAME_DELETED */
bool (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv);
+ int (*parse_reparse_point)(struct cifs_sb_info *cifs_sb,
+ struct kvec *rsp_iov,
+ struct cifs_open_info_data *data);
};
struct smb_version_values {
@@ -650,6 +659,7 @@ struct TCP_Server_Info {
bool noautotune; /* do not autotune send buf sizes */
bool nosharesock;
bool tcp_nodelay;
+ bool terminate;
unsigned int credits; /* send no more requests at once */
unsigned int max_credits; /* can override large 32000 default at mnt */
unsigned int in_flight; /* number of requests on the wire to server */
@@ -969,6 +979,8 @@ struct cifs_server_iface {
struct list_head iface_head;
struct kref refcount;
size_t speed;
+ size_t weight_fulfilled;
+ unsigned int num_channels;
unsigned int rdma_capable : 1;
unsigned int rss_capable : 1;
unsigned int is_active : 1; /* unset if non existent */
@@ -982,7 +994,6 @@ release_iface(struct kref *ref)
struct cifs_server_iface *iface = container_of(ref,
struct cifs_server_iface,
refcount);
- list_del_init(&iface->iface_head);
kfree(iface);
}
@@ -1050,6 +1061,7 @@ struct cifs_ses {
spinlock_t chan_lock;
/* ========= begin: protected by chan_lock ======== */
#define CIFS_MAX_CHANNELS 16
+#define CIFS_INVAL_CHAN_INDEX (-1)
#define CIFS_ALL_CHANNELS_SET(ses) \
((1UL << (ses)->chan_count) - 1)
#define CIFS_ALL_CHANS_GOOD(ses) \
@@ -2143,6 +2155,7 @@ static inline int cifs_get_num_sgs(const struct smb_rqst *rqst,
unsigned int len, skip;
unsigned int nents = 0;
unsigned long addr;
+ size_t data_size;
int i, j;
/*
@@ -2158,17 +2171,21 @@ static inline int cifs_get_num_sgs(const struct smb_rqst *rqst,
* rqst[1+].rq_iov[0+] data to be encrypted/decrypted
*/
for (i = 0; i < num_rqst; i++) {
+ data_size = iov_iter_count(&rqst[i].rq_iter);
+
/* We really don't want a mixture of pinned and unpinned pages
* in the sglist. It's hard to keep track of which is what.
* Instead, we convert to a BVEC-type iterator higher up.
*/
- if (WARN_ON_ONCE(user_backed_iter(&rqst[i].rq_iter)))
+ if (data_size &&
+ WARN_ON_ONCE(user_backed_iter(&rqst[i].rq_iter)))
return -EIO;
/* We also don't want to have any extra refs or pins to clean
* up in the sglist.
*/
- if (WARN_ON_ONCE(iov_iter_extract_will_pin(&rqst[i].rq_iter)))
+ if (data_size &&
+ WARN_ON_ONCE(iov_iter_extract_will_pin(&rqst[i].rq_iter)))
return -EIO;
for (j = 0; j < rqst[i].rq_nvec; j++) {
@@ -2184,7 +2201,8 @@ static inline int cifs_get_num_sgs(const struct smb_rqst *rqst,
}
skip = 0;
}
- nents += iov_iter_npages(&rqst[i].rq_iter, INT_MAX);
+ if (data_size)
+ nents += iov_iter_npages(&rqst[i].rq_iter, INT_MAX);
}
nents += DIV_ROUND_UP(offset_in_page(sig) + SMB2_SIGNATURE_SIZE, PAGE_SIZE);
return nents;
diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h
index e17222fec9d2..c0513fbb8a59 100644
--- a/fs/smb/client/cifspdu.h
+++ b/fs/smb/client/cifspdu.h
@@ -882,11 +882,13 @@ typedef struct smb_com_open_rsp {
__u8 OplockLevel;
__u16 Fid;
__le32 CreateAction;
- __le64 CreationTime;
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le32 FileAttributes;
+ struct_group(common_attributes,
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le32 FileAttributes;
+ );
__le64 AllocationSize;
__le64 EndOfFile;
__le16 FileType;
@@ -1356,7 +1358,7 @@ typedef struct smb_com_transaction_ioctl_rsp {
__le32 DataDisplacement;
__u8 SetupCount; /* 1 */
__le16 ReturnedDataLen;
- __u16 ByteCount;
+ __le16 ByteCount;
} __attribute__((packed)) TRANSACT_IOCTL_RSP;
#define CIFS_ACL_OWNER 1
@@ -1509,7 +1511,7 @@ struct reparse_posix_data {
__le16 ReparseDataLength;
__u16 Reserved;
__le64 InodeType; /* LNK, FIFO, CHR etc. */
- char PathBuffer[];
+ __u8 DataBuffer[];
} __attribute__((packed));
struct cifs_quota_data {
@@ -2264,11 +2266,13 @@ typedef struct {
/* QueryFileInfo/QueryPathinfo (also for SetPath/SetFile) data buffer formats */
/******************************************************************************/
typedef struct { /* data block encoding of response to level 263 QPathInfo */
- __le64 CreationTime;
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le32 Attributes;
+ struct_group(common_attributes,
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le32 Attributes;
+ );
__u32 Pad1;
__le64 AllocationSize;
__le64 EndOfFile; /* size ie offset to first free byte in file */
@@ -2570,7 +2574,7 @@ typedef struct {
struct win_dev {
- unsigned char type[8]; /* IntxCHR or IntxBLK */
+ unsigned char type[8]; /* IntxCHR or IntxBLK or LnxFIFO */
__le64 major;
__le64 minor;
} __attribute__((packed));
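
Editor's note: the struct_group() conversions above wrap a run of on-the-wire members in a named span so callers (see the memcpy change in cifssmb.c further down) can copy "CreationTime through FileAttributes" via sizeof() instead of a hand-counted 36. A minimal sketch using hypothetical example_* names; struct_group() comes from <linux/stddef.h>.

#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>

struct example_rsp {
	__u8 oplock;
	struct_group(common_attributes,
		__le64 creation_time;
		__le64 last_access_time;
		__le64 last_write_time;
		__le64 change_time;
		__le32 attributes;
	);
	__le64 end_of_file;
} __attribute__((packed));

static void example_copy_attrs(struct example_rsp *dst,
			       const struct example_rsp *src)
{
	/*
	 * One bounded copy of the whole span; FORTIFY_SOURCE can check
	 * sizeof(dst->common_attributes) instead of a magic constant.
	 */
	memcpy(&dst->common_attributes, &src->common_attributes,
	       sizeof(dst->common_attributes));
}
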
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index 0c37eefa18a5..46feaa0880bd 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -81,7 +81,7 @@ extern char *cifs_build_path_to_root(struct smb3_fs_context *ctx,
extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
char *cifs_build_devname(char *nodename, const char *prepath);
extern void delete_mid(struct mid_q_entry *mid);
-extern void release_mid(struct mid_q_entry *mid);
+void __release_mid(struct kref *refcount);
extern void cifs_wake_up_task(struct mid_q_entry *mid);
extern int cifs_handle_standard(struct TCP_Server_Info *server,
struct mid_q_entry *mid);
@@ -132,6 +132,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
struct smb_hdr *in_buf,
struct smb_hdr *out_buf,
int *bytes_returned);
+
void
cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
bool all_channels);
@@ -209,7 +210,7 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path,
const struct cifs_fid *fid);
bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr,
- u32 tag);
+ struct cifs_open_info_data *data);
extern int smb311_posix_get_inode_info(struct inode **pinode, const char *search_path,
struct super_block *sb, unsigned int xid);
extern int cifs_get_inode_info_unix(struct inode **pinode,
@@ -457,6 +458,12 @@ extern int CIFSSMBUnixQuerySymLink(const unsigned int xid,
struct cifs_tcon *tcon,
const unsigned char *searchName, char **syminfo,
const struct nls_table *nls_codepage, int remap);
+extern int cifs_query_reparse_point(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
+ u32 *tag, struct kvec *rsp,
+ int *rsp_buftype);
extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
__u16 fid, char **symlinkinfo,
const struct nls_table *nls_codepage);
@@ -610,13 +617,13 @@ void cifs_free_hash(struct shash_desc **sdesc);
struct cifs_chan *
cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server);
-int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses);
+int cifs_try_adding_channels(struct cifs_ses *ses);
bool is_server_using_iface(struct TCP_Server_Info *server,
struct cifs_server_iface *iface);
bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface);
void cifs_ses_mark_for_reconnect(struct cifs_ses *ses);
-unsigned int
+int
cifs_ses_get_chan_index(struct cifs_ses *ses,
struct TCP_Server_Info *server);
void
@@ -640,6 +647,8 @@ cifs_chan_needs_reconnect(struct cifs_ses *ses,
bool
cifs_chan_is_iface_active(struct cifs_ses *ses,
struct TCP_Server_Info *server);
+void
+cifs_disable_secondary_channels(struct cifs_ses *ses);
int
cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server);
int
@@ -656,6 +665,12 @@ void cifs_put_tcp_super(struct super_block *sb);
int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix);
char *extract_hostname(const char *unc);
char *extract_sharename(const char *unc);
+int parse_reparse_point(struct reparse_data_buffer *buf,
+ u32 plen, struct cifs_sb_info *cifs_sb,
+ bool unicode, struct cifs_open_info_data *data);
+int cifs_sfu_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev);
#ifdef CONFIG_CIFS_DFS_UPCALL
static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses,
@@ -740,4 +755,9 @@ static inline bool dfs_src_pathname_equal(const char *s1, const char *s2)
return true;
}
+static inline void release_mid(struct mid_q_entry *mid)
+{
+ kref_put(&mid->refcount, __release_mid);
+}
+
#endif /* _CIFSPROTO_H */
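
Editor's note: release_mid() becomes a static inline over kref_put(&mid->refcount, __release_mid). A generic sketch of that kref pattern follows; struct example_mid and the example_* helpers are hypothetical stand-ins, not the cifs types.

#include <linux/kref.h>
#include <linux/slab.h>

struct example_mid {
	struct kref refcount;
	/* ... request state ... */
};

static void example_release(struct kref *refcount)
{
	struct example_mid *mid = container_of(refcount, struct example_mid,
					       refcount);
	kfree(mid);
}

static inline void example_put(struct example_mid *mid)
{
	/* Frees the object via example_release() when the count hits zero. */
	kref_put(&mid->refcount, example_release);
}
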
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 25503f1a4fd2..9ee348e6d106 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -1244,8 +1244,10 @@ openRetry:
*oplock |= CIFS_CREATE_ACTION;
if (buf) {
- /* copy from CreationTime to Attributes */
- memcpy((char *)buf, (char *)&rsp->CreationTime, 36);
+ /* copy commonly used attributes */
+ memcpy(&buf->common_attributes,
+ &rsp->common_attributes,
+ sizeof(buf->common_attributes));
/* the file_info buf is endian converted by caller */
buf->AllocationSize = rsp->AllocationSize;
buf->EndOfFile = rsp->EndOfFile;
@@ -2690,136 +2692,97 @@ querySymLinkRetry:
return rc;
}
-/*
- * Recent Windows versions now create symlinks more frequently
- * and they use the "reparse point" mechanism below. We can of course
- * do symlinks nicely to Samba and other servers which support the
- * CIFS Unix Extensions and we can also do SFU symlinks and "client only"
- * "MF" symlinks optionally, but for recent Windows we really need to
- * reenable the code below and fix the cifs_symlink callers to handle this.
- * In the interim this code has been moved to its own config option so
- * it is not compiled in by default until callers fixed up and more tested.
- */
-int
-CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
- __u16 fid, char **symlinkinfo,
- const struct nls_table *nls_codepage)
+int cifs_query_reparse_point(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
+ u32 *tag, struct kvec *rsp,
+ int *rsp_buftype)
{
- int rc = 0;
- int bytes_returned;
- struct smb_com_transaction_ioctl_req *pSMB;
- struct smb_com_transaction_ioctl_rsp *pSMBr;
- bool is_unicode;
- unsigned int sub_len;
- char *sub_start;
- struct reparse_symlink_data *reparse_buf;
- struct reparse_posix_data *posix_buf;
+ struct cifs_open_parms oparms;
+ TRANSACT_IOCTL_REQ *io_req = NULL;
+ TRANSACT_IOCTL_RSP *io_rsp = NULL;
+ struct cifs_fid fid;
__u32 data_offset, data_count;
- char *end_of_smb;
+ __u8 *start, *end;
+ int io_rsp_len;
+ int oplock = 0;
+ int rc;
- cifs_dbg(FYI, "In Windows reparse style QueryLink for fid %u\n", fid);
- rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
- (void **) &pSMBr);
+ cifs_tcon_dbg(FYI, "%s: path=%s\n", __func__, full_path);
+
+ if (cap_unix(tcon->ses))
+ return -EOPNOTSUPP;
+
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .cifs_sb = cifs_sb,
+ .desired_access = FILE_READ_ATTRIBUTES,
+ .create_options = cifs_create_options(cifs_sb,
+ OPEN_REPARSE_POINT),
+ .disposition = FILE_OPEN,
+ .path = full_path,
+ .fid = &fid,
+ };
+
+ rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (rc)
return rc;
- pSMB->TotalParameterCount = 0 ;
- pSMB->TotalDataCount = 0;
- pSMB->MaxParameterCount = cpu_to_le32(2);
- /* BB find exact data count max from sess structure BB */
- pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
- pSMB->MaxSetupCount = 4;
- pSMB->Reserved = 0;
- pSMB->ParameterOffset = 0;
- pSMB->DataCount = 0;
- pSMB->DataOffset = 0;
- pSMB->SetupCount = 4;
- pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL);
- pSMB->ParameterCount = pSMB->TotalParameterCount;
- pSMB->FunctionCode = cpu_to_le32(FSCTL_GET_REPARSE_POINT);
- pSMB->IsFsctl = 1; /* FSCTL */
- pSMB->IsRootFlag = 0;
- pSMB->Fid = fid; /* file handle always le */
- pSMB->ByteCount = 0;
+ rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon,
+ (void **)&io_req, (void **)&io_rsp);
+ if (rc)
+ goto error;
- rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
- (struct smb_hdr *) pSMBr, &bytes_returned, 0);
- if (rc) {
- cifs_dbg(FYI, "Send error in QueryReparseLinkInfo = %d\n", rc);
- goto qreparse_out;
- }
+ io_req->TotalParameterCount = 0;
+ io_req->TotalDataCount = 0;
+ io_req->MaxParameterCount = cpu_to_le32(2);
+ /* BB find exact data count max from sess structure BB */
+ io_req->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
+ io_req->MaxSetupCount = 4;
+ io_req->Reserved = 0;
+ io_req->ParameterOffset = 0;
+ io_req->DataCount = 0;
+ io_req->DataOffset = 0;
+ io_req->SetupCount = 4;
+ io_req->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL);
+ io_req->ParameterCount = io_req->TotalParameterCount;
+ io_req->FunctionCode = cpu_to_le32(FSCTL_GET_REPARSE_POINT);
+ io_req->IsFsctl = 1;
+ io_req->IsRootFlag = 0;
+ io_req->Fid = fid.netfid;
+ io_req->ByteCount = 0;
+
+ rc = SendReceive(xid, tcon->ses, (struct smb_hdr *)io_req,
+ (struct smb_hdr *)io_rsp, &io_rsp_len, 0);
+ if (rc)
+ goto error;
- data_offset = le32_to_cpu(pSMBr->DataOffset);
- data_count = le32_to_cpu(pSMBr->DataCount);
- if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) {
- /* BB also check enough total bytes returned */
- rc = -EIO; /* bad smb */
- goto qreparse_out;
- }
- if (!data_count || (data_count > 2048)) {
+ data_offset = le32_to_cpu(io_rsp->DataOffset);
+ data_count = le32_to_cpu(io_rsp->DataCount);
+ if (get_bcc(&io_rsp->hdr) < 2 || data_offset > 512 ||
+ !data_count || data_count > 2048) {
rc = -EIO;
- cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n");
- goto qreparse_out;
- }
- end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount;
- reparse_buf = (struct reparse_symlink_data *)
- ((char *)&pSMBr->hdr.Protocol + data_offset);
- if ((char *)reparse_buf >= end_of_smb) {
- rc = -EIO;
- goto qreparse_out;
- }
- if (reparse_buf->ReparseTag == cpu_to_le32(IO_REPARSE_TAG_NFS)) {
- cifs_dbg(FYI, "NFS style reparse tag\n");
- posix_buf = (struct reparse_posix_data *)reparse_buf;
-
- if (posix_buf->InodeType != cpu_to_le64(NFS_SPECFILE_LNK)) {
- cifs_dbg(FYI, "unsupported file type 0x%llx\n",
- le64_to_cpu(posix_buf->InodeType));
- rc = -EOPNOTSUPP;
- goto qreparse_out;
- }
- is_unicode = true;
- sub_len = le16_to_cpu(reparse_buf->ReparseDataLength);
- if (posix_buf->PathBuffer + sub_len > end_of_smb) {
- cifs_dbg(FYI, "reparse buf beyond SMB\n");
- rc = -EIO;
- goto qreparse_out;
- }
- *symlinkinfo = cifs_strndup_from_utf16(posix_buf->PathBuffer,
- sub_len, is_unicode, nls_codepage);
- goto qreparse_out;
- } else if (reparse_buf->ReparseTag !=
- cpu_to_le32(IO_REPARSE_TAG_SYMLINK)) {
- rc = -EOPNOTSUPP;
- goto qreparse_out;
+ goto error;
}
- /* Reparse tag is NTFS symlink */
- sub_start = le16_to_cpu(reparse_buf->SubstituteNameOffset) +
- reparse_buf->PathBuffer;
- sub_len = le16_to_cpu(reparse_buf->SubstituteNameLength);
- if (sub_start + sub_len > end_of_smb) {
- cifs_dbg(FYI, "reparse buf beyond SMB\n");
+ end = 2 + get_bcc(&io_rsp->hdr) + (__u8 *)&io_rsp->ByteCount;
+ start = (__u8 *)&io_rsp->hdr.Protocol + data_offset;
+ if (start >= end) {
rc = -EIO;
- goto qreparse_out;
+ goto error;
}
- if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
- is_unicode = true;
- else
- is_unicode = false;
-
- /* BB FIXME investigate remapping reserved chars here */
- *symlinkinfo = cifs_strndup_from_utf16(sub_start, sub_len, is_unicode,
- nls_codepage);
- if (!*symlinkinfo)
- rc = -ENOMEM;
-qreparse_out:
- cifs_buf_release(pSMB);
- /*
- * Note: On -EAGAIN error only caller can retry on handle based calls
- * since file handle passed in no longer valid.
- */
+ *tag = le32_to_cpu(((struct reparse_data_buffer *)start)->ReparseTag);
+ rsp->iov_base = io_rsp;
+ rsp->iov_len = io_rsp_len;
+ *rsp_buftype = CIFS_LARGE_BUFFER;
+ CIFSSMBClose(xid, tcon, fid.netfid);
+ return 0;
+
+error:
+ cifs_buf_release(io_req);
+ CIFSSMBClose(xid, tcon, fid.netfid);
return rc;
}
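
Editor's note: with the refactor above, SMB1 callers fetch only the reparse tag plus the raw NT_TRANSACT_IOCTL response and leave interpretation to the parse_reparse_point() hook. A hypothetical caller sketch against the prototype added to cifsproto.h earlier in this diff; it assumes the cifs-internal headers, and buffer handling is simplified.

/* Hypothetical caller, illustrative only. */
static int example_get_reparse_tag(const unsigned int xid,
				   struct cifs_tcon *tcon,
				   struct cifs_sb_info *cifs_sb,
				   const char *full_path, u32 *tag)
{
	struct kvec rsp_iov = {};
	int rsp_buftype = CIFS_NO_BUFFER;
	int rc;

	rc = cifs_query_reparse_point(xid, tcon, cifs_sb, full_path,
				      tag, &rsp_iov, &rsp_buftype);
	if (!rc) {
		/* rsp_iov holds the raw TRANSACT_IOCTL_RSP for later parsing. */
		free_rsp_buf(rsp_buftype, rsp_iov.iov_base);
	}
	return rc;
}
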
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 7b923e36501b..dc9b95ca71e6 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -119,6 +119,7 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
static void smb2_query_server_interfaces(struct work_struct *work)
{
int rc;
+ int xid;
struct cifs_tcon *tcon = container_of(work,
struct cifs_tcon,
query_interfaces.work);
@@ -126,8 +127,14 @@ static void smb2_query_server_interfaces(struct work_struct *work)
/*
* query server network interfaces, in case they change
*/
- rc = SMB3_request_interfaces(0, tcon, false);
+ xid = get_xid();
+ rc = SMB3_request_interfaces(xid, tcon, false);
+ free_xid(xid);
+
if (rc) {
+ if (rc == -EOPNOTSUPP)
+ return;
+
cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
__func__, rc);
}
@@ -156,20 +163,25 @@ cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
/* If server is a channel, select the primary channel */
pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
- spin_lock(&pserver->srv_lock);
+ /* if we need to signal just this channel */
if (!all_channels) {
- pserver->tcpStatus = CifsNeedReconnect;
- spin_unlock(&pserver->srv_lock);
+ spin_lock(&server->srv_lock);
+ if (server->tcpStatus != CifsExiting)
+ server->tcpStatus = CifsNeedReconnect;
+ spin_unlock(&server->srv_lock);
return;
}
- spin_unlock(&pserver->srv_lock);
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
spin_lock(&ses->chan_lock);
for (i = 0; i < ses->chan_count; i++) {
+ if (!ses->chans[i].server)
+ continue;
+
spin_lock(&ses->chans[i].server->srv_lock);
- ses->chans[i].server->tcpStatus = CifsNeedReconnect;
+ if (ses->chans[i].server->tcpStatus != CifsExiting)
+ ses->chans[i].server->tcpStatus = CifsNeedReconnect;
spin_unlock(&ses->chans[i].server->srv_lock);
}
spin_unlock(&ses->chan_lock);
@@ -204,14 +216,29 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
/* If server is a channel, select the primary channel */
pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
+ /*
+ * if the server has been marked for termination, there is a
+ * chance that the remaining channels all need reconnect. To be
+ * on the safer side, mark the session and trees for reconnect
+ * for this scenario. This might cause a few redundant session
+ * setup and tree connect requests, but it is better than not doing
+ * a tree connect when needed, and all following requests failing
+ */
+ if (server->terminate) {
+ mark_smb_session = true;
+ server = pserver;
+ }
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) {
/* check if iface is still active */
- if (!cifs_chan_is_iface_active(ses, server))
+ spin_lock(&ses->chan_lock);
+ if (!cifs_chan_is_iface_active(ses, server)) {
+ spin_unlock(&ses->chan_lock);
cifs_chan_update_iface(ses, server);
+ spin_lock(&ses->chan_lock);
+ }
- spin_lock(&ses->chan_lock);
if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server)) {
spin_unlock(&ses->chan_lock);
continue;
@@ -241,6 +268,8 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
spin_lock(&tcon->tc_lock);
tcon->status = TID_NEED_RECON;
spin_unlock(&tcon->tc_lock);
+
+ cancel_delayed_work(&tcon->query_interfaces);
}
if (ses->tcon_ipc) {
ses->tcon_ipc->need_reconnect = true;
@@ -1179,7 +1208,12 @@ next_pdu:
server->total_read += length;
if (server->ops->next_header) {
- next_offset = server->ops->next_header(buf);
+ if (server->ops->next_header(server, buf, &next_offset)) {
+ cifs_dbg(VFS, "%s: malformed response (next_offset=%u)\n",
+ __func__, next_offset);
+ cifs_reconnect(server, true);
+ continue;
+ }
if (next_offset)
server->pdu_size = next_offset;
}
@@ -1586,10 +1620,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
list_del_init(&server->tcp_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
- /* For secondary channels, we pick up ref-count on the primary server */
- if (SERVER_IS_CHAN(server))
- cifs_put_tcp_session(server->primary_server, from_reconnect);
-
cancel_delayed_work_sync(&server->echo);
if (from_reconnect)
@@ -1603,6 +1633,10 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
else
cancel_delayed_work_sync(&server->reconnect);
+ /* For secondary channels, we pick up ref-count on the primary server */
+ if (SERVER_IS_CHAN(server))
+ cifs_put_tcp_session(server->primary_server, from_reconnect);
+
spin_lock(&server->srv_lock);
server->tcpStatus = CifsExiting;
spin_unlock(&server->srv_lock);
@@ -1969,9 +2003,10 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
void __cifs_put_smb_ses(struct cifs_ses *ses)
{
- unsigned int rc, xid;
- unsigned int chan_count;
struct TCP_Server_Info *server = ses->server;
+ unsigned int xid;
+ size_t i;
+ int rc;
spin_lock(&ses->ses_lock);
if (ses->ses_status == SES_EXITING) {
@@ -2017,20 +2052,20 @@ void __cifs_put_smb_ses(struct cifs_ses *ses)
list_del_init(&ses->smb_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
- chan_count = ses->chan_count;
-
/* close any extra channels */
- if (chan_count > 1) {
- int i;
-
- for (i = 1; i < chan_count; i++) {
- if (ses->chans[i].iface) {
- kref_put(&ses->chans[i].iface->refcount, release_iface);
- ses->chans[i].iface = NULL;
- }
- cifs_put_tcp_session(ses->chans[i].server, 0);
- ses->chans[i].server = NULL;
+ for (i = 1; i < ses->chan_count; i++) {
+ if (ses->chans[i].iface) {
+ kref_put(&ses->chans[i].iface->refcount, release_iface);
+ ses->chans[i].iface = NULL;
}
+ cifs_put_tcp_session(ses->chans[i].server, 0);
+ ses->chans[i].server = NULL;
+ }
+
+ /* we now account for primary channel in iface->refcount */
+ if (ses->chans[0].iface) {
+ kref_put(&ses->chans[0].iface->refcount, release_iface);
+ ses->chans[0].server = NULL;
}
sesInfoFree(ses);
@@ -3560,7 +3595,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
ctx->prepath = NULL;
out:
- cifs_try_adding_channels(cifs_sb, mnt_ctx.ses);
+ cifs_try_adding_channels(mnt_ctx.ses);
rc = mount_setup_tlink(cifs_sb, mnt_ctx.ses, mnt_ctx.tcon);
if (rc)
goto error;
@@ -3849,8 +3884,12 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses);
spin_unlock(&ses->chan_lock);
- if (!is_binding)
+ if (!is_binding) {
ses->ses_status = SES_IN_SETUP;
+
+ /* force iface_list refresh */
+ ses->iface_last_update = 0;
+ }
spin_unlock(&ses->ses_lock);
/* update ses ip_addr only for primary chan */
diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c
index 81b84151450d..a8a1d386da65 100644
--- a/fs/smb/client/dfs.c
+++ b/fs/smb/client/dfs.c
@@ -263,15 +263,23 @@ out:
return rc;
}
-/* Resolve UNC hostname in @ctx->source and set ip addr in @ctx->dstaddr */
+/*
+ * If @ctx->dfs_automount, then update @ctx->dstaddr earlier with the DFS root
+ * server from where we'll start following any referrals. Otherwise rely on the
+ * value provided by mount(2) as the user might not have dns_resolver key set up
+ * and would therefore fail to upcall to resolve the UNC hostname under @ctx->source.
+ */
static int update_fs_context_dstaddr(struct smb3_fs_context *ctx)
{
struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr;
- int rc;
+ int rc = 0;
- rc = dns_resolve_server_name_to_ip(ctx->source, addr, NULL);
- if (!rc)
- cifs_set_port(addr, ctx->port);
+ if (!ctx->nodfs && ctx->dfs_automount) {
+ rc = dns_resolve_server_name_to_ip(ctx->source, addr, NULL);
+ if (!rc)
+ cifs_set_port(addr, ctx->port);
+ ctx->dfs_automount = false;
+ }
return rc;
}
diff --git a/fs/smb/client/export.c b/fs/smb/client/export.c
index 37c28415df1e..d606e8cbcb7d 100644
--- a/fs/smb/client/export.c
+++ b/fs/smb/client/export.c
@@ -41,13 +41,12 @@ static struct dentry *cifs_get_parent(struct dentry *dentry)
}
const struct export_operations cifs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.get_parent = cifs_get_parent,
-/* Following five export operations are unneeded so far and can default:
- .get_dentry =
- .get_name =
- .find_exported_dentry =
- .decode_fh =
- .encode_fs = */
+/*
+ * Following export operations are mandatory for NFS export support:
+ * .fh_to_dentry =
+ */
};
#endif /* CONFIG_CIFS_NFSD_EXPORT */
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 2108b3b40ce9..32a8525415d9 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -1085,7 +1085,8 @@ int cifs_close(struct inode *inode, struct file *file)
!test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) &&
dclose) {
if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_current(inode));
}
spin_lock(&cinode->deferred_lock);
cifs_add_deferred_close(cfile, dclose);
@@ -2596,7 +2597,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
write_data, to - from, &offset);
cifsFileInfo_put(open_file);
/* Does mm or vfs already set times? */
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
if ((bytes_written > 0) && (offset))
rc = 0;
else if (bytes_written < 0)
@@ -4647,11 +4648,13 @@ static void cifs_readahead(struct readahead_control *ractl)
static int cifs_readpage_worker(struct file *file, struct page *page,
loff_t *poffset)
{
+ struct inode *inode = file_inode(file);
+ struct timespec64 atime, mtime;
char *read_data;
int rc;
/* Is the page cached? */
- rc = cifs_readpage_from_fscache(file_inode(file), page);
+ rc = cifs_readpage_from_fscache(inode, page);
if (rc == 0)
goto read_complete;
@@ -4666,11 +4669,10 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
cifs_dbg(FYI, "Bytes read %d\n", rc);
/* we do not want atime to be less than mtime, it broke some apps */
- file_inode(file)->i_atime = current_time(file_inode(file));
- if (timespec64_compare(&(file_inode(file)->i_atime), &(file_inode(file)->i_mtime)))
- file_inode(file)->i_atime = file_inode(file)->i_mtime;
- else
- file_inode(file)->i_atime = current_time(file_inode(file));
+ atime = inode_set_atime_to_ts(inode, current_time(inode));
+ mtime = inode_get_mtime(inode);
+ if (timespec64_compare(&atime, &mtime) < 0)
+ inode_set_atime_to_ts(inode, inode_get_mtime(inode));
if (PAGE_SIZE > rc)
memset(read_data + rc, 0, PAGE_SIZE - rc);
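
Editor's note: the readpage hunk above keeps atime from falling behind mtime by comparing the freshly stored atime against mtime through the new accessors. A hedged sketch of that clamp as a standalone helper; example_clamp_atime() is an invented name.

#include <linux/fs.h>
#include <linux/time64.h>

static void example_clamp_atime(struct inode *inode)
{
	struct timespec64 atime, mtime;

	/* inode_set_atime_to_ts() returns the timestamp it just stored. */
	atime = inode_set_atime_to_ts(inode, current_time(inode));
	mtime = inode_get_mtime(inode);

	/* timespec64_compare() < 0 means atime is older than mtime. */
	if (timespec64_compare(&atime, &mtime) < 0)
		inode_set_atime_to_ts(inode, mtime);
}
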
diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h
index 9d8d34af0211..cf46916286d0 100644
--- a/fs/smb/client/fs_context.h
+++ b/fs/smb/client/fs_context.h
@@ -268,6 +268,7 @@ struct smb3_fs_context {
bool witness:1; /* use witness protocol */
char *leaf_fullpath;
struct cifs_ses *dfs_root_ses;
+ bool dfs_automount:1; /* set for dfs automount only */
};
extern const struct fs_parameter_spec smb3_fs_parameters[];
diff --git a/fs/smb/client/fscache.h b/fs/smb/client/fscache.h
index 84f3b09367d2..a3d73720914f 100644
--- a/fs/smb/client/fscache.h
+++ b/fs/smb/client/fscache.h
@@ -49,12 +49,12 @@ static inline
void cifs_fscache_fill_coherency(struct inode *inode,
struct cifs_fscache_inode_coherency_data *cd)
{
- struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 mtime = inode_get_mtime(inode);
memset(cd, 0, sizeof(*cd));
- cd->last_write_time_sec = cpu_to_le64(cifsi->netfs.inode.i_mtime.tv_sec);
- cd->last_write_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_mtime.tv_nsec);
+ cd->last_write_time_sec = cpu_to_le64(mtime.tv_sec);
+ cd->last_write_time_nsec = cpu_to_le32(mtime.tv_nsec);
cd->last_change_time_sec = cpu_to_le64(ctime.tv_sec);
cd->last_change_time_nsec = cpu_to_le32(ctime.tv_nsec);
}
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index d7c302442c1e..09c5c0f5c96e 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -82,6 +82,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
{
struct cifs_fscache_inode_coherency_data cd;
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+ struct timespec64 mtime;
cifs_dbg(FYI, "%s: revalidating inode %llu\n",
__func__, cifs_i->uniqueid);
@@ -101,7 +102,8 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
/* revalidate if mtime or size have changed */
fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode);
- if (timespec64_equal(&inode->i_mtime, &fattr->cf_mtime) &&
+ mtime = inode_get_mtime(inode);
+ if (timespec64_equal(&mtime, &fattr->cf_mtime) &&
cifs_i->server_eof == fattr->cf_eof) {
cifs_dbg(FYI, "%s: inode %llu is unchanged\n",
__func__, cifs_i->uniqueid);
@@ -164,10 +166,10 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
fattr->cf_ctime = timestamp_truncate(fattr->cf_ctime, inode);
/* we do not want atime to be less than mtime, it broke some apps */
if (timespec64_compare(&fattr->cf_atime, &fattr->cf_mtime) < 0)
- inode->i_atime = fattr->cf_mtime;
+ inode_set_atime_to_ts(inode, fattr->cf_mtime);
else
- inode->i_atime = fattr->cf_atime;
- inode->i_mtime = fattr->cf_mtime;
+ inode_set_atime_to_ts(inode, fattr->cf_atime);
+ inode_set_mtime_to_ts(inode, fattr->cf_mtime);
inode_set_ctime_to_ts(inode, fattr->cf_ctime);
inode->i_rdev = fattr->cf_rdev;
cifs_nlink_fattr_to_inode(inode, fattr);
@@ -457,8 +459,7 @@ static int cifs_get_unix_fattr(const unsigned char *full_path,
return -EOPNOTSUPP;
rc = server->ops->query_symlink(xid, tcon,
cifs_sb, full_path,
- &fattr->cf_symlink_target,
- NULL);
+ &fattr->cf_symlink_target);
cifs_dbg(FYI, "%s: query_symlink: %d\n", __func__, rc);
}
return rc;
@@ -592,6 +593,10 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
cifs_dbg(FYI, "Symlink\n");
fattr->cf_mode |= S_IFLNK;
fattr->cf_dtype = DT_LNK;
+ } else if (memcmp("LnxFIFO", pbuf, 8) == 0) {
+ cifs_dbg(FYI, "FIFO\n");
+ fattr->cf_mode |= S_IFIFO;
+ fattr->cf_dtype = DT_FIFO;
} else {
fattr->cf_mode |= S_IFREG; /* file? */
fattr->cf_dtype = DT_REG;
@@ -716,10 +721,51 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr,
fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink);
}
+static inline dev_t nfs_mkdev(struct reparse_posix_data *buf)
+{
+ u64 v = le64_to_cpu(*(__le64 *)buf->DataBuffer);
+
+ return MKDEV(v >> 32, v & 0xffffffff);
+}
+
bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr,
- u32 tag)
+ struct cifs_open_info_data *data)
{
+ struct reparse_posix_data *buf = data->reparse.posix;
+ u32 tag = data->reparse.tag;
+
+ if (tag == IO_REPARSE_TAG_NFS && buf) {
+ switch (le64_to_cpu(buf->InodeType)) {
+ case NFS_SPECFILE_CHR:
+ fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode;
+ fattr->cf_dtype = DT_CHR;
+ fattr->cf_rdev = nfs_mkdev(buf);
+ break;
+ case NFS_SPECFILE_BLK:
+ fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode;
+ fattr->cf_dtype = DT_BLK;
+ fattr->cf_rdev = nfs_mkdev(buf);
+ break;
+ case NFS_SPECFILE_FIFO:
+ fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode;
+ fattr->cf_dtype = DT_FIFO;
+ break;
+ case NFS_SPECFILE_SOCK:
+ fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode;
+ fattr->cf_dtype = DT_SOCK;
+ break;
+ case NFS_SPECFILE_LNK:
+ fattr->cf_mode = S_IFLNK | cifs_sb->ctx->file_mode;
+ fattr->cf_dtype = DT_LNK;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+ return true;
+ }
+
switch (tag) {
case IO_REPARSE_TAG_LX_SYMLINK:
fattr->cf_mode |= S_IFLNK | cifs_sb->ctx->file_mode;
@@ -744,7 +790,7 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
case 0: /* SMB1 symlink */
case IO_REPARSE_TAG_SYMLINK:
case IO_REPARSE_TAG_NFS:
- fattr->cf_mode = S_IFLNK;
+ fattr->cf_mode = S_IFLNK | cifs_sb->ctx->file_mode;
fattr->cf_dtype = DT_LNK;
break;
default:
@@ -785,7 +831,7 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr,
fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
if (cifs_open_data_reparse(data) &&
- cifs_reparse_point_to_fattr(cifs_sb, fattr, data->reparse_tag))
+ cifs_reparse_point_to_fattr(cifs_sb, fattr, data))
goto out_reparse;
if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
@@ -819,6 +865,8 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr,
out_reparse:
if (S_ISLNK(fattr->cf_mode)) {
+ if (likely(data->symlink_target))
+ fattr->cf_eof = strnlen(data->symlink_target, PATH_MAX);
fattr->cf_symlink_target = data->symlink_target;
data->symlink_target = NULL;
}
@@ -850,7 +898,7 @@ cifs_get_file_info(struct file *filp)
data.adjust_tz = false;
if (data.symlink_target) {
data.symlink = true;
- data.reparse_tag = IO_REPARSE_TAG_SYMLINK;
+ data.reparse.tag = IO_REPARSE_TAG_SYMLINK;
}
cifs_open_info_to_fattr(&fattr, &data, inode->i_sb);
break;
@@ -1019,7 +1067,7 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data,
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct kvec rsp_iov, *iov = NULL;
int rsp_buftype = CIFS_NO_BUFFER;
- u32 tag = data->reparse_tag;
+ u32 tag = data->reparse.tag;
int rc = 0;
if (!tag && server->ops->query_reparse_point) {
@@ -1029,22 +1077,28 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data,
if (!rc)
iov = &rsp_iov;
}
- switch ((data->reparse_tag = tag)) {
+
+ rc = -EOPNOTSUPP;
+ switch ((data->reparse.tag = tag)) {
case 0: /* SMB1 symlink */
- iov = NULL;
- fallthrough;
- case IO_REPARSE_TAG_NFS:
- case IO_REPARSE_TAG_SYMLINK:
- if (!data->symlink_target && server->ops->query_symlink) {
+ if (server->ops->query_symlink) {
rc = server->ops->query_symlink(xid, tcon,
cifs_sb, full_path,
- &data->symlink_target,
- iov);
+ &data->symlink_target);
}
break;
case IO_REPARSE_TAG_MOUNT_POINT:
cifs_create_junction_fattr(fattr, sb);
+ rc = 0;
goto out;
+ default:
+ if (data->symlink_target) {
+ rc = 0;
+ } else if (server->ops->parse_reparse_point) {
+ rc = server->ops->parse_reparse_point(cifs_sb,
+ iov, data);
+ }
+ break;
}
cifs_open_info_to_fattr(fattr, data, sb);
@@ -1816,7 +1870,7 @@ out_reval:
when needed */
inode_set_ctime_current(inode);
}
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
cifs_inode = CIFS_I(dir);
CIFS_I(dir)->time = 0; /* force revalidate of dir as well */
unlink_out:
@@ -2131,7 +2185,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
cifsInode->time = 0;
inode_set_ctime_current(d_inode(direntry));
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
rmdir_exit:
free_dentry_path(page);
@@ -2337,9 +2391,6 @@ unlink_target:
/* force revalidate to go get info when needed */
CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0;
- source_dir->i_mtime = target_dir->i_mtime = inode_set_ctime_to_ts(source_dir,
- inode_set_ctime_current(target_dir));
-
cifs_rename_exit:
kfree(info_buf_source);
free_dentry_path(page2);
diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c
index f7160003e0ed..e2f92c21fff5 100644
--- a/fs/smb/client/ioctl.c
+++ b/fs/smb/client/ioctl.c
@@ -117,6 +117,20 @@ out_drop_write:
return rc;
}
+static long smb_mnt_get_tcon_info(struct cifs_tcon *tcon, void __user *arg)
+{
+ int rc = 0;
+ struct smb_mnt_tcon_info tcon_inf;
+
+ tcon_inf.tid = tcon->tid;
+ tcon_inf.session_id = tcon->ses->Suid;
+
+ if (copy_to_user(arg, &tcon_inf, sizeof(struct smb_mnt_tcon_info)))
+ rc = -EFAULT;
+
+ return rc;
+}
+
static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon,
void __user *arg)
{
@@ -129,6 +143,7 @@ static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon,
fsinf->version = 1;
fsinf->protocol_id = tcon->ses->server->vals->protocol_id;
+ fsinf->tcon_flags = tcon->Flags;
fsinf->device_characteristics =
le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics);
fsinf->device_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
@@ -414,6 +429,17 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
tcon = tlink_tcon(pSMBFile->tlink);
rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg);
break;
+ case CIFS_IOC_GET_TCON_INFO:
+ cifs_sb = CIFS_SB(inode->i_sb);
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink)) {
+ rc = PTR_ERR(tlink);
+ break;
+ }
+ tcon = tlink_tcon(tlink);
+ rc = smb_mnt_get_tcon_info(tcon, (void __user *)arg);
+ cifs_put_tlink(tlink);
+ break;
case CIFS_ENUMERATE_SNAPSHOTS:
if (pSMBFile == NULL)
break;
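
Editor's note: the CIFS_IOC_GET_TCON_INFO ioctl defined in cifs_ioctl.h earlier in this diff can be exercised from userspace on any file under a cifs mount. A hypothetical userspace sketch follows; the include path for cifs_ioctl.h is an assumption, and error handling is minimal.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "cifs_ioctl.h"	/* install location is an assumption */

int main(int argc, char **argv)
{
	struct smb_mnt_tcon_info info;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, CIFS_IOC_GET_TCON_INFO, &info) < 0) {
		perror("CIFS_IOC_GET_TCON_INFO");
		return 1;
	}
	printf("tid=0x%x session_id=0x%llx\n", info.tid,
	       (unsigned long long)info.session_id);
	close(fd);
	return 0;
}
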
diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c
index c66be4904e1f..a1da50e66fbb 100644
--- a/fs/smb/client/link.c
+++ b/fs/smb/client/link.c
@@ -42,23 +42,11 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
rc = cifs_alloc_hash("md5", &md5);
if (rc)
- goto symlink_hash_err;
+ return rc;
- rc = crypto_shash_init(md5);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not init md5 shash\n", __func__);
- goto symlink_hash_err;
- }
- rc = crypto_shash_update(md5, link_str, link_len);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__);
- goto symlink_hash_err;
- }
- rc = crypto_shash_final(md5, md5_hash);
+ rc = crypto_shash_digest(md5, link_str, link_len, md5_hash);
if (rc)
cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
-
-symlink_hash_err:
cifs_free_hash(&md5);
return rc;
}
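
Editor's note: the link.c hunk above replaces the init/update/final sequence with crypto_shash_digest(), which hashes a contiguous buffer in one call. A hedged sketch of the resulting shape; it assumes the cifs_alloc_hash()/cifs_free_hash() helpers already used in this file, and trims logging.

static int example_md5(const u8 *data, unsigned int len, u8 *out)
{
	struct shash_desc *md5 = NULL;
	int rc;

	rc = cifs_alloc_hash("md5", &md5);
	if (rc)
		return rc;

	/* One-shot equivalent of crypto_shash_init() + _update() + _final(). */
	rc = crypto_shash_digest(md5, data, len, out);

	cifs_free_hash(&md5);
	return rc;
}
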
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index 35b176457bbe..c2137ea3c253 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -363,6 +363,10 @@ checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server)
cifs_dbg(VFS, "Length less than smb header size\n");
}
return -EIO;
+ } else if (total_read < sizeof(*smb) + 2 * smb->WordCount) {
+ cifs_dbg(VFS, "%s: can't read BCC due to invalid WordCount(%u)\n",
+ __func__, smb->WordCount);
+ return -EIO;
}
/* otherwise, there is enough to get to the BCC */
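
Editor's note: the new checkSMB() bound guards the later byte-count (BCC) access, which sits 2 * WordCount bytes past the fixed SMB1 header. A short worked example, with the caveat that the exact header size depends on struct smb_hdr:

/*
 * Illustrative: a frame with WordCount = 10 needs at least
 * sizeof(*smb) + 2 * 10 bytes just to reach the BCC field. A forged
 * WordCount larger than what was actually received would otherwise push
 * the BCC read past the end of the buffer, hence the -EIO above.
 */
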
diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c
index c8f5ed8a69f1..a6968573b775 100644
--- a/fs/smb/client/namespace.c
+++ b/fs/smb/client/namespace.c
@@ -117,6 +117,18 @@ cifs_build_devname(char *nodename, const char *prepath)
return dev;
}
+static bool is_dfs_mount(struct dentry *dentry)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+ bool ret;
+
+ spin_lock(&tcon->tc_lock);
+ ret = !!tcon->origin_fullpath;
+ spin_unlock(&tcon->tc_lock);
+ return ret;
+}
+
/* Return full path out of a dentry set for automount */
static char *automount_fullpath(struct dentry *dentry, void *page)
{
@@ -212,8 +224,9 @@ static struct vfsmount *cifs_do_automount(struct path *path)
ctx->source = NULL;
goto out;
}
- cifs_dbg(FYI, "%s: ctx: source=%s UNC=%s prepath=%s\n",
- __func__, ctx->source, ctx->UNC, ctx->prepath);
+ ctx->dfs_automount = is_dfs_mount(mntpt);
+ cifs_dbg(FYI, "%s: ctx: source=%s UNC=%s prepath=%s dfs_automount=%d\n",
+ __func__, ctx->source, ctx->UNC, ctx->prepath, ctx->dfs_automount);
mnt = fc_mount(fc);
out:
diff --git a/fs/smb/client/ntlmssp.h b/fs/smb/client/ntlmssp.h
index 2c5dde2ece58..875de43b72de 100644
--- a/fs/smb/client/ntlmssp.h
+++ b/fs/smb/client/ntlmssp.h
@@ -133,8 +133,8 @@ typedef struct _AUTHENTICATE_MESSAGE {
SECURITY_BUFFER WorkstationName;
SECURITY_BUFFER SessionKey;
__le32 NegotiateFlags;
- /* SECURITY_BUFFER for version info not present since we
- do not set the version is present flag */
+ struct ntlmssp_version Version;
+ /* SECURITY_BUFFER */
char UserString[];
} __attribute__((packed)) AUTHENTICATE_MESSAGE, *PAUTHENTICATE_MESSAGE;
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index 47fc22de8d20..d30ea2005eb3 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -153,6 +153,10 @@ static bool reparse_file_needs_reval(const struct cifs_fattr *fattr)
static void
cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
{
+ struct cifs_open_info_data data = {
+ .reparse = { .tag = fattr->cf_cifstag, },
+ };
+
fattr->cf_uid = cifs_sb->ctx->linux_uid;
fattr->cf_gid = cifs_sb->ctx->linux_gid;
@@ -165,7 +169,7 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
* reasonably map some of them to directories vs. files vs. symlinks
*/
if ((fattr->cf_cifsattrs & ATTR_REPARSE) &&
- cifs_reparse_point_to_fattr(cifs_sb, fattr, fattr->cf_cifstag))
+ cifs_reparse_point_to_fattr(cifs_sb, fattr, &data))
goto out_reparse;
if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index 79f26c560edf..2d3b332a79a1 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -24,7 +24,7 @@
#include "fs_context.h"
static int
-cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
+cifs_ses_add_channel(struct cifs_ses *ses,
struct cifs_server_iface *iface);
bool
@@ -69,7 +69,7 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface)
/* channel helper functions. assumed that chan_lock is held by caller. */
-unsigned int
+int
cifs_ses_get_chan_index(struct cifs_ses *ses,
struct TCP_Server_Info *server)
{
@@ -85,14 +85,17 @@ cifs_ses_get_chan_index(struct cifs_ses *ses,
cifs_dbg(VFS, "unable to get chan index for server: 0x%llx",
server->conn_id);
WARN_ON(1);
- return 0;
+ return CIFS_INVAL_CHAN_INDEX;
}
void
cifs_chan_set_in_reconnect(struct cifs_ses *ses,
struct TCP_Server_Info *server)
{
- unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ int chan_index = cifs_ses_get_chan_index(ses, server);
+
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return;
ses->chans[chan_index].in_reconnect = true;
}
@@ -103,6 +106,9 @@ cifs_chan_clear_in_reconnect(struct cifs_ses *ses,
{
unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return;
+
ses->chans[chan_index].in_reconnect = false;
}
@@ -112,6 +118,9 @@ cifs_chan_in_reconnect(struct cifs_ses *ses,
{
unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return true; /* err on the safer side */
+
return CIFS_CHAN_IN_RECONNECT(ses, chan_index);
}
@@ -121,6 +130,9 @@ cifs_chan_set_need_reconnect(struct cifs_ses *ses,
{
unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return;
+
set_bit(chan_index, &ses->chans_need_reconnect);
cifs_dbg(FYI, "Set reconnect bitmask for chan %u; now 0x%lx\n",
chan_index, ses->chans_need_reconnect);
@@ -132,6 +144,9 @@ cifs_chan_clear_need_reconnect(struct cifs_ses *ses,
{
unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return;
+
clear_bit(chan_index, &ses->chans_need_reconnect);
cifs_dbg(FYI, "Cleared reconnect bitmask for chan %u; now 0x%lx\n",
chan_index, ses->chans_need_reconnect);
@@ -143,6 +158,9 @@ cifs_chan_needs_reconnect(struct cifs_ses *ses,
{
unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return true; /* err on the safer side */
+
return CIFS_CHAN_NEEDS_RECONNECT(ses, chan_index);
}
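The cifs_ses_get_chan_index() change above replaces the old "silently return channel 0" fallback with a CIFS_INVAL_CHAN_INDEX sentinel, and every caller now either bails out (setters) or assumes the worst (predicates). A small self-contained sketch of the same pattern, with hypothetical toy_* names:

#include <stdbool.h>
#include <stdio.h>

#define TOY_INVAL_INDEX         (-1)
#define TOY_MAX_CHANNELS        16

struct toy_session {
        int servers[TOY_MAX_CHANNELS];  /* server id per channel, 0 = unused */
        int chan_count;
};

/* Return the channel index for a server id, or TOY_INVAL_INDEX if absent. */
static int toy_get_chan_index(const struct toy_session *ses, int server_id)
{
        for (int i = 0; i < ses->chan_count; i++)
                if (ses->servers[i] == server_id)
                        return i;
        return TOY_INVAL_INDEX;
}

/* Setter: skip quietly on the sentinel instead of clobbering index 0. */
static void toy_set_in_reconnect(struct toy_session *ses, int server_id,
                                 bool *in_reconnect)
{
        int idx = toy_get_chan_index(ses, server_id);

        if (idx == TOY_INVAL_INDEX)
                return;
        in_reconnect[idx] = true;
}

/* Predicate: err on the safe side when the channel cannot be found. */
static bool toy_needs_reconnect(const struct toy_session *ses, int server_id,
                                const bool *need_reconnect)
{
        int idx = toy_get_chan_index(ses, server_id);

        if (idx == TOY_INVAL_INDEX)
                return true;
        return need_reconnect[idx];
}

int main(void)
{
        struct toy_session ses = { .servers = {7, 9}, .chan_count = 2 };
        bool flags[TOY_MAX_CHANNELS] = {0};

        toy_set_in_reconnect(&ses, 42, flags);                  /* unknown id: no-op */
        printf("%d\n", toy_needs_reconnect(&ses, 42, flags));   /* 1 */
        printf("%d\n", toy_needs_reconnect(&ses, 7, flags));    /* 0 */
        return 0;
}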
@@ -152,19 +170,24 @@ cifs_chan_is_iface_active(struct cifs_ses *ses,
{
unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX)
+ return true; /* err on the safer side */
+
return ses->chans[chan_index].iface &&
ses->chans[chan_index].iface->is_active;
}
/* returns number of channels added */
-int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
+int cifs_try_adding_channels(struct cifs_ses *ses)
{
struct TCP_Server_Info *server = ses->server;
int old_chan_count, new_chan_count;
int left;
int rc = 0;
int tries = 0;
+ size_t iface_weight = 0, iface_min_speed = 0;
struct cifs_server_iface *iface = NULL, *niface = NULL;
+ struct cifs_server_iface *last_iface = NULL;
spin_lock(&ses->chan_lock);
@@ -186,28 +209,17 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
}
if (!(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
- ses->chan_max = 1;
spin_unlock(&ses->chan_lock);
cifs_server_dbg(VFS, "no multichannel support\n");
return 0;
}
spin_unlock(&ses->chan_lock);
- /*
- * Keep connecting to same, fastest, iface for all channels as
- * long as its RSS. Try next fastest one if not RSS or channel
- * creation fails.
- */
- spin_lock(&ses->iface_lock);
- iface = list_first_entry(&ses->iface_list, struct cifs_server_iface,
- iface_head);
- spin_unlock(&ses->iface_lock);
-
while (left > 0) {
tries++;
if (tries > 3*ses->chan_max) {
- cifs_dbg(FYI, "too many channel open attempts (%d channels left to open)\n",
+ cifs_dbg(VFS, "too many channel open attempts (%d channels left to open)\n",
left);
break;
}
@@ -215,23 +227,41 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
spin_lock(&ses->iface_lock);
if (!ses->iface_count) {
spin_unlock(&ses->iface_lock);
+ cifs_dbg(VFS, "server %s does not advertise interfaces\n",
+ ses->server->hostname);
break;
}
+ if (!iface)
+ iface = list_first_entry(&ses->iface_list, struct cifs_server_iface,
+ iface_head);
+ last_iface = list_last_entry(&ses->iface_list, struct cifs_server_iface,
+ iface_head);
+ iface_min_speed = last_iface->speed;
+
list_for_each_entry_safe_from(iface, niface, &ses->iface_list,
iface_head) {
+ /* do not mix rdma and non-rdma interfaces */
+ if (iface->rdma_capable != ses->server->rdma)
+ continue;
+
/* skip ifaces that are unusable */
if (!iface->is_active ||
(is_ses_using_iface(ses, iface) &&
- !iface->rss_capable)) {
+ !iface->rss_capable))
+ continue;
+
+ /* check if we already allocated enough channels */
+ iface_weight = iface->speed / iface_min_speed;
+
+ if (iface->weight_fulfilled >= iface_weight)
continue;
- }
/* take ref before unlock */
kref_get(&iface->refcount);
spin_unlock(&ses->iface_lock);
- rc = cifs_ses_add_channel(cifs_sb, ses, iface);
+ rc = cifs_ses_add_channel(ses, iface);
spin_lock(&ses->iface_lock);
if (rc) {
@@ -242,10 +272,21 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
continue;
}
- cifs_dbg(FYI, "successfully opened new channel on iface:%pIS\n",
+ iface->num_channels++;
+ iface->weight_fulfilled++;
+ cifs_dbg(VFS, "successfully opened new channel on iface:%pIS\n",
&iface->sockaddr);
break;
}
+
+ /* reached end of list. reset weight_fulfilled and start over */
+ if (list_entry_is_head(iface, &ses->iface_list, iface_head)) {
+ list_for_each_entry(iface, &ses->iface_list, iface_head)
+ iface->weight_fulfilled = 0;
+ spin_unlock(&ses->iface_lock);
+ iface = NULL;
+ continue;
+ }
spin_unlock(&ses->iface_lock);
left--;
@@ -256,6 +297,64 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
}
/*
+ * called when multichannel is disabled by the server.
+ * this is always called from smb2_reconnect and is
+ * never invoked from parallel threads.
+ */
+void
+cifs_disable_secondary_channels(struct cifs_ses *ses)
+{
+ int i, chan_count;
+ struct TCP_Server_Info *server;
+ struct cifs_server_iface *iface;
+
+ spin_lock(&ses->chan_lock);
+ chan_count = ses->chan_count;
+ if (chan_count == 1)
+ goto done;
+
+ ses->chan_count = 1;
+
+ /* for all secondary channels reset the need reconnect bit */
+ ses->chans_need_reconnect &= 1;
+
+ for (i = 1; i < chan_count; i++) {
+ iface = ses->chans[i].iface;
+ server = ses->chans[i].server;
+
+ /*
+ * remove these references first: we must drop chan_lock
+ * before taking iface_lock, which ranks above it
+ */
+ ses->chans[i].iface = NULL;
+ ses->chans[i].server = NULL;
+ spin_unlock(&ses->chan_lock);
+
+ if (iface) {
+ spin_lock(&ses->iface_lock);
+ iface->num_channels--;
+ if (iface->weight_fulfilled)
+ iface->weight_fulfilled--;
+ kref_put(&iface->refcount, release_iface);
+ spin_unlock(&ses->iface_lock);
+ }
+
+ if (server) {
+ if (!server->terminate) {
+ server->terminate = true;
+ cifs_signal_cifsd_for_reconnect(server, false);
+ }
+ cifs_put_tcp_session(server, false);
+ }
+
+ spin_lock(&ses->chan_lock);
+ }
+
+done:
+ spin_unlock(&ses->chan_lock);
+}
+
+/*
* update the iface for the channel if necessary.
* will return 0 when iface is updated, 1 if removed, 2 otherwise
* Must be called with chan_lock held.
@@ -264,13 +363,16 @@ int
cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
{
unsigned int chan_index;
+ size_t iface_weight = 0, iface_min_speed = 0;
struct cifs_server_iface *iface = NULL;
struct cifs_server_iface *old_iface = NULL;
+ struct cifs_server_iface *last_iface = NULL;
+ struct sockaddr_storage ss;
int rc = 0;
spin_lock(&ses->chan_lock);
chan_index = cifs_ses_get_chan_index(ses, server);
- if (!chan_index) {
+ if (chan_index == CIFS_INVAL_CHAN_INDEX) {
spin_unlock(&ses->chan_lock);
return 0;
}
@@ -284,14 +386,49 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
}
spin_unlock(&ses->chan_lock);
+ spin_lock(&server->srv_lock);
+ ss = server->dstaddr;
+ spin_unlock(&server->srv_lock);
+
spin_lock(&ses->iface_lock);
+ if (!ses->iface_count) {
+ spin_unlock(&ses->iface_lock);
+ cifs_dbg(VFS, "server %s does not advertise interfaces\n", ses->server->hostname);
+ return 0;
+ }
+
+ last_iface = list_last_entry(&ses->iface_list, struct cifs_server_iface,
+ iface_head);
+ iface_min_speed = last_iface->speed;
+
/* then look for a new one */
list_for_each_entry(iface, &ses->iface_list, iface_head) {
+ if (!chan_index) {
+ /* if we're trying to get the updated iface for primary channel */
+ if (!cifs_match_ipaddr((struct sockaddr *) &ss,
+ (struct sockaddr *) &iface->sockaddr))
+ continue;
+
+ kref_get(&iface->refcount);
+ break;
+ }
+
+ /* do not mix rdma and non-rdma interfaces */
+ if (iface->rdma_capable != server->rdma)
+ continue;
+
if (!iface->is_active ||
(is_ses_using_iface(ses, iface) &&
!iface->rss_capable)) {
continue;
}
+
+ /* check if we already allocated enough channels */
+ iface_weight = iface->speed / iface_min_speed;
+
+ if (iface->weight_fulfilled >= iface_weight)
+ continue;
+
kref_get(&iface->refcount);
break;
}
@@ -302,34 +439,52 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
cifs_dbg(FYI, "unable to find a suitable iface\n");
}
+ if (!iface) {
+ cifs_dbg(FYI, "unable to get the interface matching: %pIS\n",
+ &ss);
+ spin_unlock(&ses->iface_lock);
+ return 0;
+ }
+
/* now drop the ref to the current iface */
- if (old_iface && iface) {
+ if (old_iface) {
cifs_dbg(FYI, "replacing iface: %pIS with %pIS\n",
&old_iface->sockaddr,
&iface->sockaddr);
+
+ old_iface->num_channels--;
+ if (old_iface->weight_fulfilled)
+ old_iface->weight_fulfilled--;
+ iface->num_channels++;
+ iface->weight_fulfilled++;
+
kref_put(&old_iface->refcount, release_iface);
} else if (old_iface) {
- cifs_dbg(FYI, "releasing ref to iface: %pIS\n",
+ /* if a new candidate is not found, keep things as is */
+ cifs_dbg(FYI, "could not replace iface: %pIS\n",
&old_iface->sockaddr);
- kref_put(&old_iface->refcount, release_iface);
- } else {
- WARN_ON(!iface);
- cifs_dbg(FYI, "adding new iface: %pIS\n", &iface->sockaddr);
+ } else if (!chan_index) {
+ /* special case: update interface for primary channel */
+ if (iface) {
+ cifs_dbg(FYI, "referencing primary channel iface: %pIS\n",
+ &iface->sockaddr);
+ iface->num_channels++;
+ iface->weight_fulfilled++;
+ }
}
spin_unlock(&ses->iface_lock);
- spin_lock(&ses->chan_lock);
- chan_index = cifs_ses_get_chan_index(ses, server);
- ses->chans[chan_index].iface = iface;
-
- /* No iface is found. if secondary chan, drop connection */
- if (!iface && SERVER_IS_CHAN(server))
- ses->chans[chan_index].server = NULL;
-
- spin_unlock(&ses->chan_lock);
+ if (iface) {
+ spin_lock(&ses->chan_lock);
+ chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX) {
+ spin_unlock(&ses->chan_lock);
+ return 0;
+ }
- if (!iface && SERVER_IS_CHAN(server))
- cifs_put_tcp_session(server, false);
+ ses->chans[chan_index].iface = iface;
+ spin_unlock(&ses->chan_lock);
+ }
return rc;
}
@@ -355,7 +510,7 @@ cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server)
}
static int
-cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
+cifs_ses_add_channel(struct cifs_ses *ses,
struct cifs_server_iface *iface)
{
struct TCP_Server_Info *chan_server;
@@ -434,7 +589,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
* This will be used for encoding/decoding user/domain/pw
* during sess setup auth.
*/
- ctx->local_nls = cifs_sb->local_nls;
+ ctx->local_nls = ses->local_nls;
/* Use RDMA if possible */
ctx->rdma = iface->rdma_capable;
@@ -480,20 +635,16 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
rc = cifs_negotiate_protocol(xid, ses, chan->server);
if (!rc)
- rc = cifs_setup_session(xid, ses, chan->server, cifs_sb->local_nls);
+ rc = cifs_setup_session(xid, ses, chan->server, ses->local_nls);
mutex_unlock(&ses->session_mutex);
out:
if (rc && chan->server) {
- /*
- * we should avoid race with these delayed works before we
- * remove this channel
- */
- cancel_delayed_work_sync(&chan->server->echo);
- cancel_delayed_work_sync(&chan->server->reconnect);
+ cifs_put_tcp_session(chan->server, 0);
spin_lock(&ses->chan_lock);
+
/* we rely on all bits beyond chan_count to be clear */
cifs_chan_clear_need_reconnect(ses, chan->server);
ses->chan_count--;
@@ -503,8 +654,6 @@ out:
*/
WARN_ON(ses->chan_count < 1);
spin_unlock(&ses->chan_lock);
-
- cifs_put_tcp_session(chan->server, 0);
}
kfree(ctx->UNC);
@@ -536,8 +685,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses,
/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
- /* BB verify whether signing required on neg or just on auth frame
- (and NTLM case) */
+ /* BB verify whether signing required on neg or just auth frame (and NTLM case) */
capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;
@@ -594,8 +742,10 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
/* copy domain */
if (ses->domainName == NULL) {
- /* Sending null domain better than using a bogus domain name (as
- we did briefly in 2.6.18) since server will use its default */
+ /*
+ * Sending null domain better than using a bogus domain name (as
+ * we did briefly in 2.6.18) since server will use its default
+ */
*bcc_ptr = 0;
*(bcc_ptr+1) = 0;
bytes_ret = 0;
@@ -614,8 +764,7 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
char *bcc_ptr = *pbcc_area;
int bytes_ret = 0;
- /* BB FIXME add check that strings total less
- than 335 or will need to send them as arrays */
+ /* BB FIXME add check that strings total less than 335 or will need to send as arrays */
/* copy user */
if (ses->user_name == NULL) {
@@ -660,8 +809,7 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
if (WARN_ON_ONCE(len < 0))
len = CIFS_MAX_DOMAINNAME_LEN - 1;
bcc_ptr += len;
- } /* else we will send a null domain name
- so the server will default to its own domain */
+ } /* else we send a null domain name so server will default to its own domain */
*bcc_ptr = 0;
bcc_ptr++;
@@ -757,11 +905,14 @@ static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
if (len > bleft)
return;
- /* No domain field in LANMAN case. Domain is
- returned by old servers in the SMB negprot response */
- /* BB For newer servers which do not support Unicode,
- but thus do return domain here we could add parsing
- for it later, but it is not very important */
+ /*
+ * No domain field in LANMAN case. Domain is
+ * returned by old servers in the SMB negprot response
+ *
+ * BB For newer servers which do not support Unicode,
+ * but thus do return domain here, we could add parsing
+ * for it later, but it is not very important
+ */
cifs_dbg(FYI, "ascii: bytes left %d\n", bleft);
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
@@ -817,9 +968,12 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
ses->ntlmssp->server_flags = server_flags;
memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE);
- /* In particular we can examine sign flags */
- /* BB spec says that if AvId field of MsvAvTimestamp is populated then
- we must set the MIC field of the AUTHENTICATE_MESSAGE */
+ /*
+ * In particular we can examine sign flags
+ *
+ * BB spec says that if AvId field of MsvAvTimestamp is populated then
+ * we must set the MIC field of the AUTHENTICATE_MESSAGE
+ */
tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
@@ -1060,10 +1214,16 @@ int build_ntlmssp_auth_blob(unsigned char **pbuffer,
memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
sec_blob->MessageType = NtLmAuthenticate;
+ /* send version information in ntlmssp authenticate also */
flags = ses->ntlmssp->server_flags | NTLMSSP_REQUEST_TARGET |
- NTLMSSP_NEGOTIATE_TARGET_INFO | NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED;
- /* we only send version information in ntlmssp negotiate, so do not set this flag */
- flags = flags & ~NTLMSSP_NEGOTIATE_VERSION;
+ NTLMSSP_NEGOTIATE_TARGET_INFO | NTLMSSP_NEGOTIATE_VERSION |
+ NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED;
+
+ sec_blob->Version.ProductMajorVersion = LINUX_VERSION_MAJOR;
+ sec_blob->Version.ProductMinorVersion = LINUX_VERSION_PATCHLEVEL;
+ sec_blob->Version.ProductBuild = cpu_to_le16(SMB3_PRODUCT_BUILD);
+ sec_blob->Version.NTLMRevisionCurrent = NTLMSSP_REVISION_W2K3;
+
tmp = *pbuffer + sizeof(AUTHENTICATE_MESSAGE);
sec_blob->NegotiateFlags = cpu_to_le32(flags);
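The reworked channel-allocation logic in this sess.c diff spreads channels across server interfaces in proportion to speed: an interface's weight is its speed divided by the slowest advertised speed, it is skipped once weight_fulfilled reaches that weight, and when the whole list has been exhausted the counters are reset and the scan restarts. A userspace sketch of that selection policy under assumed names (toy_* helpers, a list sorted fastest-first):

#include <stddef.h>
#include <stdio.h>

struct toy_iface {
        const char *name;
        size_t speed;             /* advertised link speed */
        size_t weight_fulfilled;  /* channels already placed on it */
};

/*
 * Pick the next interface, honouring speed-proportional weights.
 * Assumes the list is sorted fastest-first, so the last entry holds
 * the minimum speed.
 */
static struct toy_iface *toy_pick_iface(struct toy_iface *list, size_t n)
{
        size_t min_speed;

        if (!n)
                return NULL;
        min_speed = list[n - 1].speed;

        for (;;) {
                for (size_t i = 0; i < n; i++) {
                        size_t weight = list[i].speed / min_speed;

                        if (list[i].weight_fulfilled >= weight)
                                continue;
                        list[i].weight_fulfilled++;
                        return &list[i];
                }
                /* every interface hit its weight: reset and start over */
                for (size_t i = 0; i < n; i++)
                        list[i].weight_fulfilled = 0;
        }
}

int main(void)
{
        struct toy_iface ifaces[] = {
                { "fast", 10000, 0 },   /* weight 10 */
                { "slow",  1000, 0 },   /* weight 1  */
        };

        /* Prints "fast" ten times, then "slow" once. */
        for (int i = 0; i < 11; i++)
                printf("%s\n", toy_pick_iface(ifaces, 2)->name);
        return 0;
}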
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index 9bf8735cdd1e..a9eaba8083b0 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -976,64 +976,37 @@ static int cifs_query_symlink(const unsigned int xid,
struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
const char *full_path,
- char **target_path,
- struct kvec *rsp_iov)
+ char **target_path)
{
int rc;
- int oplock = 0;
- bool is_reparse_point = !!rsp_iov;
- struct cifs_fid fid;
- struct cifs_open_parms oparms;
- cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
+ cifs_tcon_dbg(FYI, "%s: path=%s\n", __func__, full_path);
- if (is_reparse_point) {
- cifs_dbg(VFS, "reparse points not handled for SMB1 symlinks\n");
+ if (!cap_unix(tcon->ses))
return -EOPNOTSUPP;
- }
-
- /* Check for unix extensions */
- if (cap_unix(tcon->ses)) {
- rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path,
- cifs_sb->local_nls,
- cifs_remap(cifs_sb));
- if (rc == -EREMOTE)
- rc = cifs_unix_dfs_readlink(xid, tcon, full_path,
- target_path,
- cifs_sb->local_nls);
-
- goto out;
- }
-
- oparms = (struct cifs_open_parms) {
- .tcon = tcon,
- .cifs_sb = cifs_sb,
- .desired_access = FILE_READ_ATTRIBUTES,
- .create_options = cifs_create_options(cifs_sb,
- OPEN_REPARSE_POINT),
- .disposition = FILE_OPEN,
- .path = full_path,
- .fid = &fid,
- };
-
- rc = CIFS_open(xid, &oparms, &oplock, NULL);
- if (rc)
- goto out;
-
- rc = CIFSSMBQuerySymLink(xid, tcon, fid.netfid, target_path,
- cifs_sb->local_nls);
- if (rc)
- goto out_close;
- convert_delimiter(*target_path, '/');
-out_close:
- CIFSSMBClose(xid, tcon, fid.netfid);
-out:
- if (!rc)
- cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
+ rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path,
+ cifs_sb->local_nls, cifs_remap(cifs_sb));
+ if (rc == -EREMOTE)
+ rc = cifs_unix_dfs_readlink(xid, tcon, full_path,
+ target_path, cifs_sb->local_nls);
return rc;
}
+static int cifs_parse_reparse_point(struct cifs_sb_info *cifs_sb,
+ struct kvec *rsp_iov,
+ struct cifs_open_info_data *data)
+{
+ struct reparse_data_buffer *buf;
+ TRANSACT_IOCTL_RSP *io = rsp_iov->iov_base;
+ bool unicode = !!(io->hdr.Flags2 & SMBFLG2_UNICODE);
+ u32 plen = le16_to_cpu(io->ByteCount);
+
+ buf = (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol +
+ le32_to_cpu(io->DataOffset));
+ return parse_reparse_point(buf, plen, cifs_sb, unicode, data);
+}
+
static bool
cifs_is_read_op(__u32 oplock)
{
@@ -1068,15 +1041,7 @@ cifs_make_node(unsigned int xid, struct inode *inode,
{
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct inode *newinode = NULL;
- int rc = -EPERM;
- struct cifs_open_info_data buf = {};
- struct cifs_io_parms io_parms;
- __u32 oplock = 0;
- struct cifs_fid fid;
- struct cifs_open_parms oparms;
- unsigned int bytes_written;
- struct win_dev *pdev;
- struct kvec iov[2];
+ int rc;
if (tcon->unix_ext) {
/*
@@ -1110,74 +1075,18 @@ cifs_make_node(unsigned int xid, struct inode *inode,
d_instantiate(dentry, newinode);
return rc;
}
-
/*
- * SMB1 SFU emulation: should work with all servers, but only
- * support block and char device (no socket & fifo)
+ * Check if mounted with the 'sfu' mount parm.
+ * SFU emulation should work with all servers, but only
+ * supports block and char device (no socket & fifo),
+ * and was used by default in earlier versions of Windows
*/
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
- return rc;
-
- if (!S_ISCHR(mode) && !S_ISBLK(mode))
- return rc;
-
- cifs_dbg(FYI, "sfu compat create special file\n");
-
- oparms = (struct cifs_open_parms) {
- .tcon = tcon,
- .cifs_sb = cifs_sb,
- .desired_access = GENERIC_WRITE,
- .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR |
- CREATE_OPTION_SPECIAL),
- .disposition = FILE_CREATE,
- .path = full_path,
- .fid = &fid,
- };
-
- if (tcon->ses->server->oplocks)
- oplock = REQ_OPLOCK;
- else
- oplock = 0;
- rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &buf);
- if (rc)
- return rc;
-
- /*
- * BB Do not bother to decode buf since no local inode yet to put
- * timestamps in, but we can reuse it safely.
- */
-
- pdev = (struct win_dev *)&buf.fi;
- io_parms.pid = current->tgid;
- io_parms.tcon = tcon;
- io_parms.offset = 0;
- io_parms.length = sizeof(struct win_dev);
- iov[1].iov_base = &buf.fi;
- iov[1].iov_len = sizeof(struct win_dev);
- if (S_ISCHR(mode)) {
- memcpy(pdev->type, "IntxCHR", 8);
- pdev->major = cpu_to_le64(MAJOR(dev));
- pdev->minor = cpu_to_le64(MINOR(dev));
- rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
- &bytes_written, iov, 1);
- } else if (S_ISBLK(mode)) {
- memcpy(pdev->type, "IntxBLK", 8);
- pdev->major = cpu_to_le64(MAJOR(dev));
- pdev->minor = cpu_to_le64(MINOR(dev));
- rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
- &bytes_written, iov, 1);
- }
- tcon->ses->server->ops->close(xid, tcon, &fid);
- d_drop(dentry);
-
- /* FIXME: add code here to set EAs */
-
- cifs_free_open_info(&buf);
- return rc;
+ return -EPERM;
+ return cifs_sfu_make_node(xid, inode, dentry, tcon,
+ full_path, mode, dev);
}
-
-
struct smb_version_operations smb1_operations = {
.send_cancel = send_nt_cancel,
.compare_fids = cifs_compare_fids,
@@ -1214,6 +1123,7 @@ struct smb_version_operations smb1_operations = {
.is_path_accessible = cifs_is_path_accessible,
.can_echo = cifs_can_echo,
.query_path_info = cifs_query_path_info,
+ .query_reparse_point = cifs_query_reparse_point,
.query_file_info = cifs_query_file_info,
.get_srv_inum = cifs_get_srv_inum,
.set_path_size = CIFSSMBSetEOF,
@@ -1229,6 +1139,7 @@ struct smb_version_operations smb1_operations = {
.rename = CIFSSMBRename,
.create_hardlink = CIFSCreateHardLink,
.query_symlink = cifs_query_symlink,
+ .parse_reparse_point = cifs_parse_reparse_point,
.open = cifs_open_file,
.set_fid = cifs_set_fid,
.close = cifs_close_file,
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index 0b89f7008ac0..c94940af5d4b 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -555,7 +555,7 @@ static int parse_create_response(struct cifs_open_info_data *data,
break;
}
data->reparse_point = reparse_point;
- data->reparse_tag = tag;
+ data->reparse.tag = tag;
return rc;
}
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index 25f7cd6f23d6..82b84a4941dd 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -173,6 +173,21 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
}
mid = le64_to_cpu(shdr->MessageId);
+ if (check_smb2_hdr(shdr, mid))
+ return 1;
+
+ if (shdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) {
+ cifs_dbg(VFS, "Invalid structure size %u\n",
+ le16_to_cpu(shdr->StructureSize));
+ return 1;
+ }
+
+ command = le16_to_cpu(shdr->Command);
+ if (command >= NUMBER_OF_SMB2_COMMANDS) {
+ cifs_dbg(VFS, "Invalid SMB2 command %d\n", command);
+ return 1;
+ }
+
if (len < pdu_size) {
if ((len >= hdr_size)
&& (shdr->Status != 0)) {
@@ -193,21 +208,6 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
return 1;
}
- if (check_smb2_hdr(shdr, mid))
- return 1;
-
- if (shdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) {
- cifs_dbg(VFS, "Invalid structure size %u\n",
- le16_to_cpu(shdr->StructureSize));
- return 1;
- }
-
- command = le16_to_cpu(shdr->Command);
- if (command >= NUMBER_OF_SMB2_COMMANDS) {
- cifs_dbg(VFS, "Invalid SMB2 command %d\n", command);
- return 1;
- }
-
if (smb2_rsp_struct_sizes[command] != pdu->StructureSize2) {
if (command != SMB2_OPLOCK_BREAK_HE && (shdr->Status == 0 ||
pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2_LE)) {
@@ -313,6 +313,9 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = {
char *
smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr)
{
+ const int max_off = 4096;
+ const int max_len = 128 * 1024;
+
*off = 0;
*len = 0;
@@ -384,29 +387,20 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr)
* Invalid length or offset probably means data area is invalid, but
* we have little choice but to ignore the data area in this case.
*/
- if (*off > 4096) {
- cifs_dbg(VFS, "offset %d too large, data area ignored\n", *off);
- *len = 0;
+ if (unlikely(*off < 0 || *off > max_off ||
+ *len < 0 || *len > max_len)) {
+ cifs_dbg(VFS, "%s: invalid data area (off=%d len=%d)\n",
+ __func__, *off, *len);
*off = 0;
- } else if (*off < 0) {
- cifs_dbg(VFS, "negative offset %d to data invalid ignore data area\n",
- *off);
- *off = 0;
- *len = 0;
- } else if (*len < 0) {
- cifs_dbg(VFS, "negative data length %d invalid, data area ignored\n",
- *len);
*len = 0;
- } else if (*len > 128 * 1024) {
- cifs_dbg(VFS, "data area larger than 128K: %d\n", *len);
+ } else if (*off == 0) {
*len = 0;
}
/* return pointer to beginning of data area, ie offset from SMB start */
- if ((*off != 0) && (*len != 0))
+ if (*off > 0 && *len > 0)
return (char *)shdr + *off;
- else
- return NULL;
+ return NULL;
}
/*
@@ -787,7 +781,7 @@ __smb2_handle_cancelled_cmd(struct cifs_tcon *tcon, __u16 cmd, __u64 mid,
{
struct close_cancelled_open *cancelled;
- cancelled = kzalloc(sizeof(*cancelled), GFP_ATOMIC);
+ cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL);
if (!cancelled)
return -ENOMEM;
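The smb2_get_data_area_len() rewrite above folds four separate error branches into one range check against fixed limits and only hands back a data pointer when both offset and length are strictly positive. A compact standalone sketch of the same validation; the limits and names here are illustrative:

#include <stdio.h>

#define TOY_MAX_OFF  4096
#define TOY_MAX_LEN  (128 * 1024)

/*
 * Return a pointer to the data area inside buf, or NULL (with off/len
 * zeroed) when the advertised area is out of range.
 */
static char *toy_data_area(char *buf, int *off, int *len)
{
        if (*off < 0 || *off > TOY_MAX_OFF || *len < 0 || *len > TOY_MAX_LEN) {
                fprintf(stderr, "invalid data area (off=%d len=%d)\n",
                        *off, *len);
                *off = 0;
                *len = 0;
        } else if (*off == 0) {
                *len = 0;
        }

        if (*off > 0 && *len > 0)
                return buf + *off;
        return NULL;
}

int main(void)
{
        char buf[8192] = {0};
        int off, len;

        off = 72, len = 512;
        printf("%p\n", (void *)toy_data_area(buf, &off, &len));  /* buf + 72 */

        off = -4, len = 512;
        printf("%p\n", (void *)toy_data_area(buf, &off, &len));  /* NULL */
        return 0;
}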
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 9aeecee6b91b..14bc745de199 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -403,8 +403,10 @@ smb2_dump_detail(void *buf, struct TCP_Server_Info *server)
cifs_server_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n",
shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId,
shdr->Id.SyncId.ProcessId);
- cifs_server_dbg(VFS, "smb buf %p len %u\n", buf,
- server->ops->calc_smb_size(buf));
+ if (!server->ops->check_message(buf, server->total_read, server)) {
+ cifs_server_dbg(VFS, "smb buf %p len %u\n", buf,
+ server->ops->calc_smb_size(buf));
+ }
#endif
}
@@ -593,16 +595,12 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
}
/*
- * Go through iface_list and do kref_put to remove
- * any unused ifaces. ifaces in use will be removed
- * when the last user calls a kref_put on it
+ * Go through iface_list and mark them as inactive
*/
list_for_each_entry_safe(iface, niface, &ses->iface_list,
- iface_head) {
+ iface_head)
iface->is_active = 0;
- kref_put(&iface->refcount, release_iface);
- ses->iface_count--;
- }
+
spin_unlock(&ses->iface_lock);
/*
@@ -676,10 +674,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
iface_head) {
ret = iface_cmp(iface, &tmp_iface);
if (!ret) {
- /* just get a ref so that it doesn't get picked/freed */
iface->is_active = 1;
- kref_get(&iface->refcount);
- ses->iface_count++;
spin_unlock(&ses->iface_lock);
goto next_iface;
} else if (ret < 0) {
@@ -746,6 +741,20 @@ next_iface:
}
out:
+ /*
+ * Go through the list again and put the inactive entries
+ */
+ spin_lock(&ses->iface_lock);
+ list_for_each_entry_safe(iface, niface, &ses->iface_list,
+ iface_head) {
+ if (!iface->is_active) {
+ list_del(&iface->iface_head);
+ kref_put(&iface->refcount, release_iface);
+ ses->iface_count--;
+ }
+ }
+ spin_unlock(&ses->iface_lock);
+
return rc;
}
@@ -756,6 +765,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_
unsigned int ret_data_len = 0;
struct network_interface_info_ioctl_rsp *out_buf = NULL;
struct cifs_ses *ses = tcon->ses;
+ struct TCP_Server_Info *pserver;
/* do not query too frequently */
if (ses->iface_last_update &&
@@ -780,6 +790,16 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_
if (rc)
goto out;
+ /* check if iface is still active */
+ spin_lock(&ses->chan_lock);
+ pserver = ses->chans[0].server;
+ if (pserver && !cifs_chan_is_iface_active(ses, pserver)) {
+ spin_unlock(&ses->chan_lock);
+ cifs_chan_update_iface(ses, pserver);
+ spin_lock(&ses->chan_lock);
+ }
+ spin_unlock(&ses->chan_lock);
+
out:
kfree(out_buf);
return rc;
@@ -1403,12 +1423,14 @@ smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon,
/* Creation time should not need to be updated on close */
if (file_inf.LastWriteTime)
- inode->i_mtime = cifs_NTtimeToUnix(file_inf.LastWriteTime);
+ inode_set_mtime_to_ts(inode,
+ cifs_NTtimeToUnix(file_inf.LastWriteTime));
if (file_inf.ChangeTime)
inode_set_ctime_to_ts(inode,
cifs_NTtimeToUnix(file_inf.ChangeTime));
if (file_inf.LastAccessTime)
- inode->i_atime = cifs_NTtimeToUnix(file_inf.LastAccessTime);
+ inode_set_atime_to_ts(inode,
+ cifs_NTtimeToUnix(file_inf.LastAccessTime));
/*
* i_blocks is not related to (i_size / i_blksize),
@@ -2828,6 +2850,8 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
usleep_range(512, 2048);
} while (++retry_count < 5);
+ if (!rc && !dfs_rsp)
+ rc = -EIO;
if (rc) {
if (!is_retryable_error(rc) && rc != -ENOENT && rc != -EOPNOTSUPP)
cifs_tcon_dbg(VFS, "%s: ioctl error: rc=%d\n", __func__, rc);
@@ -2858,115 +2882,119 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
return rc;
}
-static int
-parse_reparse_posix(struct reparse_posix_data *symlink_buf,
- u32 plen, char **target_path,
- struct cifs_sb_info *cifs_sb)
+/* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */
+static int parse_reparse_posix(struct reparse_posix_data *buf,
+ struct cifs_sb_info *cifs_sb,
+ struct cifs_open_info_data *data)
{
unsigned int len;
-
- /* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */
- len = le16_to_cpu(symlink_buf->ReparseDataLength);
-
- if (le64_to_cpu(symlink_buf->InodeType) != NFS_SPECFILE_LNK) {
- cifs_dbg(VFS, "%lld not a supported symlink type\n",
- le64_to_cpu(symlink_buf->InodeType));
+ u64 type;
+
+ switch ((type = le64_to_cpu(buf->InodeType))) {
+ case NFS_SPECFILE_LNK:
+ len = le16_to_cpu(buf->ReparseDataLength);
+ data->symlink_target = cifs_strndup_from_utf16(buf->DataBuffer,
+ len, true,
+ cifs_sb->local_nls);
+ if (!data->symlink_target)
+ return -ENOMEM;
+ convert_delimiter(data->symlink_target, '/');
+ cifs_dbg(FYI, "%s: target path: %s\n",
+ __func__, data->symlink_target);
+ break;
+ case NFS_SPECFILE_CHR:
+ case NFS_SPECFILE_BLK:
+ case NFS_SPECFILE_FIFO:
+ case NFS_SPECFILE_SOCK:
+ break;
+ default:
+ cifs_dbg(VFS, "%s: unhandled inode type: 0x%llx\n",
+ __func__, type);
return -EOPNOTSUPP;
}
-
- *target_path = cifs_strndup_from_utf16(
- symlink_buf->PathBuffer,
- len, true, cifs_sb->local_nls);
- if (!(*target_path))
- return -ENOMEM;
-
- convert_delimiter(*target_path, '/');
- cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
-
return 0;
}
-static int
-parse_reparse_symlink(struct reparse_symlink_data_buffer *symlink_buf,
- u32 plen, char **target_path,
- struct cifs_sb_info *cifs_sb)
+static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym,
+ u32 plen, bool unicode,
+ struct cifs_sb_info *cifs_sb,
+ struct cifs_open_info_data *data)
{
- unsigned int sub_len;
- unsigned int sub_offset;
+ unsigned int len;
+ unsigned int offs;
/* We handle Symbolic Link reparse tag here. See: MS-FSCC 2.1.2.4 */
- sub_offset = le16_to_cpu(symlink_buf->SubstituteNameOffset);
- sub_len = le16_to_cpu(symlink_buf->SubstituteNameLength);
- if (sub_offset + 20 > plen ||
- sub_offset + sub_len + 20 > plen) {
+ offs = le16_to_cpu(sym->SubstituteNameOffset);
+ len = le16_to_cpu(sym->SubstituteNameLength);
+ if (offs + 20 > plen || offs + len + 20 > plen) {
cifs_dbg(VFS, "srv returned malformed symlink buffer\n");
return -EIO;
}
- *target_path = cifs_strndup_from_utf16(
- symlink_buf->PathBuffer + sub_offset,
- sub_len, true, cifs_sb->local_nls);
- if (!(*target_path))
+ data->symlink_target = cifs_strndup_from_utf16(sym->PathBuffer + offs,
+ len, unicode,
+ cifs_sb->local_nls);
+ if (!data->symlink_target)
return -ENOMEM;
- convert_delimiter(*target_path, '/');
- cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
+ convert_delimiter(data->symlink_target, '/');
+ cifs_dbg(FYI, "%s: target path: %s\n", __func__, data->symlink_target);
return 0;
}
-static int
-parse_reparse_point(struct reparse_data_buffer *buf,
- u32 plen, char **target_path,
- struct cifs_sb_info *cifs_sb)
+int parse_reparse_point(struct reparse_data_buffer *buf,
+ u32 plen, struct cifs_sb_info *cifs_sb,
+ bool unicode, struct cifs_open_info_data *data)
{
- if (plen < sizeof(struct reparse_data_buffer)) {
- cifs_dbg(VFS, "reparse buffer is too small. Must be at least 8 bytes but was %d\n",
- plen);
+ if (plen < sizeof(*buf)) {
+ cifs_dbg(VFS, "%s: reparse buffer is too small. Must be at least 8 bytes but was %d\n",
+ __func__, plen);
return -EIO;
}
- if (plen < le16_to_cpu(buf->ReparseDataLength) +
- sizeof(struct reparse_data_buffer)) {
- cifs_dbg(VFS, "srv returned invalid reparse buf length: %d\n",
- plen);
+ if (plen < le16_to_cpu(buf->ReparseDataLength) + sizeof(*buf)) {
+ cifs_dbg(VFS, "%s: invalid reparse buf length: %d\n",
+ __func__, plen);
return -EIO;
}
+ data->reparse.buf = buf;
+
/* See MS-FSCC 2.1.2 */
switch (le32_to_cpu(buf->ReparseTag)) {
case IO_REPARSE_TAG_NFS:
- return parse_reparse_posix(
- (struct reparse_posix_data *)buf,
- plen, target_path, cifs_sb);
+ return parse_reparse_posix((struct reparse_posix_data *)buf,
+ cifs_sb, data);
case IO_REPARSE_TAG_SYMLINK:
return parse_reparse_symlink(
(struct reparse_symlink_data_buffer *)buf,
- plen, target_path, cifs_sb);
+ plen, unicode, cifs_sb, data);
+ case IO_REPARSE_TAG_LX_SYMLINK:
+ case IO_REPARSE_TAG_AF_UNIX:
+ case IO_REPARSE_TAG_LX_FIFO:
+ case IO_REPARSE_TAG_LX_CHR:
+ case IO_REPARSE_TAG_LX_BLK:
+ return 0;
default:
- cifs_dbg(VFS, "srv returned unknown symlink buffer tag:0x%08x\n",
- le32_to_cpu(buf->ReparseTag));
+ cifs_dbg(VFS, "%s: unhandled reparse tag: 0x%08x\n",
+ __func__, le32_to_cpu(buf->ReparseTag));
return -EOPNOTSUPP;
}
}
-static int smb2_query_symlink(const unsigned int xid,
- struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb,
- const char *full_path,
- char **target_path,
- struct kvec *rsp_iov)
+static int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb,
+ struct kvec *rsp_iov,
+ struct cifs_open_info_data *data)
{
struct reparse_data_buffer *buf;
struct smb2_ioctl_rsp *io = rsp_iov->iov_base;
u32 plen = le32_to_cpu(io->OutputCount);
- cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
-
buf = (struct reparse_data_buffer *)((u8 *)io +
le32_to_cpu(io->OutputOffset));
- return parse_reparse_point(buf, plen, target_path, cifs_sb);
+ return parse_reparse_point(buf, plen, cifs_sb, true, data);
}
static int smb2_query_reparse_point(const unsigned int xid,
@@ -2989,7 +3017,7 @@ static int smb2_query_reparse_point(const unsigned int xid,
struct kvec *rsp_iov;
struct smb2_ioctl_rsp *ioctl_rsp;
struct reparse_data_buffer *reparse_buf;
- u32 plen;
+ u32 off, count, len;
cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
@@ -3070,16 +3098,22 @@ static int smb2_query_reparse_point(const unsigned int xid,
*/
if (rc == 0) {
/* See MS-FSCC 2.3.23 */
+ off = le32_to_cpu(ioctl_rsp->OutputOffset);
+ count = le32_to_cpu(ioctl_rsp->OutputCount);
+ if (check_add_overflow(off, count, &len) ||
+ len > rsp_iov[1].iov_len) {
+ cifs_tcon_dbg(VFS, "%s: invalid ioctl: off=%d count=%d\n",
+ __func__, off, count);
+ rc = -EIO;
+ goto query_rp_exit;
+ }
- reparse_buf = (struct reparse_data_buffer *)
- ((char *)ioctl_rsp +
- le32_to_cpu(ioctl_rsp->OutputOffset));
- plen = le32_to_cpu(ioctl_rsp->OutputCount);
-
- if (plen + le32_to_cpu(ioctl_rsp->OutputOffset) >
- rsp_iov[1].iov_len) {
- cifs_tcon_dbg(FYI, "srv returned invalid ioctl len: %d\n",
- plen);
+ reparse_buf = (void *)((u8 *)ioctl_rsp + off);
+ len = sizeof(*reparse_buf);
+ if (count < len ||
+ count < le16_to_cpu(reparse_buf->ReparseDataLength) + len) {
+ cifs_tcon_dbg(VFS, "%s: invalid ioctl: off=%d count=%d\n",
+ __func__, off, count);
rc = -EIO;
goto query_rp_exit;
}
@@ -3299,6 +3333,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
struct inode *inode = file_inode(file);
struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct cifsFileInfo *cfile = file->private_data;
+ unsigned long long new_size;
long rc;
unsigned int xid;
__le64 eof;
@@ -3329,10 +3364,15 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
/*
* do we also need to change the size of the file?
*/
- if (keep_size == false && i_size_read(inode) < offset + len) {
- eof = cpu_to_le64(offset + len);
+ new_size = offset + len;
+ if (keep_size == false && (unsigned long long)i_size_read(inode) < new_size) {
+ eof = cpu_to_le64(new_size);
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, cfile->pid, &eof);
+ if (rc >= 0) {
+ truncate_setsize(inode, new_size);
+ fscache_resize_cookie(cifs_inode_cookie(inode), new_size);
+ }
}
zero_range_exit:
@@ -3727,6 +3767,9 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon,
if (rc < 0)
goto out_2;
+ truncate_setsize(inode, old_eof + len);
+ fscache_resize_cookie(cifs_inode_cookie(inode), i_size_read(inode));
+
rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len);
if (rc < 0)
goto out_2;
@@ -4920,6 +4963,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
struct smb2_hdr *shdr;
unsigned int pdu_length = server->pdu_size;
unsigned int buf_size;
+ unsigned int next_cmd;
struct mid_q_entry *mid_entry;
int next_is_large;
char *next_buffer = NULL;
@@ -4948,14 +4992,15 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
next_is_large = server->large_buf;
one_more:
shdr = (struct smb2_hdr *)buf;
- if (shdr->NextCommand) {
+ next_cmd = le32_to_cpu(shdr->NextCommand);
+ if (next_cmd) {
+ if (WARN_ON_ONCE(next_cmd > pdu_length))
+ return -1;
if (next_is_large)
next_buffer = (char *)cifs_buf_get();
else
next_buffer = (char *)cifs_small_buf_get();
- memcpy(next_buffer,
- buf + le32_to_cpu(shdr->NextCommand),
- pdu_length - le32_to_cpu(shdr->NextCommand));
+ memcpy(next_buffer, buf + next_cmd, pdu_length - next_cmd);
}
mid_entry = smb2_find_mid(server, buf);
@@ -4979,8 +5024,8 @@ one_more:
else
ret = cifs_handle_standard(server, mid_entry);
- if (ret == 0 && shdr->NextCommand) {
- pdu_length -= le32_to_cpu(shdr->NextCommand);
+ if (ret == 0 && next_cmd) {
+ pdu_length -= next_cmd;
server->large_buf = next_is_large;
if (next_is_large)
server->bigbuf = buf = next_buffer;
@@ -5043,54 +5088,42 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid)
NULL, 0, false);
}
-static int
-smb2_next_header(char *buf)
+static int smb2_next_header(struct TCP_Server_Info *server, char *buf,
+ unsigned int *noff)
{
struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
struct smb2_transform_hdr *t_hdr = (struct smb2_transform_hdr *)buf;
- if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM)
- return sizeof(struct smb2_transform_hdr) +
- le32_to_cpu(t_hdr->OriginalMessageSize);
-
- return le32_to_cpu(hdr->NextCommand);
+ if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
+ *noff = le32_to_cpu(t_hdr->OriginalMessageSize);
+ if (unlikely(check_add_overflow(*noff, sizeof(*t_hdr), noff)))
+ return -EINVAL;
+ } else {
+ *noff = le32_to_cpu(hdr->NextCommand);
+ }
+ if (unlikely(*noff && *noff < MID_HEADER_SIZE(server)))
+ return -EINVAL;
+ return 0;
}
-static int
-smb2_make_node(unsigned int xid, struct inode *inode,
- struct dentry *dentry, struct cifs_tcon *tcon,
- const char *full_path, umode_t mode, dev_t dev)
+int cifs_sfu_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev)
{
- struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- int rc = -EPERM;
struct cifs_open_info_data buf = {};
- struct cifs_io_parms io_parms = {0};
- __u32 oplock = 0;
- struct cifs_fid fid;
+ struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_open_parms oparms;
+ struct cifs_io_parms io_parms = {};
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifs_fid fid;
unsigned int bytes_written;
struct win_dev *pdev;
struct kvec iov[2];
+ __u32 oplock = server->oplocks ? REQ_OPLOCK : 0;
+ int rc;
- /*
- * Check if mounted with mount parm 'sfu' mount parm.
- * SFU emulation should work with all servers, but only
- * supports block and char device (no socket & fifo),
- * and was used by default in earlier versions of Windows
- */
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
- return rc;
-
- /*
- * TODO: Add ability to create instead via reparse point. Windows (e.g.
- * their current NFS server) uses this approach to expose special files
- * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions
- */
-
- if (!S_ISCHR(mode) && !S_ISBLK(mode))
- return rc;
-
- cifs_dbg(FYI, "sfu compat create special file\n");
+ if (!S_ISCHR(mode) && !S_ISBLK(mode) && !S_ISFIFO(mode))
+ return -EPERM;
oparms = (struct cifs_open_parms) {
.tcon = tcon,
@@ -5103,11 +5136,7 @@ smb2_make_node(unsigned int xid, struct inode *inode,
.fid = &fid,
};
- if (tcon->ses->server->oplocks)
- oplock = REQ_OPLOCK;
- else
- oplock = 0;
- rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &buf);
+ rc = server->ops->open(xid, &oparms, &oplock, &buf);
if (rc)
return rc;
@@ -5115,36 +5144,56 @@ smb2_make_node(unsigned int xid, struct inode *inode,
* BB Do not bother to decode buf since no local inode yet to put
* timestamps in, but we can reuse it safely.
*/
-
pdev = (struct win_dev *)&buf.fi;
io_parms.pid = current->tgid;
io_parms.tcon = tcon;
- io_parms.offset = 0;
- io_parms.length = sizeof(struct win_dev);
- iov[1].iov_base = &buf.fi;
- iov[1].iov_len = sizeof(struct win_dev);
+ io_parms.length = sizeof(*pdev);
+ iov[1].iov_base = pdev;
+ iov[1].iov_len = sizeof(*pdev);
if (S_ISCHR(mode)) {
memcpy(pdev->type, "IntxCHR", 8);
pdev->major = cpu_to_le64(MAJOR(dev));
pdev->minor = cpu_to_le64(MINOR(dev));
- rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
- &bytes_written, iov, 1);
} else if (S_ISBLK(mode)) {
memcpy(pdev->type, "IntxBLK", 8);
pdev->major = cpu_to_le64(MAJOR(dev));
pdev->minor = cpu_to_le64(MINOR(dev));
- rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
- &bytes_written, iov, 1);
+ } else if (S_ISFIFO(mode)) {
+ memcpy(pdev->type, "LnxFIFO", 8);
}
- tcon->ses->server->ops->close(xid, tcon, &fid);
- d_drop(dentry);
+ rc = server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written, iov, 1);
+ server->ops->close(xid, tcon, &fid);
+ d_drop(dentry);
/* FIXME: add code here to set EAs */
-
cifs_free_open_info(&buf);
return rc;
}
+static int smb2_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+
+ /*
+ * Check if mounted with the 'sfu' mount parm.
+ * SFU emulation should work with all servers, but only
+ * supports block and char device (no socket & fifo),
+ * and was used by default in earlier versions of Windows
+ */
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
+ return -EPERM;
+ /*
+ * TODO: Add ability to create instead via reparse point. Windows (e.g.
+ * their current NFS server) uses this approach to expose special files
+ * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions
+ */
+ return cifs_sfu_make_node(xid, inode, dentry, tcon,
+ full_path, mode, dev);
+}
+
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
struct smb_version_operations smb20_operations = {
.compare_fids = smb2_compare_fids,
@@ -5195,7 +5244,7 @@ struct smb_version_operations smb20_operations = {
.unlink = smb2_unlink,
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
- .query_symlink = smb2_query_symlink,
+ .parse_reparse_point = smb2_parse_reparse_point,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
.open = smb2_open_file,
@@ -5297,7 +5346,7 @@ struct smb_version_operations smb21_operations = {
.unlink = smb2_unlink,
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
- .query_symlink = smb2_query_symlink,
+ .parse_reparse_point = smb2_parse_reparse_point,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
.open = smb2_open_file,
@@ -5402,7 +5451,7 @@ struct smb_version_operations smb30_operations = {
.unlink = smb2_unlink,
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
- .query_symlink = smb2_query_symlink,
+ .parse_reparse_point = smb2_parse_reparse_point,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
.open = smb2_open_file,
@@ -5516,7 +5565,7 @@ struct smb_version_operations smb311_operations = {
.unlink = smb2_unlink,
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
- .query_symlink = smb2_query_symlink,
+ .parse_reparse_point = smb2_parse_reparse_point,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
.open = smb2_open_file,
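Several hunks in this smb2ops.c diff (smb2_next_header(), receive_encrypted_standard(), smb2_query_reparse_point()) replace bare offset arithmetic with explicit overflow and range checks before an offset is used to index into the PDU. A standalone sketch of that pattern, using the compiler builtin that backs check_add_overflow(); the helper and constants below are hypothetical:

#include <stdint.h>
#include <stdio.h>

/*
 * Compute the offset of the next chained command, rejecting values that
 * overflow, that land inside the fixed header, or that point past the
 * bytes actually read.
 */
static int toy_next_offset(uint32_t declared_size, uint32_t header_size,
                           uint32_t pdu_len, uint32_t *next)
{
        uint32_t off;

        if (__builtin_add_overflow(declared_size, header_size, &off))
                return -1;              /* 32-bit wrap-around */
        if (off && off < header_size)
                return -1;              /* lands inside the header */
        if (off > pdu_len)
                return -1;              /* past what was actually read */
        *next = off;
        return 0;
}

int main(void)
{
        uint32_t next;

        printf("%d\n", toy_next_offset(200, 64, 4096, &next));         /*  0 */
        printf("%d\n", toy_next_offset(UINT32_MAX, 64, 4096, &next));  /* -1 */
        return 0;
}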
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index c75a80bb6d9e..4f971c1061f0 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -158,11 +158,14 @@ out:
static int
smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
- struct TCP_Server_Info *server)
+ struct TCP_Server_Info *server, bool from_reconnect)
{
int rc = 0;
struct nls_table *nls_codepage = NULL;
struct cifs_ses *ses;
+ int xid;
+ struct TCP_Server_Info *pserver;
+ unsigned int chan_index;
/*
* SMB2s NegProt, SessSetup, Logoff do not have tcon yet so
@@ -223,6 +226,12 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
return -EAGAIN;
}
}
+
+ /* if server is marked for termination, cifsd will cleanup */
+ if (server->terminate) {
+ spin_unlock(&server->srv_lock);
+ return -EHOSTDOWN;
+ }
spin_unlock(&server->srv_lock);
again:
@@ -242,11 +251,23 @@ again:
mutex_lock(&ses->session_mutex);
/*
+ * if this is called by delayed work, and the channel has been disabled
+ * in parallel, the delayed work can continue to execute in parallel
+ * there's a chance that this channel may not exist anymore
+ */
+ spin_lock(&server->srv_lock);
+ if (server->tcpStatus == CifsExiting) {
+ spin_unlock(&server->srv_lock);
+ mutex_unlock(&ses->session_mutex);
+ rc = -EHOSTDOWN;
+ goto out;
+ }
+
+ /*
* Recheck after acquire mutex. If another thread is negotiating
* and the server never sends an answer the socket will be closed
* and tcpStatus set to reconnect.
*/
- spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsNeedReconnect) {
spin_unlock(&server->srv_lock);
mutex_unlock(&ses->session_mutex);
@@ -283,6 +304,53 @@ again:
rc = cifs_negotiate_protocol(0, ses, server);
if (!rc) {
+ /*
+ * if server stopped supporting multichannel
+ * and the first channel reconnected, disable all the others.
+ */
+ if (ses->chan_count > 1 &&
+ !(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
+ if (SERVER_IS_CHAN(server)) {
+ cifs_dbg(VFS, "server %s does not support " \
+ "multichannel anymore. skipping secondary channel\n",
+ ses->server->hostname);
+
+ spin_lock(&ses->chan_lock);
+ chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX) {
+ spin_unlock(&ses->chan_lock);
+ goto skip_terminate;
+ }
+
+ ses->chans[chan_index].server = NULL;
+ spin_unlock(&ses->chan_lock);
+
+ /*
+ * the above reference of server by channel
+ * needs to be dropped without holding chan_lock
+ * as cifs_put_tcp_session takes a higher lock
+ * i.e. cifs_tcp_ses_lock
+ */
+ cifs_put_tcp_session(server, from_reconnect);
+
+ server->terminate = true;
+ cifs_signal_cifsd_for_reconnect(server, false);
+
+ /* mark primary server as needing reconnect */
+ pserver = server->primary_server;
+ cifs_signal_cifsd_for_reconnect(pserver, false);
+
+skip_terminate:
+ mutex_unlock(&ses->session_mutex);
+ rc = -EHOSTDOWN;
+ goto out;
+ } else {
+ cifs_server_dbg(VFS, "does not support " \
+ "multichannel anymore. disabling all other channels\n");
+ cifs_disable_secondary_channels(ses);
+ }
+ }
+
rc = cifs_setup_session(0, ses, server, nls_codepage);
if ((rc == -EACCES) && !tcon->retry) {
mutex_unlock(&ses->session_mutex);
@@ -307,15 +375,41 @@ skip_sess_setup:
tcon->need_reopen_files = true;
rc = cifs_tree_connect(0, tcon, nls_codepage);
- mutex_unlock(&ses->session_mutex);
cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
if (rc) {
/* If sess reconnected but tcon didn't, something strange ... */
+ mutex_unlock(&ses->session_mutex);
cifs_dbg(VFS, "reconnect tcon failed rc = %d\n", rc);
goto out;
}
+ if (!rc &&
+ (server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
+ mutex_unlock(&ses->session_mutex);
+
+ /*
+ * query server network interfaces, in case they change
+ */
+ xid = get_xid();
+ rc = SMB3_request_interfaces(xid, tcon, false);
+ free_xid(xid);
+
+ if (rc)
+ cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
+ __func__, rc);
+
+ if (ses->chan_max > ses->chan_count &&
+ !SERVER_IS_CHAN(server)) {
+ if (ses->chan_count == 1)
+ cifs_server_dbg(VFS, "supports multichannel now\n");
+
+ cifs_try_adding_channels(ses);
+ }
+ } else {
+ mutex_unlock(&ses->session_mutex);
+ }
+
if (smb2_command != SMB2_INTERNAL_CMD)
mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
@@ -376,10 +470,15 @@ static int __smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
void **request_buf, unsigned int *total_len)
{
/* BB eventually switch this to SMB2 specific small buf size */
- if (smb2_command == SMB2_SET_INFO)
+ switch (smb2_command) {
+ case SMB2_SET_INFO:
+ case SMB2_QUERY_INFO:
*request_buf = cifs_buf_get();
- else
+ break;
+ default:
*request_buf = cifs_small_buf_get();
+ break;
+ }
if (*request_buf == NULL) {
/* BB should we add a retry in here if not a writepage? */
return -ENOMEM;
@@ -404,7 +503,7 @@ static int smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
{
int rc;
- rc = smb2_reconnect(smb2_command, tcon, server);
+ rc = smb2_reconnect(smb2_command, tcon, server, false);
if (rc)
return rc;
@@ -2141,17 +2240,18 @@ parse_posix_ctxt(struct create_context *cc, struct smb2_file_all_info *info,
posix->nlink, posix->mode, posix->reparse_tag);
}
-void
-smb2_parse_contexts(struct TCP_Server_Info *server,
- struct smb2_create_rsp *rsp,
- unsigned int *epoch, char *lease_key, __u8 *oplock,
- struct smb2_file_all_info *buf,
- struct create_posix_rsp *posix)
+int smb2_parse_contexts(struct TCP_Server_Info *server,
+ struct kvec *rsp_iov,
+ unsigned int *epoch,
+ char *lease_key, __u8 *oplock,
+ struct smb2_file_all_info *buf,
+ struct create_posix_rsp *posix)
{
- char *data_offset;
+ struct smb2_create_rsp *rsp = rsp_iov->iov_base;
struct create_context *cc;
- unsigned int next;
- unsigned int remaining;
+ size_t rem, off, len;
+ size_t doff, dlen;
+ size_t noff, nlen;
char *name;
static const char smb3_create_tag_posix[] = {
0x93, 0xAD, 0x25, 0x50, 0x9C,
@@ -2160,45 +2260,63 @@ smb2_parse_contexts(struct TCP_Server_Info *server,
};
*oplock = 0;
- data_offset = (char *)rsp + le32_to_cpu(rsp->CreateContextsOffset);
- remaining = le32_to_cpu(rsp->CreateContextsLength);
- cc = (struct create_context *)data_offset;
+
+ off = le32_to_cpu(rsp->CreateContextsOffset);
+ rem = le32_to_cpu(rsp->CreateContextsLength);
+ if (check_add_overflow(off, rem, &len) || len > rsp_iov->iov_len)
+ return -EINVAL;
+ cc = (struct create_context *)((u8 *)rsp + off);
/* Initialize inode number to 0 in case no valid data in qfid context */
if (buf)
buf->IndexNumber = 0;
- while (remaining >= sizeof(struct create_context)) {
- name = le16_to_cpu(cc->NameOffset) + (char *)cc;
- if (le16_to_cpu(cc->NameLength) == 4 &&
- strncmp(name, SMB2_CREATE_REQUEST_LEASE, 4) == 0)
- *oplock = server->ops->parse_lease_buf(cc, epoch,
- lease_key);
- else if (buf && (le16_to_cpu(cc->NameLength) == 4) &&
- strncmp(name, SMB2_CREATE_QUERY_ON_DISK_ID, 4) == 0)
- parse_query_id_ctxt(cc, buf);
- else if ((le16_to_cpu(cc->NameLength) == 16)) {
- if (posix &&
- memcmp(name, smb3_create_tag_posix, 16) == 0)
+ while (rem >= sizeof(*cc)) {
+ doff = le16_to_cpu(cc->DataOffset);
+ dlen = le32_to_cpu(cc->DataLength);
+ if (check_add_overflow(doff, dlen, &len) || len > rem)
+ return -EINVAL;
+
+ noff = le16_to_cpu(cc->NameOffset);
+ nlen = le16_to_cpu(cc->NameLength);
+ if (noff + nlen >= doff)
+ return -EINVAL;
+
+ name = (char *)cc + noff;
+ switch (nlen) {
+ case 4:
+ if (!strncmp(name, SMB2_CREATE_REQUEST_LEASE, 4)) {
+ *oplock = server->ops->parse_lease_buf(cc, epoch,
+ lease_key);
+ } else if (buf &&
+ !strncmp(name, SMB2_CREATE_QUERY_ON_DISK_ID, 4)) {
+ parse_query_id_ctxt(cc, buf);
+ }
+ break;
+ case 16:
+ if (posix && !memcmp(name, smb3_create_tag_posix, 16))
parse_posix_ctxt(cc, buf, posix);
+ break;
+ default:
+ cifs_dbg(FYI, "%s: unhandled context (nlen=%zu dlen=%zu)\n",
+ __func__, nlen, dlen);
+ if (IS_ENABLED(CONFIG_CIFS_DEBUG2))
+ cifs_dump_mem("context data: ", cc, dlen);
+ break;
}
- /* else {
- cifs_dbg(FYI, "Context not matched with len %d\n",
- le16_to_cpu(cc->NameLength));
- cifs_dump_mem("Cctxt name: ", name, 4);
- } */
-
- next = le32_to_cpu(cc->Next);
- if (!next)
+
+ off = le32_to_cpu(cc->Next);
+ if (!off)
break;
- remaining -= next;
- cc = (struct create_context *)((char *)cc + next);
+ if (check_sub_overflow(rem, off, &rem))
+ return -EINVAL;
+ cc = (struct create_context *)((u8 *)cc + off);
}
if (rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE)
*oplock = rsp->OplockLevel;
- return;
+ return 0;
}
static int
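The smb2_parse_contexts() rewrite above validates each variable-length create context before using it: the overall offset/length pair must fit in the response iov, every context's name and data ranges must stay within the remaining bytes, and the chain is advanced via the Next field with an explicit bounds check. A self-contained sketch of walking such a chained structure; the field layout below is a toy, not the SMB2 wire format:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy chained context: next == 0 terminates the chain. */
struct toy_ctx {
        uint32_t next;      /* offset from this context to the next one */
        uint16_t name_off;  /* offset of the name, relative to this context */
        uint16_t name_len;
};

static int toy_walk_contexts(const uint8_t *buf, size_t rem)
{
        const uint8_t *cc = buf;

        while (rem >= sizeof(struct toy_ctx)) {
                struct toy_ctx ctx;
                size_t end;

                memcpy(&ctx, cc, sizeof(ctx));  /* avoid unaligned access */

                /* name must lie entirely inside the remaining bytes */
                if (__builtin_add_overflow((size_t)ctx.name_off,
                                           (size_t)ctx.name_len, &end) ||
                    end > rem)
                        return -1;
                printf("context with %u-byte name at +%u\n",
                       ctx.name_len, ctx.name_off);

                if (!ctx.next)
                        break;
                if (ctx.next > rem)
                        return -1;      /* would walk past the buffer */
                rem -= ctx.next;
                cc += ctx.next;
        }
        return 0;
}

int main(void)
{
        uint8_t buf[64] = {0};
        struct toy_ctx first = { .next = 16, .name_off = 8, .name_len = 4 };
        struct toy_ctx second = { .next = 0, .name_off = 8, .name_len = 4 };

        memcpy(buf, &first, sizeof(first));
        memcpy(buf + 16, &second, sizeof(second));
        printf("%d\n", toy_walk_contexts(buf, sizeof(buf)));  /* 0 */
        return 0;
}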
@@ -3029,8 +3147,8 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
}
- smb2_parse_contexts(server, rsp, &oparms->fid->epoch,
- oparms->fid->lease_key, oplock, buf, posix);
+ rc = smb2_parse_contexts(server, &rsp_iov, &oparms->fid->epoch,
+ oparms->fid->lease_key, oplock, buf, posix);
creat_exit:
SMB2_open_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
@@ -3377,12 +3495,10 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
} else {
trace_smb3_close_done(xid, persistent_fid, tcon->tid,
ses->Suid);
- /*
- * Note that have to subtract 4 since struct network_open_info
- * has a final 4 byte pad that close response does not have
- */
if (pbuf)
- memcpy(pbuf, (char *)&rsp->CreationTime, sizeof(*pbuf) - 4);
+ memcpy(&pbuf->network_open_info,
+ &rsp->network_open_info,
+ sizeof(pbuf->network_open_info));
}
atomic_dec(&tcon->num_remote_opens);
@@ -3475,8 +3591,13 @@ SMB2_query_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
struct smb2_query_info_req *req;
struct kvec *iov = rqst->rq_iov;
unsigned int total_len;
+ size_t len;
int rc;
+ if (unlikely(check_add_overflow(input_len, sizeof(*req), &len) ||
+ len > CIFSMaxBufSize))
+ return -EINVAL;
+
rc = smb2_plain_req_init(SMB2_QUERY_INFO, tcon, server,
(void **) &req, &total_len);
if (rc)
@@ -3498,7 +3619,7 @@ SMB2_query_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
iov[0].iov_base = (char *)req;
/* 1 for Buffer */
- iov[0].iov_len = total_len - 1 + input_len;
+ iov[0].iov_len = len;
return 0;
}
@@ -3506,7 +3627,7 @@ void
SMB2_query_info_free(struct smb_rqst *rqst)
{
if (rqst && rqst->rq_iov)
- cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
+ cifs_buf_release(rqst->rq_iov[0].iov_base); /* request */
}
static int
@@ -3802,12 +3923,28 @@ void smb2_reconnect_server(struct work_struct *work)
int rc;
bool resched = false;
+ /* first check if the ref count has reached 0; if not, increment it */
+ spin_lock(&cifs_tcp_ses_lock);
+ if (!server->srv_count) {
+ spin_unlock(&cifs_tcp_ses_lock);
+ return;
+ }
+ server->srv_count++;
+ spin_unlock(&cifs_tcp_ses_lock);
+
/* If server is a channel, select the primary channel */
pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
/* Prevent simultaneous reconnects that can corrupt tcon->rlist list */
mutex_lock(&pserver->reconnect_mutex);
+ /* if the server is marked for termination, drop the ref count here */
+ if (server->terminate) {
+ cifs_put_tcp_session(server, true);
+ mutex_unlock(&pserver->reconnect_mutex);
+ return;
+ }
+
INIT_LIST_HEAD(&tmp_list);
INIT_LIST_HEAD(&tmp_ses_list);
cifs_dbg(FYI, "Reconnecting tcons and channels\n");
@@ -3852,17 +3989,10 @@ void smb2_reconnect_server(struct work_struct *work)
}
spin_unlock(&ses->chan_lock);
}
- /*
- * Get the reference to server struct to be sure that the last call of
- * cifs_put_tcon() in the loop below won't release the server pointer.
- */
- if (tcon_exist || ses_exist)
- server->srv_count++;
-
spin_unlock(&cifs_tcp_ses_lock);
list_for_each_entry_safe(tcon, tcon2, &tmp_list, rlist) {
- rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server);
+ rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server, true);
if (!rc)
cifs_reopen_persistent_handles(tcon);
else
@@ -3895,7 +4025,7 @@ void smb2_reconnect_server(struct work_struct *work)
/* now reconnect sessions for necessary channels */
list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) {
tcon->ses = ses;
- rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server);
+ rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server, true);
if (rc)
resched = true;
list_del_init(&ses->rlist);
@@ -3910,8 +4040,7 @@ done:
mutex_unlock(&pserver->reconnect_mutex);
/* now we can safely release srv struct */
- if (tcon_exist || ses_exist)
- cifs_put_tcp_session(server, 1);
+ cifs_put_tcp_session(server, true);
}
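
The reconnect worker now pins the server up front: it takes a reference under cifs_tcp_ses_lock only if srv_count is still non-zero, re-checks the terminate flag once the reconnect mutex is held, and drops the reference on every exit path, instead of bumping the count later only when tcons or sessions were queued. A hedged sketch of that "get only if still live" pattern is below, using plain pthread locking and a hypothetical struct conn rather than the real TCP_Server_Info/cifs_put_tcp_session machinery.

#include <pthread.h>
#include <stdbool.h>

/* Hypothetical refcounted object, for illustration only. */
struct conn {
	pthread_mutex_t lock;
	unsigned int refcount;	/* 0 means the object is being torn down */
	bool terminate;
};

/* Take a reference only if the object is still live. */
bool conn_get_if_live(struct conn *c)
{
	bool ok;

	pthread_mutex_lock(&c->lock);
	ok = c->refcount != 0;
	if (ok)
		c->refcount++;
	pthread_mutex_unlock(&c->lock);
	return ok;
}

void conn_put(struct conn *c)
{
	pthread_mutex_lock(&c->lock);
	c->refcount--;	/* caller frees the object when this reaches 0 */
	pthread_mutex_unlock(&c->lock);
}

/* Worker shape mirroring smb2_reconnect_server(): grab the reference
 * first, re-check for termination once the work lock is held, and drop
 * the reference on every exit path. */
void reconnect_worker(struct conn *c, pthread_mutex_t *work_lock)
{
	if (!conn_get_if_live(c))
		return;

	pthread_mutex_lock(work_lock);
	if (c->terminate) {
		conn_put(c);
		pthread_mutex_unlock(work_lock);
		return;
	}

	/* ... do the reconnect work ... */

	pthread_mutex_unlock(work_lock);
	conn_put(c);
}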
int
@@ -5373,6 +5502,11 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon,
return 0;
}
+static inline void free_qfs_info_req(struct kvec *iov)
+{
+ cifs_buf_release(iov->iov_base);
+}
+
int
SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, struct kstatfs *fsdata)
@@ -5404,7 +5538,7 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon,
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
- cifs_small_buf_release(iov.iov_base);
+ free_qfs_info_req(&iov);
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
goto posix_qfsinf_exit;
@@ -5455,7 +5589,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
- cifs_small_buf_release(iov.iov_base);
+ free_qfs_info_req(&iov);
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
goto qfsinf_exit;
@@ -5522,7 +5656,7 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
- cifs_small_buf_release(iov.iov_base);
+ free_qfs_info_req(&iov);
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
goto qfsattr_exit;
diff --git a/fs/smb/client/smb2pdu.h b/fs/smb/client/smb2pdu.h
index 220994d0a0f7..db08194484e0 100644
--- a/fs/smb/client/smb2pdu.h
+++ b/fs/smb/client/smb2pdu.h
@@ -319,13 +319,15 @@ struct smb2_file_reparse_point_info {
} __packed;
struct smb2_file_network_open_info {
- __le64 CreationTime;
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le64 AllocationSize;
- __le64 EndOfFile;
- __le32 Attributes;
+ struct_group(network_open_info,
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 AllocationSize;
+ __le64 EndOfFile;
+ __le32 Attributes;
+ );
__le32 Reserved;
} __packed; /* level 34 Query also similar returned in close rsp and open rsp */
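
The struct_group() markers added to smb2_file_network_open_info (and to smb2_close_rsp later in this diff) let the close path copy the whole CreationTime..Attributes run with a single bounded memcpy(), as seen in the __SMB2_close() hunk earlier, instead of the old sizeof(*pbuf) - 4 adjustment. struct_group() lives in include/linux/stddef.h and wraps the members in an anonymous union so they are addressable both individually and as one named mirror struct. The sketch below reproduces the idea with a simplified macro and hypothetical structs; it is illustrative, not the kernel definition.

#include <stdint.h>
#include <string.h>

/* Simplified stand-in for the kernel's struct_group() (linux/stddef.h):
 * the members exist both individually and as one named sub-struct. */
#define struct_group(NAME, ...)			\
	union {					\
		struct { __VA_ARGS__ };		\
		struct { __VA_ARGS__ } NAME;	\
	}

/* Hypothetical response/info structs sharing the same grouped fields. */
struct close_rsp {
	uint32_t reserved;
	struct_group(times,
		uint64_t creation;
		uint64_t last_access;
		uint64_t last_write;
	);
} __attribute__((packed));

struct open_info {
	struct_group(times,
		uint64_t creation;
		uint64_t last_access;
		uint64_t last_write;
	);
	uint32_t attributes;
} __attribute__((packed));

void copy_times(struct open_info *dst, const struct close_rsp *src)
{
	/* one bounded copy of the whole group, no manual size adjustment */
	memcpy(&dst->times, &src->times, sizeof(dst->times));
}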
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index 46eff9ec302a..0e371f7e2854 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -251,11 +251,13 @@ extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *);
extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *,
enum securityEnum);
-extern void smb2_parse_contexts(struct TCP_Server_Info *server,
- struct smb2_create_rsp *rsp,
- unsigned int *epoch, char *lease_key,
- __u8 *oplock, struct smb2_file_all_info *buf,
- struct create_posix_rsp *posix);
+int smb2_parse_contexts(struct TCP_Server_Info *server,
+ struct kvec *rsp_iov,
+ unsigned int *epoch,
+ char *lease_key, __u8 *oplock,
+ struct smb2_file_all_info *buf,
+ struct create_posix_rsp *posix);
+
extern int smb3_encryption_required(const struct cifs_tcon *tcon);
extern int smb2_validate_iov(unsigned int offset, unsigned int buffer_length,
struct kvec *iov, unsigned int min_buf_size);
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index 23c50ed7d4b5..5a3ca62d2f07 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -413,7 +413,13 @@ generate_smb3signingkey(struct cifs_ses *ses,
ses->ses_status == SES_GOOD);
chan_index = cifs_ses_get_chan_index(ses, server);
- /* TODO: introduce ref counting for channels when the can be freed */
+ if (chan_index == CIFS_INVAL_CHAN_INDEX) {
+ spin_unlock(&ses->chan_lock);
+ spin_unlock(&ses->ses_lock);
+
+ return -EINVAL;
+ }
+
spin_unlock(&ses->chan_lock);
spin_unlock(&ses->ses_lock);
@@ -452,6 +458,8 @@ generate_smb3signingkey(struct cifs_ses *ses,
ptriplet->encryption.context,
ses->smb3encryptionkey,
SMB3_ENC_DEC_KEY_SIZE);
+ if (rc)
+ return rc;
rc = generate_key(ses, ptriplet->decryption.label,
ptriplet->decryption.context,
ses->smb3decryptionkey,
@@ -460,9 +468,6 @@ generate_smb3signingkey(struct cifs_ses *ses,
return rc;
}
- if (rc)
- return rc;
-
#ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS
cifs_dbg(VFS, "%s: dumping generated AES session keys\n", __func__);
/*
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 14710afdc2a3..4f717ad7c21b 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -76,7 +76,7 @@ alloc_mid(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
return temp;
}
-static void __release_mid(struct kref *refcount)
+void __release_mid(struct kref *refcount)
{
struct mid_q_entry *midEntry =
container_of(refcount, struct mid_q_entry, refcount);
@@ -156,15 +156,6 @@ static void __release_mid(struct kref *refcount)
mempool_free(midEntry, cifs_mid_poolp);
}
-void release_mid(struct mid_q_entry *mid)
-{
- struct TCP_Server_Info *server = mid->server;
-
- spin_lock(&server->mid_lock);
- kref_put(&mid->refcount, __release_mid);
- spin_unlock(&server->mid_lock);
-}
-
void
delete_mid(struct mid_q_entry *mid)
{
@@ -1032,7 +1023,7 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses)
spin_lock(&ses->chan_lock);
for (i = 0; i < ses->chan_count; i++) {
server = ses->chans[i].server;
- if (!server)
+ if (!server || server->terminate)
continue;
/*
diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c
index 4ad5531686d8..6780aa3e98a1 100644
--- a/fs/smb/client/xattr.c
+++ b/fs/smb/client/xattr.c
@@ -150,10 +150,13 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto out;
- if (pTcon->ses->server->ops->set_EA)
+ if (pTcon->ses->server->ops->set_EA) {
rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
full_path, name, value, (__u16)size,
cifs_sb->local_nls, cifs_sb);
+ if (rc == 0)
+ inode_set_ctime_current(inode);
+ }
break;
case XATTR_CIFS_ACL:
@@ -478,7 +481,7 @@ static const struct xattr_handler smb3_ntsd_full_xattr_handler = {
.set = cifs_xattr_set,
};
-const struct xattr_handler *cifs_xattr_handlers[] = {
+const struct xattr_handler * const cifs_xattr_handlers[] = {
&cifs_user_xattr_handler,
&cifs_os2_xattr_handler,
&cifs_cifs_acl_xattr_handler,
diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h
index 319fb9ffc6a0..57f2343164a3 100644
--- a/fs/smb/common/smb2pdu.h
+++ b/fs/smb/common/smb2pdu.h
@@ -34,6 +34,7 @@
#define SMB2_QUERY_INFO_HE 0x0010
#define SMB2_SET_INFO_HE 0x0011
#define SMB2_OPLOCK_BREAK_HE 0x0012
+#define SMB2_SERVER_TO_CLIENT_NOTIFICATION 0x0013
/* The same list in little endian */
#define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE)
@@ -411,6 +412,7 @@ struct smb2_tree_disconnect_rsp {
#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */
#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */
#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_NOTIFICATIONS 0x00000080 /* New to SMB3.1.1 */
/* Internal types */
#define SMB2_NT_FIND 0x00100000
#define SMB2_LARGE_FILES 0x00200000
@@ -700,13 +702,16 @@ struct smb2_close_rsp {
__le16 StructureSize; /* 60 */
__le16 Flags;
__le32 Reserved;
- __le64 CreationTime;
- __le64 LastAccessTime;
- __le64 LastWriteTime;
- __le64 ChangeTime;
- __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */
- __le64 EndOfFile;
- __le32 Attributes;
+ struct_group(network_open_info,
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ /* Beginning of FILE_STANDARD_INFO equivalent */
+ __le64 AllocationSize;
+ __le64 EndOfFile;
+ __le32 Attributes;
+ );
} __packed;
@@ -981,6 +986,19 @@ struct smb2_change_notify_rsp {
__u8 Buffer[]; /* array of file notify structs */
} __packed;
+/*
+ * SMB2_SERVER_TO_CLIENT_NOTIFICATION: See MS-SMB2 section 2.2.44
+ */
+
+#define SMB2_NOTIFY_SESSION_CLOSED 0x0000
+
+struct smb2_server_client_notification {
+ struct smb2_hdr hdr;
+ __le16 StructureSize;
+ __u16 Reserved; /* MBZ */
+ __le32 NotificationType;
+ __u8 NotificationBuffer[4]; /* MBZ */
+} __packed;
/*
* SMB2_CREATE See MS-SMB2 section 2.2.13
@@ -1097,16 +1115,23 @@ struct smb2_change_notify_rsp {
#define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002)
#define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004)
#define FILE_NO_INTERMEDIATE_BUFFERING_LE cpu_to_le32(0x00000008)
+/* FILE_SYNCHRONOUS_IO_ALERT_LE cpu_to_le32(0x00000010) should be zero, ignored */
+/* FILE_SYNCHRONOUS_IO_NONALERT cpu_to_le32(0x00000020) should be zero, ignored */
#define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040)
#define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100)
#define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200)
+/* FILE_OPEN_REMOTE_INSTANCE cpu_to_le32(0x00000400) should be zero, ignored */
#define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800)
-#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000)
+#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000) /* MBZ */
#define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000)
#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000)
#define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000)
+/* FILE_OPEN_REQUIRING_OPLOCK cpu_to_le32(0x00010000) should be zero, ignored */
+/* FILE_DISALLOW_EXCLUSIVE cpu_to_le32(0x00020000) should be zero, ignored */
+/* FILE_RESERVE_OPFILTER cpu_to_le32(0x00100000) MBZ */
#define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000)
#define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000)
+/* #define FILE_OPEN_FOR_FREE_SPACE_QUERY cpu_to_le32(0x00800000) should be zero, ignored */
#define CREATE_OPTIONS_MASK_LE cpu_to_le32(0x00FFFFFF)
#define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \
@@ -1120,7 +1145,7 @@ struct smb2_change_notify_rsp {
#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */
#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ"
#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC"
-#define SMB2_CREATE_ALLOCATION_SIZE "AISi"
+#define SMB2_CREATE_ALLOCATION_SIZE "AlSi"
#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc"
#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp"
#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid"
@@ -1228,6 +1253,7 @@ struct create_mxac_rsp {
#define SMB2_LEASE_WRITE_CACHING_LE cpu_to_le32(0x04)
#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE cpu_to_le32(0x02)
+#define SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE cpu_to_le32(0x04)
#define SMB2_LEASE_KEY_SIZE 16
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index 4b38c3a285f6..b6fa1e285c40 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -167,23 +167,7 @@ void ksmbd_all_conn_set_status(u64 sess_id, u32 status)
void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id)
{
- struct ksmbd_conn *bind_conn;
-
wait_event(conn->req_running_q, atomic_read(&conn->req_running) < 2);
-
- down_read(&conn_list_lock);
- list_for_each_entry(bind_conn, &conn_list, conns_list) {
- if (bind_conn == conn)
- continue;
-
- if ((bind_conn->binding || xa_load(&bind_conn->sessions, sess_id)) &&
- !ksmbd_conn_releasing(bind_conn) &&
- atomic_read(&bind_conn->req_running)) {
- wait_event(bind_conn->req_running_q,
- atomic_read(&bind_conn->req_running) == 0);
- }
- }
- up_read(&conn_list_lock);
}
int ksmbd_conn_write(struct ksmbd_work *work)
diff --git a/fs/smb/server/ksmbd_spnego_negtokeninit.asn1 b/fs/smb/server/ksmbd_spnego_negtokeninit.asn1
index 0065f191b54b..001513806fc0 100644
--- a/fs/smb/server/ksmbd_spnego_negtokeninit.asn1
+++ b/fs/smb/server/ksmbd_spnego_negtokeninit.asn1
@@ -1,3 +1,11 @@
+-- SPDX-License-Identifier: BSD-3-Clause
+--
+-- Copyright (C) 1998, 2000 IETF Trust and the persons identified as authors
+-- of the code
+--
+-- https://www.rfc-editor.org/rfc/rfc2478#section-3.2.1
+-- https://www.rfc-editor.org/rfc/rfc2743#section-3.1
+
GSSAPI ::=
[APPLICATION 0] IMPLICIT SEQUENCE {
thisMech
diff --git a/fs/smb/server/ksmbd_spnego_negtokentarg.asn1 b/fs/smb/server/ksmbd_spnego_negtokentarg.asn1
index 1151933e7b9c..797e485d57f1 100644
--- a/fs/smb/server/ksmbd_spnego_negtokentarg.asn1
+++ b/fs/smb/server/ksmbd_spnego_negtokentarg.asn1
@@ -1,3 +1,10 @@
+-- SPDX-License-Identifier: BSD-3-Clause
+--
+-- Copyright (C) 1998 IETF Trust and the persons identified as authors
+-- of the code
+--
+-- https://www.rfc-editor.org/rfc/rfc2478#section-3.2.1
+
GSSAPI ::=
CHOICE {
negTokenInit
diff --git a/fs/smb/server/ksmbd_work.c b/fs/smb/server/ksmbd_work.c
index 51def3ca74c0..d7c676c151e2 100644
--- a/fs/smb/server/ksmbd_work.c
+++ b/fs/smb/server/ksmbd_work.c
@@ -56,6 +56,9 @@ void ksmbd_free_work_struct(struct ksmbd_work *work)
kfree(work->tr_buf);
kvfree(work->request_buf);
kfree(work->iov);
+ if (!list_empty(&work->interim_entry))
+ list_del(&work->interim_entry);
+
if (work->async_id)
ksmbd_release_id(&work->conn->async_ida, work->async_id);
kmem_cache_free(work_cache, work);
@@ -95,32 +98,42 @@ bool ksmbd_queue_work(struct ksmbd_work *work)
return queue_work(ksmbd_wq, &work->work);
}
-static int ksmbd_realloc_iov_pin(struct ksmbd_work *work, void *ib,
- unsigned int ib_len)
+static inline void __ksmbd_iov_pin(struct ksmbd_work *work, void *ib,
+ unsigned int ib_len)
+{
+ work->iov[++work->iov_idx].iov_base = ib;
+ work->iov[work->iov_idx].iov_len = ib_len;
+ work->iov_cnt++;
+}
+
+static int __ksmbd_iov_pin_rsp(struct ksmbd_work *work, void *ib, int len,
+ void *aux_buf, unsigned int aux_size)
{
+ struct aux_read *ar = NULL;
+ int need_iov_cnt = 1;
+
+ if (aux_size) {
+ need_iov_cnt++;
+ ar = kmalloc(sizeof(struct aux_read), GFP_KERNEL);
+ if (!ar)
+ return -ENOMEM;
+ }
- if (work->iov_alloc_cnt <= work->iov_cnt) {
+ if (work->iov_alloc_cnt < work->iov_cnt + need_iov_cnt) {
struct kvec *new;
work->iov_alloc_cnt += 4;
new = krealloc(work->iov,
sizeof(struct kvec) * work->iov_alloc_cnt,
GFP_KERNEL | __GFP_ZERO);
- if (!new)
+ if (!new) {
+ kfree(ar);
+ work->iov_alloc_cnt -= 4;
return -ENOMEM;
+ }
work->iov = new;
}
- work->iov[++work->iov_idx].iov_base = ib;
- work->iov[work->iov_idx].iov_len = ib_len;
- work->iov_cnt++;
-
- return 0;
-}
-
-static int __ksmbd_iov_pin_rsp(struct ksmbd_work *work, void *ib, int len,
- void *aux_buf, unsigned int aux_size)
-{
/* Plus rfc_length size on first iov */
if (!work->iov_idx) {
work->iov[work->iov_idx].iov_base = work->response_buf;
@@ -129,19 +142,13 @@ static int __ksmbd_iov_pin_rsp(struct ksmbd_work *work, void *ib, int len,
work->iov_cnt++;
}
- ksmbd_realloc_iov_pin(work, ib, len);
+ __ksmbd_iov_pin(work, ib, len);
inc_rfc1001_len(work->iov[0].iov_base, len);
if (aux_size) {
- struct aux_read *ar;
-
- ksmbd_realloc_iov_pin(work, aux_buf, aux_size);
+ __ksmbd_iov_pin(work, aux_buf, aux_size);
inc_rfc1001_len(work->iov[0].iov_base, aux_size);
- ar = kmalloc(sizeof(struct aux_read), GFP_KERNEL);
- if (!ar)
- return -ENOMEM;
-
ar->buf = aux_buf;
list_add(&ar->entry, &work->aux_read_list);
}
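
The reworked __ksmbd_iov_pin_rsp() above performs every allocation that can fail (the aux_read node and any kvec array growth) before the iovs are pinned and the RFC1001 length is increased, so an -ENOMEM can no longer leave a half-built response behind. The sketch below shows the same "reserve first, commit after" shape with standard C allocation and hypothetical names; it is not the ksmbd code itself.

#include <stdlib.h>
#include <string.h>

struct iovec_buf {
	void *base;
	size_t len;
};

struct response {
	struct iovec_buf *iov;
	size_t iov_cnt;
	size_t iov_alloc;
	size_t total_len;	/* analogous to the RFC1001 length field */
};

/* Append one mandatory buffer and an optional auxiliary buffer.
 * All failures happen before any state is modified. */
int response_pin(struct response *r, void *buf, size_t len,
		 void *aux, size_t aux_len)
{
	size_t need = r->iov_cnt + 1 + (aux_len ? 1 : 0);

	/* reserve: grow the array first, while we can still fail cleanly */
	if (need > r->iov_alloc) {
		size_t new_alloc = r->iov_alloc + 4;
		struct iovec_buf *tmp =
			realloc(r->iov, new_alloc * sizeof(*tmp));
		if (!tmp)
			return -1;
		r->iov = tmp;
		r->iov_alloc = new_alloc;
	}

	/* commit: from here on nothing can fail */
	r->iov[r->iov_cnt++] = (struct iovec_buf){ buf, len };
	r->total_len += len;
	if (aux_len) {
		r->iov[r->iov_cnt++] = (struct iovec_buf){ aux, aux_len };
		r->total_len += aux_len;
	}
	return 0;
}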
diff --git a/fs/smb/server/mgmt/user_config.h b/fs/smb/server/mgmt/user_config.h
index 6a44109617f1..e068a19fd904 100644
--- a/fs/smb/server/mgmt/user_config.h
+++ b/fs/smb/server/mgmt/user_config.h
@@ -18,7 +18,6 @@ struct ksmbd_user {
size_t passkey_sz;
char *passkey;
- unsigned int failed_login_count;
};
static inline bool user_guest(struct ksmbd_user *user)
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index 9bc0103720f5..562b180459a1 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -102,9 +102,10 @@ static int alloc_lease(struct oplock_info *opinfo, struct lease_ctx_info *lctx)
lease->new_state = 0;
lease->flags = lctx->flags;
lease->duration = lctx->duration;
+ lease->is_dir = lctx->is_dir;
memcpy(lease->parent_lease_key, lctx->parent_lease_key, SMB2_LEASE_KEY_SIZE);
lease->version = lctx->version;
- lease->epoch = 0;
+ lease->epoch = le16_to_cpu(lctx->epoch);
INIT_LIST_HEAD(&opinfo->lease_entry);
opinfo->o_lease = lease;
@@ -395,8 +396,8 @@ void close_id_del_oplock(struct ksmbd_file *fp)
{
struct oplock_info *opinfo;
- if (S_ISDIR(file_inode(fp->filp)->i_mode))
- return;
+ if (fp->reserve_lease_break)
+ smb_lazy_parent_lease_break_close(fp);
opinfo = opinfo_get(fp);
if (!opinfo)
@@ -543,12 +544,13 @@ static struct oplock_info *same_client_has_lease(struct ksmbd_inode *ci,
/* upgrading lease */
if ((atomic_read(&ci->op_count) +
atomic_read(&ci->sop_count)) == 1) {
- if (lease->state ==
- (lctx->req_state & lease->state)) {
+ if (lease->state != SMB2_LEASE_NONE_LE &&
+ lease->state == (lctx->req_state & lease->state)) {
lease->state |= lctx->req_state;
if (lctx->req_state &
SMB2_LEASE_WRITE_CACHING_LE)
lease_read_to_write(opinfo);
+
}
} else if ((atomic_read(&ci->op_count) +
atomic_read(&ci->sop_count)) > 1) {
@@ -833,7 +835,8 @@ static int smb2_lease_break_noti(struct oplock_info *opinfo)
interim_entry);
setup_async_work(in_work, NULL, NULL);
smb2_send_interim_resp(in_work, STATUS_PENDING);
- list_del(&in_work->interim_entry);
+ list_del_init(&in_work->interim_entry);
+ release_async_work(in_work);
}
INIT_WORK(&work->work, __smb2_lease_break_noti);
ksmbd_queue_work(work);
@@ -899,7 +902,8 @@ static int oplock_break(struct oplock_info *brk_opinfo, int req_op_level)
lease->new_state =
SMB2_LEASE_READ_CACHING_LE;
} else {
- if (lease->state & SMB2_LEASE_HANDLE_CACHING_LE)
+ if (lease->state & SMB2_LEASE_HANDLE_CACHING_LE &&
+ !lease->is_dir)
lease->new_state =
SMB2_LEASE_READ_CACHING_LE;
else
@@ -1031,6 +1035,7 @@ static void copy_lease(struct oplock_info *op1, struct oplock_info *op2)
SMB2_LEASE_KEY_SIZE);
lease2->duration = lease1->duration;
lease2->flags = lease1->flags;
+ lease2->epoch = lease1->epoch++;
}
static int add_lease_global_list(struct oplock_info *opinfo)
@@ -1080,6 +1085,89 @@ static void set_oplock_level(struct oplock_info *opinfo, int level,
}
}
+void smb_send_parent_lease_break_noti(struct ksmbd_file *fp,
+ struct lease_ctx_info *lctx)
+{
+ struct oplock_info *opinfo;
+ struct ksmbd_inode *p_ci = NULL;
+
+ if (lctx->version != 2)
+ return;
+
+ p_ci = ksmbd_inode_lookup_lock(fp->filp->f_path.dentry->d_parent);
+ if (!p_ci)
+ return;
+
+ read_lock(&p_ci->m_lock);
+ list_for_each_entry(opinfo, &p_ci->m_op_list, op_entry) {
+ if (!opinfo->is_lease)
+ continue;
+
+ if (opinfo->o_lease->state != SMB2_OPLOCK_LEVEL_NONE &&
+ (!(lctx->flags & SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE) ||
+ !compare_guid_key(opinfo, fp->conn->ClientGUID,
+ lctx->parent_lease_key))) {
+ if (!atomic_inc_not_zero(&opinfo->refcount))
+ continue;
+
+ atomic_inc(&opinfo->conn->r_count);
+ if (ksmbd_conn_releasing(opinfo->conn)) {
+ atomic_dec(&opinfo->conn->r_count);
+ continue;
+ }
+
+ read_unlock(&p_ci->m_lock);
+ oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE);
+ opinfo_conn_put(opinfo);
+ read_lock(&p_ci->m_lock);
+ }
+ }
+ read_unlock(&p_ci->m_lock);
+
+ ksmbd_inode_put(p_ci);
+}
+
+void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp)
+{
+ struct oplock_info *opinfo;
+ struct ksmbd_inode *p_ci = NULL;
+
+ rcu_read_lock();
+ opinfo = rcu_dereference(fp->f_opinfo);
+ rcu_read_unlock();
+
+ if (!opinfo->is_lease || opinfo->o_lease->version != 2)
+ return;
+
+ p_ci = ksmbd_inode_lookup_lock(fp->filp->f_path.dentry->d_parent);
+ if (!p_ci)
+ return;
+
+ read_lock(&p_ci->m_lock);
+ list_for_each_entry(opinfo, &p_ci->m_op_list, op_entry) {
+ if (!opinfo->is_lease)
+ continue;
+
+ if (opinfo->o_lease->state != SMB2_OPLOCK_LEVEL_NONE) {
+ if (!atomic_inc_not_zero(&opinfo->refcount))
+ continue;
+
+ atomic_inc(&opinfo->conn->r_count);
+ if (ksmbd_conn_releasing(opinfo->conn)) {
+ atomic_dec(&opinfo->conn->r_count);
+ continue;
+ }
+ read_unlock(&p_ci->m_lock);
+ oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE);
+ opinfo_conn_put(opinfo);
+ read_lock(&p_ci->m_lock);
+ }
+ }
+ read_unlock(&p_ci->m_lock);
+
+ ksmbd_inode_put(p_ci);
+}
+
/**
* smb_grant_oplock() - handle oplock/lease request on file open
* @work: smb work
@@ -1103,10 +1191,6 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid,
bool prev_op_has_lease;
__le32 prev_op_state = 0;
- /* not support directory lease */
- if (S_ISDIR(file_inode(fp->filp)->i_mode))
- return 0;
-
opinfo = alloc_opinfo(work, pid, tid);
if (!opinfo)
return -ENOMEM;
@@ -1363,6 +1447,7 @@ void create_lease_buf(u8 *rbuf, struct lease *lease)
memcpy(buf->lcontext.LeaseKey, lease->lease_key,
SMB2_LEASE_KEY_SIZE);
buf->lcontext.LeaseFlags = lease->flags;
+ buf->lcontext.Epoch = cpu_to_le16(++lease->epoch);
buf->lcontext.LeaseState = lease->state;
memcpy(buf->lcontext.ParentLeaseKey, lease->parent_lease_key,
SMB2_LEASE_KEY_SIZE);
@@ -1399,10 +1484,11 @@ void create_lease_buf(u8 *rbuf, struct lease *lease)
/**
* parse_lease_state() - parse lease context contained in file open request
* @open_req: buffer containing smb2 file open(create) request
+ * @is_dir: whether the file being leased is a directory
*
* Return: oplock state, -ENOENT if create lease context not found
*/
-struct lease_ctx_info *parse_lease_state(void *open_req)
+struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir)
{
struct create_context *cc;
struct smb2_create_req *req = (struct smb2_create_req *)open_req;
@@ -1420,8 +1506,14 @@ struct lease_ctx_info *parse_lease_state(void *open_req)
struct create_lease_v2 *lc = (struct create_lease_v2 *)cc;
memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE);
- lreq->req_state = lc->lcontext.LeaseState;
+ if (is_dir) {
+ lreq->req_state = lc->lcontext.LeaseState &
+ ~SMB2_LEASE_WRITE_CACHING_LE;
+ lreq->is_dir = true;
+ } else
+ lreq->req_state = lc->lcontext.LeaseState;
lreq->flags = lc->lcontext.LeaseFlags;
+ lreq->epoch = lc->lcontext.Epoch;
lreq->duration = lc->lcontext.LeaseDuration;
memcpy(lreq->parent_lease_key, lc->lcontext.ParentLeaseKey,
SMB2_LEASE_KEY_SIZE);
diff --git a/fs/smb/server/oplock.h b/fs/smb/server/oplock.h
index 4b0fe6da7694..5b93ea9196c0 100644
--- a/fs/smb/server/oplock.h
+++ b/fs/smb/server/oplock.h
@@ -34,7 +34,9 @@ struct lease_ctx_info {
__le32 flags;
__le64 duration;
__u8 parent_lease_key[SMB2_LEASE_KEY_SIZE];
+ __le16 epoch;
int version;
+ bool is_dir;
};
struct lease_table {
@@ -53,6 +55,7 @@ struct lease {
__u8 parent_lease_key[SMB2_LEASE_KEY_SIZE];
int version;
unsigned short epoch;
+ bool is_dir;
struct lease_table *l_lb;
};
@@ -108,7 +111,7 @@ void opinfo_put(struct oplock_info *opinfo);
/* Lease related functions */
void create_lease_buf(u8 *rbuf, struct lease *lease);
-struct lease_ctx_info *parse_lease_state(void *open_req);
+struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir);
__u8 smb2_map_lease_to_oplock(__le32 lease_state);
int lease_read_to_write(struct oplock_info *opinfo);
@@ -124,4 +127,7 @@ struct oplock_info *lookup_lease_in_table(struct ksmbd_conn *conn,
int find_same_lease_key(struct ksmbd_session *sess, struct ksmbd_inode *ci,
struct lease_ctx_info *lctx);
void destroy_lease_table(struct ksmbd_conn *conn);
+void smb_send_parent_lease_break_noti(struct ksmbd_file *fp,
+ struct lease_ctx_info *lctx);
+void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp);
#endif /* __KSMBD_OPLOCK_H */
diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c
index 23bd3d1209df..03dded29a980 100644
--- a/fs/smb/server/smb2misc.c
+++ b/fs/smb/server/smb2misc.c
@@ -106,16 +106,25 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
break;
case SMB2_CREATE:
{
+ unsigned short int name_off =
+ le16_to_cpu(((struct smb2_create_req *)hdr)->NameOffset);
+ unsigned short int name_len =
+ le16_to_cpu(((struct smb2_create_req *)hdr)->NameLength);
+
if (((struct smb2_create_req *)hdr)->CreateContextsLength) {
*off = le32_to_cpu(((struct smb2_create_req *)
hdr)->CreateContextsOffset);
*len = le32_to_cpu(((struct smb2_create_req *)
hdr)->CreateContextsLength);
- break;
+ if (!name_len)
+ break;
+
+ if (name_off + name_len < (u64)*off + *len)
+ break;
}
- *off = le16_to_cpu(((struct smb2_create_req *)hdr)->NameOffset);
- *len = le16_to_cpu(((struct smb2_create_req *)hdr)->NameLength);
+ *off = name_off;
+ *len = name_len;
break;
}
case SMB2_QUERY_INFO:
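
In the SMB2_CREATE branch above, the data area is taken from the CreateContexts offset/length only when the name region ends before that area does; otherwise the code falls back to the name region. The comparison is done after widening to 64 bits so the untrusted 16/32-bit request fields cannot wrap. A standalone sketch of that decision is below; the struct is a simplified stand-in for the create request header, not the wire format.

#include <stdint.h>

/* Simplified view of the relevant create request fields. */
struct create_req_hdr {
	uint16_t name_off;
	uint16_t name_len;
	uint32_t ctx_off;
	uint32_t ctx_len;
};

/* Pick the region that describes the variable data area: prefer the
 * create contexts, but only when the name region (if any) ends before
 * the context area does. Widen to 64 bits so additions cannot wrap. */
void pick_data_area(const struct create_req_hdr *req,
		    uint64_t *off, uint64_t *len)
{
	if (req->ctx_len) {
		*off = req->ctx_off;
		*len = req->ctx_len;
		if (!req->name_len)
			return;
		if ((uint64_t)req->name_off + req->name_len <
		    (uint64_t)req->ctx_off + req->ctx_len)
			return;
	}
	*off = req->name_off;
	*len = req->name_len;
}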
diff --git a/fs/smb/server/smb2ops.c b/fs/smb/server/smb2ops.c
index aed7704a0672..27a9dce3e03a 100644
--- a/fs/smb/server/smb2ops.c
+++ b/fs/smb/server/smb2ops.c
@@ -221,7 +221,8 @@ void init_smb3_0_server(struct ksmbd_conn *conn)
conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE;
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES)
- conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING;
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING |
+ SMB2_GLOBAL_CAP_DIRECTORY_LEASING;
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION &&
conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)
@@ -245,7 +246,8 @@ void init_smb3_02_server(struct ksmbd_conn *conn)
conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE;
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES)
- conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING;
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING |
+ SMB2_GLOBAL_CAP_DIRECTORY_LEASING;
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION ||
(!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) &&
@@ -270,7 +272,8 @@ int init_smb3_11_server(struct ksmbd_conn *conn)
conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE;
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES)
- conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING;
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING |
+ SMB2_GLOBAL_CAP_DIRECTORY_LEASING;
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION ||
(!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) &&
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 93262ca3f58a..652ab429bf2e 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -657,13 +657,9 @@ smb2_get_name(const char *src, const int maxlen, struct nls_table *local_nls)
int setup_async_work(struct ksmbd_work *work, void (*fn)(void **), void **arg)
{
- struct smb2_hdr *rsp_hdr;
struct ksmbd_conn *conn = work->conn;
int id;
- rsp_hdr = ksmbd_resp_buf_next(work);
- rsp_hdr->Flags |= SMB2_FLAGS_ASYNC_COMMAND;
-
id = ksmbd_acquire_async_msg_id(&conn->async_ida);
if (id < 0) {
pr_err("Failed to alloc async message id\n");
@@ -671,7 +667,6 @@ int setup_async_work(struct ksmbd_work *work, void (*fn)(void **), void **arg)
}
work->asynchronous = true;
work->async_id = id;
- rsp_hdr->Id.AsyncId = cpu_to_le64(id);
ksmbd_debug(SMB,
"Send interim Response to inform async request id : %d\n",
@@ -723,6 +718,8 @@ void smb2_send_interim_resp(struct ksmbd_work *work, __le32 status)
__SMB2_HEADER_STRUCTURE_SIZE);
rsp_hdr = smb2_get_msg(in_work->response_buf);
+ rsp_hdr->Flags |= SMB2_FLAGS_ASYNC_COMMAND;
+ rsp_hdr->Id.AsyncId = cpu_to_le64(work->async_id);
smb2_set_err_rsp(in_work);
rsp_hdr->Status = status;
@@ -2380,7 +2377,8 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len,
rc = 0;
} else {
rc = ksmbd_vfs_setxattr(idmap, path, attr_name, value,
- le16_to_cpu(eabuf->EaValueLength), 0);
+ le16_to_cpu(eabuf->EaValueLength),
+ 0, true);
if (rc < 0) {
ksmbd_debug(SMB,
"ksmbd_vfs_setxattr is failed(%d)\n",
@@ -2443,7 +2441,7 @@ static noinline int smb2_set_stream_name_xattr(const struct path *path,
return -EBADF;
}
- rc = ksmbd_vfs_setxattr(idmap, path, xattr_stream_name, NULL, 0, 0);
+ rc = ksmbd_vfs_setxattr(idmap, path, xattr_stream_name, NULL, 0, 0, false);
if (rc < 0)
pr_err("Failed to store XATTR stream name :%d\n", rc);
return 0;
@@ -2518,7 +2516,7 @@ static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, const struct path *
da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME |
XATTR_DOSINFO_ITIME;
- rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_idmap(path->mnt), path, &da);
+ rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_idmap(path->mnt), path, &da, true);
if (rc)
ksmbd_debug(SMB, "failed to store file attribute into xattr\n");
}
@@ -2608,7 +2606,7 @@ static int smb2_create_sd_buffer(struct ksmbd_work *work,
sizeof(struct create_sd_buf_req))
return -EINVAL;
return set_info_sec(work->conn, work->tcon, path, &sd_buf->ntsd,
- le32_to_cpu(sd_buf->ccontext.DataLength), true);
+ le32_to_cpu(sd_buf->ccontext.DataLength), true, false);
}
static void ksmbd_acls_fattr(struct smb_fattr *fattr,
@@ -2690,7 +2688,7 @@ int smb2_open(struct ksmbd_work *work)
*(char *)req->Buffer == '\\') {
pr_err("not allow directory name included leading slash\n");
rc = -EINVAL;
- goto err_out1;
+ goto err_out2;
}
name = smb2_get_name(req->Buffer,
@@ -2701,7 +2699,7 @@ int smb2_open(struct ksmbd_work *work)
if (rc != -ENOMEM)
rc = -ENOENT;
name = NULL;
- goto err_out1;
+ goto err_out2;
}
ksmbd_debug(SMB, "converted name = %s\n", name);
@@ -2709,48 +2707,44 @@ int smb2_open(struct ksmbd_work *work)
if (!test_share_config_flag(work->tcon->share_conf,
KSMBD_SHARE_FLAG_STREAMS)) {
rc = -EBADF;
- goto err_out1;
+ goto err_out2;
}
rc = parse_stream_name(name, &stream_name, &s_type);
if (rc < 0)
- goto err_out1;
+ goto err_out2;
}
rc = ksmbd_validate_filename(name);
if (rc < 0)
- goto err_out1;
+ goto err_out2;
if (ksmbd_share_veto_filename(share, name)) {
rc = -ENOENT;
ksmbd_debug(SMB, "Reject open(), vetoed file: %s\n",
name);
- goto err_out1;
+ goto err_out2;
}
} else {
name = kstrdup("", GFP_KERNEL);
if (!name) {
rc = -ENOMEM;
- goto err_out1;
+ goto err_out2;
}
}
- req_op_level = req->RequestedOplockLevel;
- if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE)
- lc = parse_lease_state(req);
-
if (le32_to_cpu(req->ImpersonationLevel) > le32_to_cpu(IL_DELEGATE)) {
pr_err("Invalid impersonationlevel : 0x%x\n",
le32_to_cpu(req->ImpersonationLevel));
rc = -EIO;
rsp->hdr.Status = STATUS_BAD_IMPERSONATION_LEVEL;
- goto err_out1;
+ goto err_out2;
}
if (req->CreateOptions && !(req->CreateOptions & CREATE_OPTIONS_MASK_LE)) {
pr_err("Invalid create options : 0x%x\n",
le32_to_cpu(req->CreateOptions));
rc = -EINVAL;
- goto err_out1;
+ goto err_out2;
} else {
if (req->CreateOptions & FILE_SEQUENTIAL_ONLY_LE &&
req->CreateOptions & FILE_RANDOM_ACCESS_LE)
@@ -2760,13 +2754,13 @@ int smb2_open(struct ksmbd_work *work)
(FILE_OPEN_BY_FILE_ID_LE | CREATE_TREE_CONNECTION |
FILE_RESERVE_OPFILTER_LE)) {
rc = -EOPNOTSUPP;
- goto err_out1;
+ goto err_out2;
}
if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) {
if (req->CreateOptions & FILE_NON_DIRECTORY_FILE_LE) {
rc = -EINVAL;
- goto err_out1;
+ goto err_out2;
} else if (req->CreateOptions & FILE_NO_COMPRESSION_LE) {
req->CreateOptions = ~(FILE_NO_COMPRESSION_LE);
}
@@ -2778,21 +2772,21 @@ int smb2_open(struct ksmbd_work *work)
pr_err("Invalid create disposition : 0x%x\n",
le32_to_cpu(req->CreateDisposition));
rc = -EINVAL;
- goto err_out1;
+ goto err_out2;
}
if (!(req->DesiredAccess & DESIRED_ACCESS_MASK)) {
pr_err("Invalid desired access : 0x%x\n",
le32_to_cpu(req->DesiredAccess));
rc = -EACCES;
- goto err_out1;
+ goto err_out2;
}
if (req->FileAttributes && !(req->FileAttributes & FILE_ATTRIBUTE_MASK_LE)) {
pr_err("Invalid file attribute : 0x%x\n",
le32_to_cpu(req->FileAttributes));
rc = -EINVAL;
- goto err_out1;
+ goto err_out2;
}
if (req->CreateContextsOffset) {
@@ -2800,19 +2794,19 @@ int smb2_open(struct ksmbd_work *work)
context = smb2_find_context_vals(req, SMB2_CREATE_EA_BUFFER, 4);
if (IS_ERR(context)) {
rc = PTR_ERR(context);
- goto err_out1;
+ goto err_out2;
} else if (context) {
ea_buf = (struct create_ea_buf_req *)context;
if (le16_to_cpu(context->DataOffset) +
le32_to_cpu(context->DataLength) <
sizeof(struct create_ea_buf_req)) {
rc = -EINVAL;
- goto err_out1;
+ goto err_out2;
}
if (req->CreateOptions & FILE_NO_EA_KNOWLEDGE_LE) {
rsp->hdr.Status = STATUS_ACCESS_DENIED;
rc = -EACCES;
- goto err_out1;
+ goto err_out2;
}
}
@@ -2820,7 +2814,7 @@ int smb2_open(struct ksmbd_work *work)
SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST, 4);
if (IS_ERR(context)) {
rc = PTR_ERR(context);
- goto err_out1;
+ goto err_out2;
} else if (context) {
ksmbd_debug(SMB,
"get query maximal access context\n");
@@ -2831,11 +2825,11 @@ int smb2_open(struct ksmbd_work *work)
SMB2_CREATE_TIMEWARP_REQUEST, 4);
if (IS_ERR(context)) {
rc = PTR_ERR(context);
- goto err_out1;
+ goto err_out2;
} else if (context) {
ksmbd_debug(SMB, "get timewarp context\n");
rc = -EBADF;
- goto err_out1;
+ goto err_out2;
}
if (tcon->posix_extensions) {
@@ -2843,7 +2837,7 @@ int smb2_open(struct ksmbd_work *work)
SMB2_CREATE_TAG_POSIX, 16);
if (IS_ERR(context)) {
rc = PTR_ERR(context);
- goto err_out1;
+ goto err_out2;
} else if (context) {
struct create_posix *posix =
(struct create_posix *)context;
@@ -2851,7 +2845,7 @@ int smb2_open(struct ksmbd_work *work)
le32_to_cpu(context->DataLength) <
sizeof(struct create_posix) - 4) {
rc = -EINVAL;
- goto err_out1;
+ goto err_out2;
}
ksmbd_debug(SMB, "get posix context\n");
@@ -2863,7 +2857,7 @@ int smb2_open(struct ksmbd_work *work)
if (ksmbd_override_fsids(work)) {
rc = -ENOMEM;
- goto err_out1;
+ goto err_out2;
}
rc = ksmbd_vfs_kern_path_locked(work, name, LOOKUP_NO_SYMLINKS,
@@ -3038,7 +3032,7 @@ int smb2_open(struct ksmbd_work *work)
}
}
- rc = ksmbd_query_inode_status(d_inode(path.dentry->d_parent));
+ rc = ksmbd_query_inode_status(path.dentry->d_parent);
if (rc == KSMBD_INODE_STATUS_PENDING_DELETE) {
rc = -EBUSY;
goto err_out;
@@ -3152,7 +3146,8 @@ int smb2_open(struct ksmbd_work *work)
idmap,
&path,
pntsd,
- pntsd_size);
+ pntsd_size,
+ false);
kfree(pntsd);
if (rc)
pr_err("failed to store ntacl in xattr : %d\n",
@@ -3175,11 +3170,6 @@ int smb2_open(struct ksmbd_work *work)
fp->attrib_only = !(req->DesiredAccess & ~(FILE_READ_ATTRIBUTES_LE |
FILE_WRITE_ATTRIBUTES_LE | FILE_SYNCHRONIZE_LE));
- if (!S_ISDIR(file_inode(filp)->i_mode) && open_flags & O_TRUNC &&
- !fp->attrib_only && !stream_name) {
- smb_break_all_oplock(work, fp);
- need_truncate = 1;
- }
/* fp should be searchable through ksmbd_inode.m_fp_list
* after daccess, saccess, attrib_only, and stream are
@@ -3195,23 +3185,43 @@ int smb2_open(struct ksmbd_work *work)
goto err_out;
}
+ if (file_present || created)
+ ksmbd_vfs_kern_path_unlock(&parent_path, &path);
+
+ if (!S_ISDIR(file_inode(filp)->i_mode) && open_flags & O_TRUNC &&
+ !fp->attrib_only && !stream_name) {
+ smb_break_all_oplock(work, fp);
+ need_truncate = 1;
+ }
+
+ req_op_level = req->RequestedOplockLevel;
+ if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE)
+ lc = parse_lease_state(req, S_ISDIR(file_inode(filp)->i_mode));
+
share_ret = ksmbd_smb_check_shared_mode(fp->filp, fp);
if (!test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_OPLOCKS) ||
(req_op_level == SMB2_OPLOCK_LEVEL_LEASE &&
!(conn->vals->capabilities & SMB2_GLOBAL_CAP_LEASING))) {
if (share_ret < 0 && !S_ISDIR(file_inode(fp->filp)->i_mode)) {
rc = share_ret;
- goto err_out;
+ goto err_out1;
}
} else {
if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE) {
+ /*
+ * Compare the parent lease using the parent key. If no lease
+ * with the same parent key exists, send a lease break
+ * notification.
+ */
+ smb_send_parent_lease_break_noti(fp, lc);
+
req_op_level = smb2_map_lease_to_oplock(lc->req_state);
ksmbd_debug(SMB,
"lease req for(%s) req oplock state 0x%x, lease state 0x%x\n",
name, req_op_level, lc->req_state);
rc = find_same_lease_key(sess, fp->f_ci, lc);
if (rc)
- goto err_out;
+ goto err_out1;
} else if (open_flags == O_RDONLY &&
(req_op_level == SMB2_OPLOCK_LEVEL_BATCH ||
req_op_level == SMB2_OPLOCK_LEVEL_EXCLUSIVE))
@@ -3222,16 +3232,16 @@ int smb2_open(struct ksmbd_work *work)
le32_to_cpu(req->hdr.Id.SyncId.TreeId),
lc, share_ret);
if (rc < 0)
- goto err_out;
+ goto err_out1;
}
if (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)
ksmbd_fd_set_delete_on_close(fp, file_info);
if (need_truncate) {
- rc = smb2_create_truncate(&path);
+ rc = smb2_create_truncate(&fp->filp->f_path);
if (rc)
- goto err_out;
+ goto err_out1;
}
if (req->CreateContextsOffset) {
@@ -3241,7 +3251,7 @@ int smb2_open(struct ksmbd_work *work)
SMB2_CREATE_ALLOCATION_SIZE, 4);
if (IS_ERR(az_req)) {
rc = PTR_ERR(az_req);
- goto err_out;
+ goto err_out1;
} else if (az_req) {
loff_t alloc_size;
int err;
@@ -3250,7 +3260,7 @@ int smb2_open(struct ksmbd_work *work)
le32_to_cpu(az_req->ccontext.DataLength) <
sizeof(struct create_alloc_size_req)) {
rc = -EINVAL;
- goto err_out;
+ goto err_out1;
}
alloc_size = le64_to_cpu(az_req->AllocationSize);
ksmbd_debug(SMB,
@@ -3268,7 +3278,7 @@ int smb2_open(struct ksmbd_work *work)
context = smb2_find_context_vals(req, SMB2_CREATE_QUERY_ON_DISK_ID, 4);
if (IS_ERR(context)) {
rc = PTR_ERR(context);
- goto err_out;
+ goto err_out1;
} else if (context) {
ksmbd_debug(SMB, "get query on disk id context\n");
query_disk_id = 1;
@@ -3277,7 +3287,7 @@ int smb2_open(struct ksmbd_work *work)
rc = ksmbd_vfs_getattr(&path, &stat);
if (rc)
- goto err_out;
+ goto err_out1;
if (stat.result_mask & STATX_BTIME)
fp->create_time = ksmbd_UnixTimeToNT(stat.btime);
@@ -3398,13 +3408,13 @@ int smb2_open(struct ksmbd_work *work)
}
err_out:
- if (file_present || created) {
- inode_unlock(d_inode(parent_path.dentry));
- path_put(&path);
- path_put(&parent_path);
- }
- ksmbd_revert_fsids(work);
+ if (rc && (file_present || created))
+ ksmbd_vfs_kern_path_unlock(&parent_path, &path);
+
err_out1:
+ ksmbd_revert_fsids(work);
+
+err_out2:
if (!rc) {
ksmbd_update_fstate(&work->sess->file_table, fp, FP_INITED);
rc = ksmbd_iov_pin_rsp(work, (void *)rsp, iov_len);
@@ -4834,9 +4844,9 @@ static void find_file_posix_info(struct smb2_query_info_rsp *rsp,
file_info = (struct smb311_posix_qinfo *)rsp->Buffer;
file_info->CreationTime = cpu_to_le64(fp->create_time);
- time = ksmbd_UnixTimeToNT(inode->i_atime);
+ time = ksmbd_UnixTimeToNT(inode_get_atime(inode));
file_info->LastAccessTime = cpu_to_le64(time);
- time = ksmbd_UnixTimeToNT(inode->i_mtime);
+ time = ksmbd_UnixTimeToNT(inode_get_mtime(inode));
file_info->LastWriteTime = cpu_to_le64(time);
time = ksmbd_UnixTimeToNT(inode_get_ctime(inode));
file_info->ChangeTime = cpu_to_le64(time);
@@ -5443,9 +5453,9 @@ int smb2_close(struct ksmbd_work *work)
rsp->EndOfFile = cpu_to_le64(inode->i_size);
rsp->Attributes = fp->f_ci->m_fattr;
rsp->CreationTime = cpu_to_le64(fp->create_time);
- time = ksmbd_UnixTimeToNT(inode->i_atime);
+ time = ksmbd_UnixTimeToNT(inode_get_atime(inode));
rsp->LastAccessTime = cpu_to_le64(time);
- time = ksmbd_UnixTimeToNT(inode->i_mtime);
+ time = ksmbd_UnixTimeToNT(inode_get_mtime(inode));
rsp->LastWriteTime = cpu_to_le64(time);
time = ksmbd_UnixTimeToNT(inode_get_ctime(inode));
rsp->ChangeTime = cpu_to_le64(time);
@@ -5537,7 +5547,7 @@ static int smb2_rename(struct ksmbd_work *work,
rc = ksmbd_vfs_setxattr(file_mnt_idmap(fp->filp),
&fp->filp->f_path,
xattr_stream_name,
- NULL, 0, 0);
+ NULL, 0, 0, true);
if (rc < 0) {
pr_err("failed to store stream name in xattr: %d\n",
rc);
@@ -5630,11 +5640,9 @@ static int smb2_create_link(struct ksmbd_work *work,
if (rc)
rc = -EINVAL;
out:
- if (file_present) {
- inode_unlock(d_inode(parent_path.dentry));
- path_put(&path);
- path_put(&parent_path);
- }
+ if (file_present)
+ ksmbd_vfs_kern_path_unlock(&parent_path, &path);
+
if (!IS_ERR(link_name))
kfree(link_name);
kfree(pathname);
@@ -5701,7 +5709,8 @@ static int set_file_basic_info(struct ksmbd_file *fp,
da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME |
XATTR_DOSINFO_ITIME;
- rc = ksmbd_vfs_set_dos_attrib_xattr(idmap, &filp->f_path, &da);
+ rc = ksmbd_vfs_set_dos_attrib_xattr(idmap, &filp->f_path, &da,
+ true);
if (rc)
ksmbd_debug(SMB,
"failed to restore file attribute in EA\n");
@@ -6013,7 +6022,7 @@ static int smb2_set_info_sec(struct ksmbd_file *fp, int addition_info,
fp->saccess |= FILE_SHARE_DELETE_LE;
return set_info_sec(fp->conn, fp->tcon, &fp->filp->f_path, pntsd,
- buf_len, false);
+ buf_len, false, true);
}
/**
@@ -7078,6 +7087,7 @@ skip:
smb2_remove_blocked_lock,
argv);
if (rc) {
+ kfree(argv);
err = -ENOMEM;
goto out;
}
@@ -7582,7 +7592,8 @@ static inline int fsctl_set_sparse(struct ksmbd_work *work, u64 id,
da.attr = le32_to_cpu(fp->f_ci->m_fattr);
ret = ksmbd_vfs_set_dos_attrib_xattr(idmap,
- &fp->filp->f_path, &da);
+ &fp->filp->f_path,
+ &da, true);
if (ret)
fp->f_ci->m_fattr = old_fattr;
}
@@ -8208,6 +8219,11 @@ static void smb21_lease_break_ack(struct ksmbd_work *work)
le32_to_cpu(req->LeaseState));
}
+ if (ret < 0) {
+ rsp->hdr.Status = err;
+ goto err_out;
+ }
+
lease_state = lease->state;
opinfo->op_state = OPLOCK_STATE_NONE;
wake_up_interruptible_all(&opinfo->oplock_q);
@@ -8215,11 +8231,6 @@ static void smb21_lease_break_ack(struct ksmbd_work *work)
wake_up_interruptible_all(&opinfo->oplock_brk);
opinfo_put(opinfo);
- if (ret < 0) {
- rsp->hdr.Status = err;
- goto err_out;
- }
-
rsp->StructureSize = cpu_to_le16(36);
rsp->Reserved = 0;
rsp->Flags = 0;
@@ -8231,7 +8242,6 @@ static void smb21_lease_break_ack(struct ksmbd_work *work)
return;
err_out:
- opinfo->op_state = OPLOCK_STATE_NONE;
wake_up_interruptible_all(&opinfo->oplock_q);
atomic_dec(&opinfo->breaking_cnt);
wake_up_interruptible_all(&opinfo->oplock_brk);
diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c
index e6ba1e9b8589..6691ae68af0c 100644
--- a/fs/smb/server/smb_common.c
+++ b/fs/smb/server/smb_common.c
@@ -366,11 +366,22 @@ static int smb1_allocate_rsp_buf(struct ksmbd_work *work)
return 0;
}
+/**
+ * set_smb1_rsp_status() - set error type in smb response header
+ * @work: smb work containing smb response header
+ * @err: error code to set in response
+ */
+static void set_smb1_rsp_status(struct ksmbd_work *work, __le32 err)
+{
+ work->send_no_response = 1;
+}
+
static struct smb_version_ops smb1_server_ops = {
.get_cmd_val = get_smb1_cmd_val,
.init_rsp_hdr = init_smb1_rsp_hdr,
.allocate_rsp_buf = smb1_allocate_rsp_buf,
.check_user_session = smb1_check_user_session,
+ .set_rsp_status = set_smb1_rsp_status,
};
static int smb1_negotiate(struct ksmbd_work *work)
diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c
index 6c0305be895e..1164365533f0 100644
--- a/fs/smb/server/smbacl.c
+++ b/fs/smb/server/smbacl.c
@@ -1107,6 +1107,7 @@ pass:
struct smb_acl *pdacl;
struct smb_sid *powner_sid = NULL, *pgroup_sid = NULL;
int powner_sid_size = 0, pgroup_sid_size = 0, pntsd_size;
+ int pntsd_alloc_size;
if (parent_pntsd->osidoffset) {
powner_sid = (struct smb_sid *)((char *)parent_pntsd +
@@ -1119,9 +1120,10 @@ pass:
pgroup_sid_size = 1 + 1 + 6 + (pgroup_sid->num_subauth * 4);
}
- pntsd = kzalloc(sizeof(struct smb_ntsd) + powner_sid_size +
- pgroup_sid_size + sizeof(struct smb_acl) +
- nt_size, GFP_KERNEL);
+ pntsd_alloc_size = sizeof(struct smb_ntsd) + powner_sid_size +
+ pgroup_sid_size + sizeof(struct smb_acl) + nt_size;
+
+ pntsd = kzalloc(pntsd_alloc_size, GFP_KERNEL);
if (!pntsd) {
rc = -ENOMEM;
goto free_aces_base;
@@ -1136,6 +1138,27 @@ pass:
pntsd->gsidoffset = parent_pntsd->gsidoffset;
pntsd->dacloffset = parent_pntsd->dacloffset;
+ if ((u64)le32_to_cpu(pntsd->osidoffset) + powner_sid_size >
+ pntsd_alloc_size) {
+ rc = -EINVAL;
+ kfree(pntsd);
+ goto free_aces_base;
+ }
+
+ if ((u64)le32_to_cpu(pntsd->gsidoffset) + pgroup_sid_size >
+ pntsd_alloc_size) {
+ rc = -EINVAL;
+ kfree(pntsd);
+ goto free_aces_base;
+ }
+
+ if ((u64)le32_to_cpu(pntsd->dacloffset) + sizeof(struct smb_acl) + nt_size >
+ pntsd_alloc_size) {
+ rc = -EINVAL;
+ kfree(pntsd);
+ goto free_aces_base;
+ }
+
if (pntsd->osidoffset) {
struct smb_sid *owner_sid = (struct smb_sid *)((char *)pntsd +
le32_to_cpu(pntsd->osidoffset));
@@ -1162,7 +1185,7 @@ pass:
pntsd_size += sizeof(struct smb_acl) + nt_size;
}
- ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, pntsd_size);
+ ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, pntsd_size, false);
kfree(pntsd);
}
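
The three checks added above verify that the offsets inherited from the parent descriptor (osidoffset, gsidoffset, dacloffset) still land inside the freshly sized pntsd allocation before the SIDs and DACL are written through them. Since the same test repeats three times, a small helper could express it once; the sketch below is only an illustration of the bounds test with hypothetical names, not code from this patch.

#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>

/* Return true if [off, off + size) lies inside an allocation of
 * alloc_size bytes. Widening to 64 bits keeps a large 32-bit offset
 * from wrapping the addition. */
bool region_fits(uint32_t off, size_t size, size_t alloc_size)
{
	return (uint64_t)off + size <= alloc_size;
}

/* Hypothetical usage, mirroring the three checks in the hunk above:
 *
 *	if (!region_fits(osidoffset, owner_sid_size, pntsd_alloc_size) ||
 *	    !region_fits(gsidoffset, group_sid_size, pntsd_alloc_size) ||
 *	    !region_fits(dacloffset, acl_hdr_size + nt_size, pntsd_alloc_size))
 *		return -EINVAL;
 */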
@@ -1354,7 +1377,7 @@ err_out:
int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
const struct path *path, struct smb_ntsd *pntsd, int ntsd_len,
- bool type_check)
+ bool type_check, bool get_write)
{
int rc;
struct smb_fattr fattr = {{0}};
@@ -1414,7 +1437,8 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
if (test_share_config_flag(tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) {
/* Update WinACL in xattr */
ksmbd_vfs_remove_sd_xattrs(idmap, path);
- ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, ntsd_len);
+ ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, ntsd_len,
+ get_write);
}
out:
diff --git a/fs/smb/server/smbacl.h b/fs/smb/server/smbacl.h
index 49a8c292bd2e..2b52861707d8 100644
--- a/fs/smb/server/smbacl.h
+++ b/fs/smb/server/smbacl.h
@@ -207,7 +207,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
__le32 *pdaccess, int uid);
int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
const struct path *path, struct smb_ntsd *pntsd, int ntsd_len,
- bool type_check);
+ bool type_check, bool get_write);
void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid);
void ksmbd_init_domain(u32 *sub_auth);
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index 3b269e1f523a..c5629a68c8b7 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -2140,8 +2140,7 @@ static int smb_direct_ib_client_add(struct ib_device *ib_dev)
if (ib_dev->node_type != RDMA_NODE_IB_CA)
smb_direct_port = SMB_DIRECT_PORT_IWARP;
- if (!ib_dev->ops.get_netdev ||
- !rdma_frwr_is_supported(&ib_dev->attrs))
+ if (!rdma_frwr_is_supported(&ib_dev->attrs))
return 0;
smb_dev = kzalloc(sizeof(*smb_dev), GFP_KERNEL);
@@ -2241,17 +2240,38 @@ bool ksmbd_rdma_capable_netdev(struct net_device *netdev)
for (i = 0; i < smb_dev->ib_dev->phys_port_cnt; i++) {
struct net_device *ndev;
- ndev = smb_dev->ib_dev->ops.get_netdev(smb_dev->ib_dev,
- i + 1);
- if (!ndev)
- continue;
+ if (smb_dev->ib_dev->ops.get_netdev) {
+ ndev = smb_dev->ib_dev->ops.get_netdev(
+ smb_dev->ib_dev, i + 1);
+ if (!ndev)
+ continue;
- if (ndev == netdev) {
+ if (ndev == netdev) {
+ dev_put(ndev);
+ rdma_capable = true;
+ goto out;
+ }
dev_put(ndev);
- rdma_capable = true;
- goto out;
+ /* if ib_dev does not implement ops.get_netdev,
+ * check for a matching InfiniBand GUID in hw_addr
+ */
+ } else if (netdev->type == ARPHRD_INFINIBAND) {
+ struct netdev_hw_addr *ha;
+ union ib_gid gid;
+ u32 port_num;
+ int ret;
+
+ netdev_hw_addr_list_for_each(
+ ha, &netdev->dev_addrs) {
+ memcpy(&gid, ha->addr + 4, sizeof(gid));
+ ret = ib_find_gid(smb_dev->ib_dev, &gid,
+ &port_num, NULL);
+ if (!ret) {
+ rdma_capable = true;
+ goto out;
+ }
+ }
}
- dev_put(ndev);
}
}
out:
diff --git a/fs/smb/server/unicode.c b/fs/smb/server/unicode.c
index 393dd4a7432b..43ed29ee44ea 100644
--- a/fs/smb/server/unicode.c
+++ b/fs/smb/server/unicode.c
@@ -14,45 +14,9 @@
#include "smb_common.h"
/*
- * smb_utf16_bytes() - how long will a string be after conversion?
- * @from: pointer to input string
- * @maxbytes: don't go past this many bytes of input string
- * @codepage: destination codepage
- *
- * Walk a utf16le string and return the number of bytes that the string will
- * be after being converted to the given charset, not including any null
- * termination required. Don't walk past maxbytes in the source buffer.
- *
- * Return: string length after conversion
- */
-static int smb_utf16_bytes(const __le16 *from, int maxbytes,
- const struct nls_table *codepage)
-{
- int i;
- int charlen, outlen = 0;
- int maxwords = maxbytes / 2;
- char tmp[NLS_MAX_CHARSET_SIZE];
- __u16 ftmp;
-
- for (i = 0; i < maxwords; i++) {
- ftmp = get_unaligned_le16(&from[i]);
- if (ftmp == 0)
- break;
-
- charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE);
- if (charlen > 0)
- outlen += charlen;
- else
- outlen++;
- }
-
- return outlen;
-}
-
-/*
* cifs_mapchar() - convert a host-endian char to proper char in codepage
* @target: where converted character should be copied
- * @src_char: 2 byte host-endian source character
+ * @from: host-endian source string
* @cp: codepage to which character should be converted
* @mapchar: should character be mapped according to mapchars mount option?
*
@@ -63,10 +27,13 @@ static int smb_utf16_bytes(const __le16 *from, int maxbytes,
* Return: string length after conversion
*/
static int
-cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
+cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp,
bool mapchar)
{
int len = 1;
+ __u16 src_char;
+
+ src_char = *from;
if (!mapchar)
goto cp_convert;
@@ -104,12 +71,66 @@ out:
cp_convert:
len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
- if (len <= 0) {
- *target = '?';
- len = 1;
- }
+ if (len <= 0)
+ goto surrogate_pair;
goto out;
+
+surrogate_pair:
+ /* convert SURROGATE_PAIR and IVS */
+ if (strcmp(cp->charset, "utf8"))
+ goto unknown;
+ len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
+ if (len <= 0)
+ goto unknown;
+ return len;
+
+unknown:
+ *target = '?';
+ len = 1;
+ goto out;
+}
+
+/*
+ * smb_utf16_bytes() - compute converted string length
+ * @from: pointer to input string
+ * @maxbytes: input string length
+ * @codepage: destination codepage
+ *
+ * Walk a utf16le string and return the number of bytes that the string will
+ * be after being converted to the given charset, not including any null
+ * termination required. Don't walk past maxbytes in the source buffer.
+ *
+ * Return: string length after conversion
+ */
+static int smb_utf16_bytes(const __le16 *from, int maxbytes,
+ const struct nls_table *codepage)
+{
+ int i, j;
+ int charlen, outlen = 0;
+ int maxwords = maxbytes / 2;
+ char tmp[NLS_MAX_CHARSET_SIZE];
+ __u16 ftmp[3];
+
+ for (i = 0; i < maxwords; i++) {
+ ftmp[0] = get_unaligned_le16(&from[i]);
+ if (ftmp[0] == 0)
+ break;
+ for (j = 1; j <= 2; j++) {
+ if (i + j < maxwords)
+ ftmp[j] = get_unaligned_le16(&from[i + j]);
+ else
+ ftmp[j] = 0;
+ }
+
+ charlen = cifs_mapchar(tmp, ftmp, codepage, 0);
+ if (charlen > 0)
+ outlen += charlen;
+ else
+ outlen++;
+ }
+
+ return outlen;
}
/*
@@ -139,12 +160,12 @@ cp_convert:
static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
const struct nls_table *codepage, bool mapchar)
{
- int i, charlen, safelen;
+ int i, j, charlen, safelen;
int outlen = 0;
int nullsize = nls_nullsize(codepage);
int fromwords = fromlen / 2;
char tmp[NLS_MAX_CHARSET_SIZE];
- __u16 ftmp;
+ __u16 ftmp[3]; /* 3 array elements x 2 bytes = up to 6 bytes of UTF-16 */
/*
* because the chars can be of varying widths, we need to take care
@@ -155,9 +176,15 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
for (i = 0; i < fromwords; i++) {
- ftmp = get_unaligned_le16(&from[i]);
- if (ftmp == 0)
+ ftmp[0] = get_unaligned_le16(&from[i]);
+ if (ftmp[0] == 0)
break;
+ for (j = 1; j <= 2; j++) {
+ if (i + j < fromwords)
+ ftmp[j] = get_unaligned_le16(&from[i + j]);
+ else
+ ftmp[j] = 0;
+ }
/*
* check to see if converting this character might make the
@@ -172,6 +199,19 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
/* put converted char into 'to' buffer */
charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar);
outlen += charlen;
+
+ /*
+ * charlen is the number of UTF-8 bytes for one character:
+ * a surrogate pair (a 4-byte UTF-16 code) converts to 4 bytes
+ * of UTF-8 (charlen=4), and an IVS (7-8 bytes of UTF-8,
+ * charlen=3+4 or 4+4) is split across two UTF-16 pairs.
+ */
+ if (charlen == 4)
+ i++;
+ else if (charlen >= 5)
+ /* 5-6bytes UTF-8 */
+ i += 2;
}
/* properly null-terminate string */
@@ -306,6 +346,9 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
char src_char;
__le16 dst_char;
wchar_t tmp;
+ wchar_t wchar_to[6]; /* UTF-16 */
+ int ret;
+ unicode_t u;
if (!mapchars)
return smb_strtoUTF16(target, source, srclen, cp);
@@ -348,11 +391,57 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
* if no match, use question mark, which at least in
* some cases serves as wild card
*/
- if (charlen < 1) {
- dst_char = cpu_to_le16(0x003f);
- charlen = 1;
+ if (charlen > 0)
+ goto ctoUTF16;
+
+ /* convert SURROGATE_PAIR */
+ if (strcmp(cp->charset, "utf8"))
+ goto unknown;
+ if (*(source + i) & 0x80) {
+ charlen = utf8_to_utf32(source + i, 6, &u);
+ if (charlen < 0)
+ goto unknown;
+ } else
+ goto unknown;
+ ret = utf8s_to_utf16s(source + i, charlen,
+ UTF16_LITTLE_ENDIAN,
+ wchar_to, 6);
+ if (ret < 0)
+ goto unknown;
+
+ i += charlen;
+ dst_char = cpu_to_le16(*wchar_to);
+ if (charlen <= 3)
+ /* 1-3bytes UTF-8 to 2bytes UTF-16 */
+ put_unaligned(dst_char, &target[j]);
+ else if (charlen == 4) {
+ /*
+ * 4 bytes of UTF-8 (a surrogate pair) become 4 bytes of UTF-16;
+ * 7-8 bytes of UTF-8 (an IVS, charlen=3+4 or 4+4) are split
+ * across two UTF-16 pairs
+ */
+ put_unaligned(dst_char, &target[j]);
+ dst_char = cpu_to_le16(*(wchar_to + 1));
+ j++;
+ put_unaligned(dst_char, &target[j]);
+ } else if (charlen >= 5) {
+ /* 5-6bytes UTF-8 to 6bytes UTF-16 */
+ put_unaligned(dst_char, &target[j]);
+ dst_char = cpu_to_le16(*(wchar_to + 1));
+ j++;
+ put_unaligned(dst_char, &target[j]);
+ dst_char = cpu_to_le16(*(wchar_to + 2));
+ j++;
+ put_unaligned(dst_char, &target[j]);
}
+ continue;
+
+unknown:
+ dst_char = cpu_to_le16(0x003f);
+ charlen = 1;
}
+
+ctoUTF16:
/*
* character may take more than one byte in the source string,
* but will take exactly two bytes in the target string
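
The unicode.c rework above makes both conversion directions aware of UTF-16 surrogate pairs (and the longer IVS sequences) instead of collapsing every unmappable character to '?'. The short userspace program below encodes one code point above U+FFFF into its surrogate pair by hand, which is the pairing that the kernel helpers utf8s_to_utf16s()/utf16s_to_utf8s() used by the patch handle internally.

#include <stdint.h>
#include <stdio.h>

/* Encode one Unicode code point as UTF-16 code units.
 * Returns the number of 16-bit units written (1 or 2). */
int encode_utf16(uint32_t cp, uint16_t out[2])
{
	if (cp < 0x10000) {
		out[0] = (uint16_t)cp;
		return 1;
	}
	cp -= 0x10000;			/* 20 bits remain */
	out[0] = 0xD800 | (cp >> 10);	/* high surrogate */
	out[1] = 0xDC00 | (cp & 0x3FF);	/* low surrogate */
	return 2;
}

int main(void)
{
	uint16_t units[2];
	int n = encode_utf16(0x1F600, units);	/* emoji, needs a pair */

	printf("%d unit(s): 0x%04X 0x%04X\n", n, units[0],
	       n == 2 ? units[1] : 0);
	return 0;
}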
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index b5a5e50fc9ca..4277750a6da1 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -97,6 +97,13 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf,
return -ENOENT;
}
+ err = mnt_want_write(parent_path->mnt);
+ if (err) {
+ path_put(parent_path);
+ putname(filename);
+ return -ENOENT;
+ }
+
inode_lock_nested(parent_path->dentry->d_inode, I_MUTEX_PARENT);
d = lookup_one_qstr_excl(&last, parent_path->dentry, 0);
if (IS_ERR(d))
@@ -123,6 +130,7 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf,
err_out:
inode_unlock(d_inode(parent_path->dentry));
+ mnt_drop_write(parent_path->mnt);
path_put(parent_path);
putname(filename);
return -ENOENT;
@@ -173,10 +181,6 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode)
return err;
}
- err = mnt_want_write(path.mnt);
- if (err)
- goto out_err;
-
mode |= S_IFREG;
err = vfs_create(mnt_idmap(path.mnt), d_inode(path.dentry),
dentry, mode, true);
@@ -186,9 +190,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode)
} else {
pr_err("File(%s): creation failed (err:%d)\n", name, err);
}
- mnt_drop_write(path.mnt);
-out_err:
done_path_create(&path, dentry);
return err;
}
@@ -219,10 +221,6 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
return err;
}
- err = mnt_want_write(path.mnt);
- if (err)
- goto out_err2;
-
idmap = mnt_idmap(path.mnt);
mode |= S_IFDIR;
err = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode);
@@ -233,21 +231,19 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
dentry->d_name.len);
if (IS_ERR(d)) {
err = PTR_ERR(d);
- goto out_err1;
+ goto out_err;
}
if (unlikely(d_is_negative(d))) {
dput(d);
err = -ENOENT;
- goto out_err1;
+ goto out_err;
}
ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(d));
dput(d);
}
-out_err1:
- mnt_drop_write(path.mnt);
-out_err2:
+out_err:
done_path_create(&path, dentry);
if (err)
pr_err("mkdir(%s): creation failed (err:%d)\n", name, err);
@@ -463,7 +459,8 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos,
fp->stream.name,
(void *)stream_buf,
size,
- 0);
+ 0,
+ true);
if (err < 0)
goto out;
@@ -520,6 +517,9 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
}
}
+ /* Reserve lease break for parent dir at closing time */
+ fp->reserve_lease_break = true;
+
/* Do we need to break any of a levelII oplock? */
smb_break_all_levII_oplock(work, fp, 1);
@@ -605,10 +605,6 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path)
goto out_err;
}
- err = mnt_want_write(path->mnt);
- if (err)
- goto out_err;
-
idmap = mnt_idmap(path->mnt);
if (S_ISDIR(d_inode(path->dentry)->i_mode)) {
err = vfs_rmdir(idmap, d_inode(parent), path->dentry);
@@ -619,7 +615,6 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path)
if (err)
ksmbd_debug(VFS, "unlink failed, err %d\n", err);
}
- mnt_drop_write(path->mnt);
out_err:
ksmbd_revert_fsids(work);
@@ -665,16 +660,11 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname,
goto out3;
}
- err = mnt_want_write(newpath.mnt);
- if (err)
- goto out3;
-
err = vfs_link(oldpath.dentry, mnt_idmap(newpath.mnt),
d_inode(newpath.dentry),
dentry, NULL);
if (err)
ksmbd_debug(VFS, "vfs_link failed err %d\n", err);
- mnt_drop_write(newpath.mnt);
out3:
done_path_create(&newpath, dentry);
@@ -732,7 +722,7 @@ retry:
goto out3;
}
- parent_fp = ksmbd_lookup_fd_inode(d_inode(old_child->d_parent));
+ parent_fp = ksmbd_lookup_fd_inode(old_child->d_parent);
if (parent_fp) {
if (parent_fp->daccess & FILE_DELETE_LE) {
pr_err("parent dir is opened with delete access\n");
@@ -919,23 +909,27 @@ ssize_t ksmbd_vfs_getxattr(struct mnt_idmap *idmap,
/**
* ksmbd_vfs_setxattr() - vfs helper for smb set extended attributes value
* @idmap: idmap of the relevant mount
- * @dentry: dentry to set XATTR at
+ * @path: path of dentry to set XATTR at
* @attr_name: xattr name for setxattr
* @attr_value: xattr value to set
* @attr_size: size of xattr value
* @flags: flags passed on to vfs_setxattr()
+ * @get_write: whether to take write access to the mount
*
* Return: 0 on success, otherwise error
*/
int ksmbd_vfs_setxattr(struct mnt_idmap *idmap,
const struct path *path, const char *attr_name,
- void *attr_value, size_t attr_size, int flags)
+ void *attr_value, size_t attr_size, int flags,
+ bool get_write)
{
int err;
- err = mnt_want_write(path->mnt);
- if (err)
- return err;
+ if (get_write == true) {
+ err = mnt_want_write(path->mnt);
+ if (err)
+ return err;
+ }
err = vfs_setxattr(idmap,
path->dentry,
@@ -945,7 +939,8 @@ int ksmbd_vfs_setxattr(struct mnt_idmap *idmap,
flags);
if (err)
ksmbd_debug(VFS, "setxattr failed, err %d\n", err);
- mnt_drop_write(path->mnt);
+ if (get_write == true)
+ mnt_drop_write(path->mnt);
return err;
}
@@ -1194,9 +1189,10 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name,
/**
* ksmbd_vfs_kern_path_locked() - lookup a file and get path info
- * @name: file path that is relative to share
- * @flags: lookup flags
- * @path: if lookup succeed, return path info
+ * @name: file path that is relative to share
+ * @flags: lookup flags
+ * @parent_path: if lookup succeed, return parent_path info
+ * @path: if lookup succeed, return path info
* @caseless: caseless filename lookup
*
* Return: 0 on success, otherwise error
@@ -1268,6 +1264,13 @@ out1:
}
if (!err) {
+ err = mnt_want_write(parent_path->mnt);
+ if (err) {
+ path_put(path);
+ path_put(parent_path);
+ return err;
+ }
+
err = ksmbd_vfs_lock_parent(parent_path->dentry, path->dentry);
if (err) {
path_put(path);
@@ -1277,6 +1280,14 @@ out1:
return err;
}
+void ksmbd_vfs_kern_path_unlock(struct path *parent_path, struct path *path)
+{
+ inode_unlock(d_inode(parent_path->dentry));
+ mnt_drop_write(parent_path->mnt);
+ path_put(path);
+ path_put(parent_path);
+}
+
struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work,
const char *name,
unsigned int flags,
@@ -1431,7 +1442,8 @@ out:
int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn,
struct mnt_idmap *idmap,
const struct path *path,
- struct smb_ntsd *pntsd, int len)
+ struct smb_ntsd *pntsd, int len,
+ bool get_write)
{
int rc;
struct ndr sd_ndr = {0}, acl_ndr = {0};
@@ -1491,7 +1503,7 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn,
rc = ksmbd_vfs_setxattr(idmap, path,
XATTR_NAME_SD, sd_ndr.data,
- sd_ndr.offset, 0);
+ sd_ndr.offset, 0, get_write);
if (rc < 0)
pr_err("Failed to store XATTR ntacl :%d\n", rc);
@@ -1580,7 +1592,8 @@ free_n_data:
int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap,
const struct path *path,
- struct xattr_dos_attrib *da)
+ struct xattr_dos_attrib *da,
+ bool get_write)
{
struct ndr n;
int err;
@@ -1590,7 +1603,7 @@ int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap,
return err;
err = ksmbd_vfs_setxattr(idmap, path, XATTR_NAME_DOS_ATTRIBUTE,
- (void *)n.data, n.offset, 0);
+ (void *)n.data, n.offset, 0, get_write);
if (err)
ksmbd_debug(SMB, "failed to store dos attribute in xattr\n");
kfree(n.data);
@@ -1862,10 +1875,6 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap,
}
posix_state_to_acl(&acl_state, acls->a_entries);
- rc = mnt_want_write(path->mnt);
- if (rc)
- goto out_err;
-
rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls);
if (rc < 0)
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n",
@@ -1877,9 +1886,7 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap,
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n",
rc);
}
- mnt_drop_write(path->mnt);
-out_err:
free_acl_state(&acl_state);
posix_acl_release(acls);
return rc;
@@ -1909,10 +1916,6 @@ int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap,
}
}
- rc = mnt_want_write(path->mnt);
- if (rc)
- goto out_err;
-
rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls);
if (rc < 0)
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n",
@@ -1924,9 +1927,7 @@ int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap,
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n",
rc);
}
- mnt_drop_write(path->mnt);
-out_err:
posix_acl_release(acls);
return rc;
}
diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h
index 00968081856e..cfe1c8092f23 100644
--- a/fs/smb/server/vfs.h
+++ b/fs/smb/server/vfs.h
@@ -109,7 +109,8 @@ ssize_t ksmbd_vfs_casexattr_len(struct mnt_idmap *idmap,
int attr_name_len);
int ksmbd_vfs_setxattr(struct mnt_idmap *idmap,
const struct path *path, const char *attr_name,
- void *attr_value, size_t attr_size, int flags);
+ void *attr_value, size_t attr_size, int flags,
+ bool get_write);
int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name,
size_t *xattr_stream_name_size, int s_type);
int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
@@ -117,6 +118,7 @@ int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
unsigned int flags, struct path *parent_path,
struct path *path, bool caseless);
+void ksmbd_vfs_kern_path_unlock(struct path *parent_path, struct path *path);
struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work,
const char *name,
unsigned int flags,
@@ -144,14 +146,16 @@ int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, const struct path *path)
int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn,
struct mnt_idmap *idmap,
const struct path *path,
- struct smb_ntsd *pntsd, int len);
+ struct smb_ntsd *pntsd, int len,
+ bool get_write);
int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn,
struct mnt_idmap *idmap,
struct dentry *dentry,
struct smb_ntsd **pntsd);
int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap,
const struct path *path,
- struct xattr_dos_attrib *da);
+ struct xattr_dos_attrib *da,
+ bool get_write);
int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap,
struct dentry *dentry,
struct xattr_dos_attrib *da);
diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c
index c91eac6514dd..4e82ff627d12 100644
--- a/fs/smb/server/vfs_cache.c
+++ b/fs/smb/server/vfs_cache.c
@@ -66,14 +66,14 @@ static unsigned long inode_hash(struct super_block *sb, unsigned long hashval)
return tmp & inode_hash_mask;
}
-static struct ksmbd_inode *__ksmbd_inode_lookup(struct inode *inode)
+static struct ksmbd_inode *__ksmbd_inode_lookup(struct dentry *de)
{
struct hlist_head *head = inode_hashtable +
- inode_hash(inode->i_sb, inode->i_ino);
+ inode_hash(d_inode(de)->i_sb, (unsigned long)de);
struct ksmbd_inode *ci = NULL, *ret_ci = NULL;
hlist_for_each_entry(ci, head, m_hash) {
- if (ci->m_inode == inode) {
+ if (ci->m_de == de) {
if (atomic_inc_not_zero(&ci->m_count))
ret_ci = ci;
break;
@@ -84,26 +84,27 @@ static struct ksmbd_inode *__ksmbd_inode_lookup(struct inode *inode)
static struct ksmbd_inode *ksmbd_inode_lookup(struct ksmbd_file *fp)
{
- return __ksmbd_inode_lookup(file_inode(fp->filp));
+ return __ksmbd_inode_lookup(fp->filp->f_path.dentry);
}
-static struct ksmbd_inode *ksmbd_inode_lookup_by_vfsinode(struct inode *inode)
+struct ksmbd_inode *ksmbd_inode_lookup_lock(struct dentry *d)
{
struct ksmbd_inode *ci;
read_lock(&inode_hash_lock);
- ci = __ksmbd_inode_lookup(inode);
+ ci = __ksmbd_inode_lookup(d);
read_unlock(&inode_hash_lock);
+
return ci;
}
-int ksmbd_query_inode_status(struct inode *inode)
+int ksmbd_query_inode_status(struct dentry *dentry)
{
struct ksmbd_inode *ci;
int ret = KSMBD_INODE_STATUS_UNKNOWN;
read_lock(&inode_hash_lock);
- ci = __ksmbd_inode_lookup(inode);
+ ci = __ksmbd_inode_lookup(dentry);
if (ci) {
ret = KSMBD_INODE_STATUS_OK;
if (ci->m_flags & (S_DEL_PENDING | S_DEL_ON_CLS))
@@ -143,7 +144,7 @@ void ksmbd_fd_set_delete_on_close(struct ksmbd_file *fp,
static void ksmbd_inode_hash(struct ksmbd_inode *ci)
{
struct hlist_head *b = inode_hashtable +
- inode_hash(ci->m_inode->i_sb, ci->m_inode->i_ino);
+ inode_hash(d_inode(ci->m_de)->i_sb, (unsigned long)ci->m_de);
hlist_add_head(&ci->m_hash, b);
}
@@ -157,7 +158,6 @@ static void ksmbd_inode_unhash(struct ksmbd_inode *ci)
static int ksmbd_inode_init(struct ksmbd_inode *ci, struct ksmbd_file *fp)
{
- ci->m_inode = file_inode(fp->filp);
atomic_set(&ci->m_count, 1);
atomic_set(&ci->op_count, 0);
atomic_set(&ci->sop_count, 0);
@@ -166,6 +166,7 @@ static int ksmbd_inode_init(struct ksmbd_inode *ci, struct ksmbd_file *fp)
INIT_LIST_HEAD(&ci->m_fp_list);
INIT_LIST_HEAD(&ci->m_op_list);
rwlock_init(&ci->m_lock);
+ ci->m_de = fp->filp->f_path.dentry;
return 0;
}
@@ -209,7 +210,7 @@ static void ksmbd_inode_free(struct ksmbd_inode *ci)
kfree(ci);
}
-static void ksmbd_inode_put(struct ksmbd_inode *ci)
+void ksmbd_inode_put(struct ksmbd_inode *ci)
{
if (atomic_dec_and_test(&ci->m_count))
ksmbd_inode_free(ci);
@@ -488,12 +489,15 @@ struct ksmbd_file *ksmbd_lookup_fd_cguid(char *cguid)
return fp;
}
-struct ksmbd_file *ksmbd_lookup_fd_inode(struct inode *inode)
+struct ksmbd_file *ksmbd_lookup_fd_inode(struct dentry *dentry)
{
struct ksmbd_file *lfp;
struct ksmbd_inode *ci;
+ struct inode *inode = d_inode(dentry);
- ci = ksmbd_inode_lookup_by_vfsinode(inode);
+ read_lock(&inode_hash_lock);
+ ci = __ksmbd_inode_lookup(dentry);
+ read_unlock(&inode_hash_lock);
if (!ci)
return NULL;
diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h
index 03d0bf941216..a528f0cc775a 100644
--- a/fs/smb/server/vfs_cache.h
+++ b/fs/smb/server/vfs_cache.h
@@ -51,7 +51,7 @@ struct ksmbd_inode {
atomic_t op_count;
/* opinfo count for streams */
atomic_t sop_count;
- struct inode *m_inode;
+ struct dentry *m_de;
unsigned int m_flags;
struct hlist_node m_hash;
struct list_head m_fp_list;
@@ -105,6 +105,7 @@ struct ksmbd_file {
struct ksmbd_readdir_data readdir_data;
int dot_dotdot[2];
unsigned int f_state;
+ bool reserve_lease_break;
};
static inline void set_ctx_actor(struct dir_context *ctx,
@@ -138,9 +139,11 @@ struct ksmbd_file *ksmbd_lookup_foreign_fd(struct ksmbd_work *work, u64 id);
struct ksmbd_file *ksmbd_lookup_fd_slow(struct ksmbd_work *work, u64 id,
u64 pid);
void ksmbd_fd_put(struct ksmbd_work *work, struct ksmbd_file *fp);
+struct ksmbd_inode *ksmbd_inode_lookup_lock(struct dentry *d);
+void ksmbd_inode_put(struct ksmbd_inode *ci);
struct ksmbd_file *ksmbd_lookup_durable_fd(unsigned long long id);
struct ksmbd_file *ksmbd_lookup_fd_cguid(char *cguid);
-struct ksmbd_file *ksmbd_lookup_fd_inode(struct inode *inode);
+struct ksmbd_file *ksmbd_lookup_fd_inode(struct dentry *dentry);
unsigned int ksmbd_open_durable_fd(struct ksmbd_file *fp);
struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp);
void ksmbd_close_tree_conn_fds(struct ksmbd_work *work);
@@ -164,7 +167,7 @@ enum KSMBD_INODE_STATUS {
KSMBD_INODE_STATUS_PENDING_DELETE,
};
-int ksmbd_query_inode_status(struct inode *inode);
+int ksmbd_query_inode_status(struct dentry *dentry);
bool ksmbd_inode_pending_delete(struct ksmbd_file *fp);
void ksmbd_set_inode_pending_delete(struct ksmbd_file *fp);
void ksmbd_clear_inode_pending_delete(struct ksmbd_file *fp);
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 581ce9519339..2dc730800f44 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -321,7 +321,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
TRACE("Block @ 0x%llx, %scompressed size %d\n", index - 2,
compressed ? "" : "un", length);
}
- if (length < 0 || length > output->length ||
+ if (length <= 0 || length > output->length ||
(index + length) > msblk->bytes_used) {
res = -EIO;
goto out;
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 723763746238..62972f0ff868 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -173,6 +173,7 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
const struct export_operations squashfs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = squashfs_fh_to_dentry,
.fh_to_parent = squashfs_fh_to_parent,
.get_parent = squashfs_get_parent
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index c6e626b00546..aa3411354e66 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -59,9 +59,9 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
i_uid_write(inode, i_uid);
i_gid_write(inode, i_gid);
inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
- inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime);
- inode->i_atime.tv_sec = inode->i_mtime.tv_sec;
- inode_set_ctime(inode, inode->i_mtime.tv_sec, 0);
+ inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0);
+ inode_set_atime(inode, inode_get_mtime_sec(inode), 0);
+ inode_set_ctime(inode, inode_get_mtime_sec(inode), 0);
inode->i_mode = le16_to_cpu(sqsh_ino->mode);
inode->i_size = 0;
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index a6164fdf9435..5a756e6790b5 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -111,4 +111,4 @@ extern const struct address_space_operations squashfs_symlink_aops;
extern const struct inode_operations squashfs_symlink_inode_ops;
/* xattr.c */
-extern const struct xattr_handler *squashfs_xattr_handlers[];
+extern const struct xattr_handler * const squashfs_xattr_handlers[];
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index e1e3f3dd5a06..ce6608cabd49 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -262,7 +262,7 @@ static const struct xattr_handler *squashfs_xattr_handler(int type)
}
}
-const struct xattr_handler *squashfs_xattr_handlers[] = {
+const struct xattr_handler * const squashfs_xattr_handlers[] = {
&squashfs_xattr_user_handler,
&squashfs_xattr_trusted_handler,
&squashfs_xattr_security_handler,
diff --git a/fs/stack.c b/fs/stack.c
index b5e01bdb5f5f..f18920119944 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -66,8 +66,8 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src)
dest->i_uid = src->i_uid;
dest->i_gid = src->i_gid;
dest->i_rdev = src->i_rdev;
- dest->i_atime = src->i_atime;
- dest->i_mtime = src->i_mtime;
+ inode_set_atime_to_ts(dest, inode_get_atime(src));
+ inode_set_mtime_to_ts(dest, inode_get_mtime(src));
inode_set_ctime_to_ts(dest, inode_get_ctime(src));
dest->i_blkbits = src->i_blkbits;
dest->i_flags = src->i_flags;
diff --git a/fs/stat.c b/fs/stat.c
index d43a5cc1bfa4..f721d26ec3f7 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -57,8 +57,8 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
stat->gid = vfsgid_into_kgid(vfsgid);
stat->rdev = inode->i_rdev;
stat->size = i_size_read(inode);
- stat->atime = inode->i_atime;
- stat->mtime = inode->i_mtime;
+ stat->atime = inode_get_atime(inode);
+ stat->mtime = inode_get_mtime(inode);
stat->ctime = inode_get_ctime(inode);
stat->blksize = i_blocksize(inode);
stat->blocks = inode->i_blocks;
@@ -133,7 +133,8 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
idmap = mnt_idmap(path->mnt);
if (inode->i_op->getattr)
return inode->i_op->getattr(idmap, path, stat,
- request_mask, query_flags);
+ request_mask,
+ query_flags | AT_GETATTR_NOSEC);
generic_fillattr(idmap, request_mask, inode, stat);
return 0;
@@ -166,6 +167,9 @@ int vfs_getattr(const struct path *path, struct kstat *stat,
{
int retval;
+ if (WARN_ON_ONCE(query_flags & AT_GETATTR_NOSEC))
+ return -EPERM;
+
retval = security_inode_getattr(path);
if (retval)
return retval;
diff --git a/fs/super.c b/fs/super.c
index 2d762ce67f6e..076392396e72 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -178,7 +178,7 @@ static void super_wake(struct super_block *sb, unsigned int flag)
* One thing we have to be careful of with a per-sb shrinker is that we don't
* drop the last active reference to the superblock from within the shrinker.
* If that happens we could trigger unregistering the shrinker from within the
- * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
+ * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we
* take a passive reference to the superblock to avoid this from occurring.
*/
static unsigned long super_cache_scan(struct shrinker *shrink,
@@ -191,7 +191,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
long dentries;
long inodes;
- sb = container_of(shrink, struct super_block, s_shrink);
+ sb = shrink->private_data;
/*
* Deadlock avoidance. We may hold various FS locks, and we don't want
@@ -244,7 +244,7 @@ static unsigned long super_cache_count(struct shrinker *shrink,
struct super_block *sb;
long total_objects = 0;
- sb = container_of(shrink, struct super_block, s_shrink);
+ sb = shrink->private_data;
/*
* We don't call super_trylock_shared() here as it is a scalability
@@ -306,7 +306,7 @@ static void destroy_unused_super(struct super_block *s)
security_sb_free(s);
put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
- free_prealloced_shrinker(&s->s_shrink);
+ shrinker_free(s->s_shrink);
/* no delays needed */
destroy_super_work(&s->destroy_work);
}
@@ -383,16 +383,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
s->s_time_min = TIME64_MIN;
s->s_time_max = TIME64_MAX;
- s->s_shrink.seeks = DEFAULT_SEEKS;
- s->s_shrink.scan_objects = super_cache_scan;
- s->s_shrink.count_objects = super_cache_count;
- s->s_shrink.batch = 1024;
- s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
- if (prealloc_shrinker(&s->s_shrink, "sb-%s", type->name))
+ s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
+ "sb-%s", type->name);
+ if (!s->s_shrink)
goto fail;
- if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink))
+
+ s->s_shrink->scan_objects = super_cache_scan;
+ s->s_shrink->count_objects = super_cache_count;
+ s->s_shrink->batch = 1024;
+ s->s_shrink->private_data = s;
+
+ if (list_lru_init_memcg(&s->s_dentry_lru, s->s_shrink))
goto fail;
- if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink))
+ if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink))
goto fail;
return s;
@@ -477,7 +480,7 @@ void deactivate_locked_super(struct super_block *s)
{
struct file_system_type *fs = s->s_type;
if (atomic_dec_and_test(&s->s_active)) {
- unregister_shrinker(&s->s_shrink);
+ shrinker_free(s->s_shrink);
fs->kill_sb(s);
kill_super_notify(s);
@@ -818,7 +821,7 @@ retry:
hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
spin_unlock(&sb_lock);
get_filesystem(s->s_type);
- register_shrinker_prepared(&s->s_shrink);
+ shrinker_register(s->s_shrink);
return s;
share_extant_sb:
@@ -901,7 +904,7 @@ retry:
hlist_add_head(&s->s_instances, &type->fs_supers);
spin_unlock(&sb_lock);
get_filesystem(type);
- register_shrinker_prepared(&s->s_shrink);
+ shrinker_register(s->s_shrink);
return s;
}
EXPORT_SYMBOL(sget);
@@ -1419,32 +1422,48 @@ EXPORT_SYMBOL(sget_dev);
#ifdef CONFIG_BLOCK
/*
- * Lock a super block that the callers holds a reference to.
+ * Lock the superblock that is the holder of the bdev. Returns the superblock
+ * pointer if we successfully locked the superblock and it is alive. Otherwise
+ * we return NULL and just unlock bdev->bd_holder_lock.
*
- * The caller needs to ensure that the super_block isn't being freed while
- * calling this function, e.g. by holding a lock over the call to this function
- * and the place that clears the pointer to the superblock used by this function
- * before freeing the superblock.
+ * The function must be called with bdev->bd_holder_lock held and releases it.
*/
-static bool super_lock_shared_active(struct super_block *sb)
+static struct super_block *bdev_super_lock_shared(struct block_device *bdev)
+ __releases(&bdev->bd_holder_lock)
{
- bool born = super_lock_shared(sb);
+ struct super_block *sb = bdev->bd_holder;
+ bool born;
+
+ lockdep_assert_held(&bdev->bd_holder_lock);
+ lockdep_assert_not_held(&sb->s_umount);
+ lockdep_assert_not_held(&bdev->bd_disk->open_mutex);
+ /* Make sure sb doesn't go away from under us */
+ spin_lock(&sb_lock);
+ sb->s_count++;
+ spin_unlock(&sb_lock);
+ mutex_unlock(&bdev->bd_holder_lock);
+
+ born = super_lock_shared(sb);
if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
super_unlock_shared(sb);
- return false;
+ put_super(sb);
+ return NULL;
}
- return true;
+ /*
+	 * The superblock is active and we hold s_umount, so we can drop our
+ * temporary reference now.
+ */
+ put_super(sb);
+ return sb;
}
static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
{
- struct super_block *sb = bdev->bd_holder;
-
- /* bd_holder_lock ensures that the sb isn't freed */
- lockdep_assert_held(&bdev->bd_holder_lock);
+ struct super_block *sb;
- if (!super_lock_shared_active(sb))
+ sb = bdev_super_lock_shared(bdev);
+ if (!sb)
return;
if (!surprise)
@@ -1459,11 +1478,10 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
static void fs_bdev_sync(struct block_device *bdev)
{
- struct super_block *sb = bdev->bd_holder;
-
- lockdep_assert_held(&bdev->bd_holder_lock);
+ struct super_block *sb;
- if (!super_lock_shared_active(sb))
+ sb = bdev_super_lock_shared(bdev);
+ if (!sb)
return;
sync_filesystem(sb);
super_unlock_shared(sb);
@@ -1479,14 +1497,16 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
struct fs_context *fc)
{
blk_mode_t mode = sb_open_mode(sb_flags);
+ struct bdev_handle *bdev_handle;
struct block_device *bdev;
- bdev = blkdev_get_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
- if (IS_ERR(bdev)) {
+ bdev_handle = bdev_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
+ if (IS_ERR(bdev_handle)) {
if (fc)
errorf(fc, "%s: Can't open blockdev", fc->source);
- return PTR_ERR(bdev);
+ return PTR_ERR(bdev_handle);
}
+ bdev = bdev_handle->bdev;
/*
* This really should be in blkdev_get_by_dev, but right now can't due
@@ -1494,7 +1514,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
* writable from userspace even for a read-only block device.
*/
if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
- blkdev_put(bdev, sb);
+ bdev_release(bdev_handle);
return -EACCES;
}
@@ -1510,10 +1530,11 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
mutex_unlock(&bdev->bd_fsfreeze_mutex);
if (fc)
warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
- blkdev_put(bdev, sb);
+ bdev_release(bdev_handle);
return -EBUSY;
}
spin_lock(&sb_lock);
+ sb->s_bdev_handle = bdev_handle;
sb->s_bdev = bdev;
sb->s_bdi = bdi_get(bdev->bd_disk->bdi);
if (bdev_stable_writes(bdev))
@@ -1522,7 +1543,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
mutex_unlock(&bdev->bd_fsfreeze_mutex);
snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
- shrinker_debugfs_rename(&sb->s_shrink, "sb-%s:%s", sb->s_type->name,
+ shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name,
sb->s_id);
sb_set_blocksize(sb, block_size(bdev));
return 0;
@@ -1646,7 +1667,7 @@ void kill_block_super(struct super_block *sb)
generic_shutdown_super(sb);
if (bdev) {
sync_blockdev(bdev);
- blkdev_put(bdev, sb);
+ bdev_release(sb->s_bdev_handle);
}
}
@@ -2139,3 +2160,4 @@ int sb_init_dio_done_wq(struct super_block *sb)
destroy_workqueue(wq);
return 0;
}
+EXPORT_SYMBOL_GPL(sb_init_dio_done_wq);
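The fs/super.c hunks above move the superblock shrinker from an embedded struct shrinker to the dynamically allocated API (shrinker_alloc(), shrinker_register(), shrinker_free(), with callbacks reaching their owner through private_data). A minimal registration sketch of that pattern, for illustration only and with hypothetical names (my_cache, my_cache_init), assuming the interface shown in the diff:

    #include <linux/errno.h>
    #include <linux/shrinker.h>

    struct my_cache {                       /* made-up cache, for illustration */
            struct shrinker *shrinker;
            unsigned long nr_objects;
    };

    static unsigned long my_cache_count(struct shrinker *shrink,
                                        struct shrink_control *sc)
    {
            struct my_cache *c = shrink->private_data;

            return c->nr_objects ? c->nr_objects : SHRINK_EMPTY;
    }

    static unsigned long my_cache_scan(struct shrinker *shrink,
                                       struct shrink_control *sc)
    {
            /* free up to sc->nr_to_scan objects and return how many were freed */
            return SHRINK_STOP;
    }

    static int my_cache_init(struct my_cache *c)
    {
            c->shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "my-cache");
            if (!c->shrinker)
                    return -ENOMEM;

            c->shrinker->count_objects = my_cache_count;
            c->shrinker->scan_objects = my_cache_scan;
            c->shrinker->batch = 1024;
            c->shrinker->private_data = c;

            shrinker_register(c->shrinker);
            return 0;
    }

    /* Teardown mirrors deactivate_locked_super(): shrinker_free(c->shrinker); */

The diff applies exactly this pattern to the per-superblock shrinker, storing the super_block in private_data instead of using container_of().
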
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index a12ac0356c69..6b7652fb8050 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -167,6 +167,18 @@ static int sysfs_kf_bin_mmap(struct kernfs_open_file *of,
return battr->mmap(of->file, kobj, battr, vma);
}
+static loff_t sysfs_kf_bin_llseek(struct kernfs_open_file *of, loff_t offset,
+ int whence)
+{
+ struct bin_attribute *battr = of->kn->priv;
+ struct kobject *kobj = of->kn->parent->priv;
+
+ if (battr->llseek)
+ return battr->llseek(of->file, kobj, battr, offset, whence);
+ else
+ return generic_file_llseek(of->file, offset, whence);
+}
+
static int sysfs_kf_bin_open(struct kernfs_open_file *of)
{
struct bin_attribute *battr = of->kn->priv;
@@ -249,6 +261,7 @@ static const struct kernfs_ops sysfs_bin_kfops_mmap = {
.write = sysfs_kf_bin_write,
.mmap = sysfs_kf_bin_mmap,
.open = sysfs_kf_bin_open,
+ .llseek = sysfs_kf_bin_llseek,
};
int sysfs_add_file_mode_ns(struct kernfs_node *parent,
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 2f5ead88d00b..2e126d72d619 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -224,7 +224,7 @@ got_it:
memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2);
de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
dir_commit_chunk(page, pos, SYSV_DIRSIZE);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
err = sysv_handle_dirsync(dir);
out_page:
@@ -249,7 +249,7 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page)
}
de->inode = 0;
dir_commit_chunk(page, pos, SYSV_DIRSIZE);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
return sysv_handle_dirsync(inode);
}
@@ -346,7 +346,7 @@ int sysv_set_link(struct sysv_dir_entry *de, struct page *page,
}
de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
dir_commit_chunk(page, pos, SYSV_DIRSIZE);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
return sysv_handle_dirsync(inode);
}
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 6719da5889d9..269df6d49815 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -165,7 +165,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
dirty_sb(sb);
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_ino = fs16_to_cpu(sbi, ino);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_blocks = 0;
memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data));
SYSV_I(inode)->i_dir_start_lookup = 0;
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 0aa3827d8178..5a915b2e68f5 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -200,11 +200,9 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino)
i_gid_write(inode, (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid));
set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink));
inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size);
- inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime);
- inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime);
+ inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->i_atime), 0);
+ inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->i_mtime), 0);
inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->i_ctime), 0);
- inode->i_atime.tv_nsec = 0;
- inode->i_mtime.tv_nsec = 0;
inode->i_blocks = 0;
si = SYSV_I(inode);
@@ -253,9 +251,9 @@ static int __sysv_write_inode(struct inode *inode, int wait)
raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(i_gid_read(inode)));
raw_inode->i_nlink = cpu_to_fs16(sbi, inode->i_nlink);
raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size);
- raw_inode->i_atime = cpu_to_fs32(sbi, inode->i_atime.tv_sec);
- raw_inode->i_mtime = cpu_to_fs32(sbi, inode->i_mtime.tv_sec);
- raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime(inode).tv_sec);
+ raw_inode->i_atime = cpu_to_fs32(sbi, inode_get_atime_sec(inode));
+ raw_inode->i_mtime = cpu_to_fs32(sbi, inode_get_mtime_sec(inode));
+ raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime_sec(inode));
si = SYSV_I(inode);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index edb94e55de8e..725981474e5f 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -423,7 +423,7 @@ do_indirects:
}
n++;
}
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
if (IS_SYNC(inode))
sysv_sync_inode (inode);
else
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 8c8d64e76103..f0677ea0ec24 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -2,8 +2,9 @@
/*
* event_inode.c - part of tracefs, a pseudo file system for activating tracing
*
- * Copyright (C) 2020-23 VMware Inc, author: Steven Rostedt (VMware) <rostedt@goodmis.org>
+ * Copyright (C) 2020-23 VMware Inc, author: Steven Rostedt <rostedt@goodmis.org>
* Copyright (C) 2020-23 VMware Inc, author: Ajay Kaher <akaher@vmware.com>
+ * Copyright (C) 2023 Google, author: Steven Rostedt <rostedt@goodmis.org>
*
* eventfs is used to dynamically create inodes and dentries based on the
* meta data provided by the tracing system.
@@ -23,48 +24,30 @@
#include <linux/delay.h>
#include "internal.h"
-struct eventfs_inode {
- struct list_head e_top_files;
-};
+/*
+ * eventfs_mutex protects the eventfs_inode (ei) dentry. Any access
+ * to the ei->dentry must be done under this mutex and after checking
+ * that ei->is_freed is not set. When ei->is_freed is set, the dentry
+ * is on its way to being freed after the last dput() is made on it.
+ */
+static DEFINE_MUTEX(eventfs_mutex);
/*
- * struct eventfs_file - hold the properties of the eventfs files and
- * directories.
- * @name: the name of the file or directory to create
- * @d_parent: holds parent's dentry
- * @dentry: once accessed holds dentry
- * @list: file or directory to be added to parent directory
- * @ei: list of files and directories within directory
- * @fop: file_operations for file or directory
- * @iop: inode_operations for file or directory
- * @data: something that the caller will want to get to later on
- * @mode: the permission that the file or directory should have
+ * The eventfs_inode (ei) itself is protected by SRCU. It is released from
+ * its parent's list and will have is_freed set (under eventfs_mutex).
+ * After the SRCU grace period is over and the last dput() is called,
+ * the ei is freed.
*/
-struct eventfs_file {
- const char *name;
- struct dentry *d_parent;
- struct dentry *dentry;
- struct list_head list;
- struct eventfs_inode *ei;
- const struct file_operations *fop;
- const struct inode_operations *iop;
- /*
- * Union - used for deletion
- * @del_list: list of eventfs_file to delete
- * @rcu: eventfs_file to delete in RCU
- * @is_freed: node is freed if one of the above is set
- */
- union {
- struct list_head del_list;
- struct rcu_head rcu;
- unsigned long is_freed;
- };
- void *data;
- umode_t mode;
+DEFINE_STATIC_SRCU(eventfs_srcu);
+
+/* Mode is unsigned short, use the upper bits for flags */
+enum {
+ EVENTFS_SAVE_MODE = BIT(16),
+ EVENTFS_SAVE_UID = BIT(17),
+ EVENTFS_SAVE_GID = BIT(18),
};
-static DEFINE_MUTEX(eventfs_mutex);
-DEFINE_STATIC_SRCU(eventfs_srcu);
+#define EVENTFS_MODE_MASK (EVENTFS_SAVE_MODE - 1)
static struct dentry *eventfs_root_lookup(struct inode *dir,
struct dentry *dentry,
@@ -73,8 +56,95 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file);
static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx);
static int eventfs_release(struct inode *inode, struct file *file);
+static void update_attr(struct eventfs_attr *attr, struct iattr *iattr)
+{
+ unsigned int ia_valid = iattr->ia_valid;
+
+ if (ia_valid & ATTR_MODE) {
+ attr->mode = (attr->mode & ~EVENTFS_MODE_MASK) |
+ (iattr->ia_mode & EVENTFS_MODE_MASK) |
+ EVENTFS_SAVE_MODE;
+ }
+ if (ia_valid & ATTR_UID) {
+ attr->mode |= EVENTFS_SAVE_UID;
+ attr->uid = iattr->ia_uid;
+ }
+ if (ia_valid & ATTR_GID) {
+ attr->mode |= EVENTFS_SAVE_GID;
+ attr->gid = iattr->ia_gid;
+ }
+}
+
+static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *iattr)
+{
+ const struct eventfs_entry *entry;
+ struct eventfs_inode *ei;
+ const char *name;
+ int ret;
+
+ mutex_lock(&eventfs_mutex);
+ ei = dentry->d_fsdata;
+ if (ei->is_freed) {
+ /* Do not allow changes if the event is about to be removed. */
+ mutex_unlock(&eventfs_mutex);
+ return -ENODEV;
+ }
+
+ /* Preallocate the children mode array if necessary */
+ if (!(dentry->d_inode->i_mode & S_IFDIR)) {
+ if (!ei->entry_attrs) {
+ ei->entry_attrs = kzalloc(sizeof(*ei->entry_attrs) * ei->nr_entries,
+ GFP_NOFS);
+ if (!ei->entry_attrs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ }
+
+ ret = simple_setattr(idmap, dentry, iattr);
+ if (ret < 0)
+ goto out;
+
+ /*
+	 * If this is a dir, then update the ei cache; only the file
+ * mode is saved in the ei->m_children, and the ownership is
+ * determined by the parent directory.
+ */
+ if (dentry->d_inode->i_mode & S_IFDIR) {
+ /*
+		 * The events directory dentry is never freed, unless it's
+		 * part of an instance that is deleted. Its attr is the
+		 * default for its child files and directories.
+		 * Do not update it. It's not used for its own mode or ownership.
+ */
+ if (!ei->is_events)
+ update_attr(&ei->attr, iattr);
+
+ } else {
+ name = dentry->d_name.name;
+
+ for (int i = 0; i < ei->nr_entries; i++) {
+ entry = &ei->entries[i];
+ if (strcmp(name, entry->name) == 0) {
+ update_attr(&ei->entry_attrs[i], iattr);
+ break;
+ }
+ }
+ }
+ out:
+ mutex_unlock(&eventfs_mutex);
+ return ret;
+}
+
static const struct inode_operations eventfs_root_dir_inode_operations = {
.lookup = eventfs_root_lookup,
+ .setattr = eventfs_set_attr,
+};
+
+static const struct inode_operations eventfs_file_inode_operations = {
+ .setattr = eventfs_set_attr,
};
static const struct file_operations eventfs_file_operations = {
@@ -85,26 +155,110 @@ static const struct file_operations eventfs_file_operations = {
.release = eventfs_release,
};
+/* Return the eventfs_inode of the "events" directory */
+static struct eventfs_inode *eventfs_find_events(struct dentry *dentry)
+{
+ struct eventfs_inode *ei;
+
+ mutex_lock(&eventfs_mutex);
+ do {
+ /* The parent always has an ei, except for events itself */
+ ei = dentry->d_parent->d_fsdata;
+
+ /*
+ * If the ei is being freed, the ownership of the children
+ * doesn't matter.
+ */
+ if (ei->is_freed) {
+ ei = NULL;
+ break;
+ }
+
+ dentry = ei->dentry;
+ } while (!ei->is_events);
+ mutex_unlock(&eventfs_mutex);
+
+ return ei;
+}
+
+static void update_inode_attr(struct dentry *dentry, struct inode *inode,
+ struct eventfs_attr *attr, umode_t mode)
+{
+ struct eventfs_inode *events_ei = eventfs_find_events(dentry);
+
+ if (!events_ei)
+ return;
+
+ inode->i_mode = mode;
+ inode->i_uid = events_ei->attr.uid;
+ inode->i_gid = events_ei->attr.gid;
+
+ if (!attr)
+ return;
+
+ if (attr->mode & EVENTFS_SAVE_MODE)
+ inode->i_mode = attr->mode & EVENTFS_MODE_MASK;
+
+ if (attr->mode & EVENTFS_SAVE_UID)
+ inode->i_uid = attr->uid;
+
+ if (attr->mode & EVENTFS_SAVE_GID)
+ inode->i_gid = attr->gid;
+}
+
+static void update_gid(struct eventfs_inode *ei, kgid_t gid, int level)
+{
+ struct eventfs_inode *ei_child;
+
+ /* at most we have events/system/event */
+ if (WARN_ON_ONCE(level > 3))
+ return;
+
+ ei->attr.gid = gid;
+
+ if (ei->entry_attrs) {
+ for (int i = 0; i < ei->nr_entries; i++) {
+ ei->entry_attrs[i].gid = gid;
+ }
+ }
+
+ /*
+	 * Only eventfs_inodes with dentries are updated; to make sure
+	 * all eventfs_inodes are updated, if one of the children does
+	 * not have a dentry, this function must traverse into it.
+ */
+ list_for_each_entry_srcu(ei_child, &ei->children, list,
+ srcu_read_lock_held(&eventfs_srcu)) {
+ if (!ei_child->dentry)
+ update_gid(ei_child, gid, level + 1);
+ }
+}
+
+void eventfs_update_gid(struct dentry *dentry, kgid_t gid)
+{
+ struct eventfs_inode *ei = dentry->d_fsdata;
+ int idx;
+
+ idx = srcu_read_lock(&eventfs_srcu);
+ update_gid(ei, gid, 0);
+ srcu_read_unlock(&eventfs_srcu, idx);
+}
+
/**
* create_file - create a file in the tracefs filesystem
* @name: the name of the file to create.
* @mode: the permission that the file should have.
+ * @attr: saved attributes changed by user
* @parent: parent dentry for this file.
* @data: something that the caller will want to get to later on.
* @fop: struct file_operations that should be used for this file.
*
- * This is the basic "create a file" function for tracefs. It allows for a
- * wide range of flexibility in creating a file.
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the tracefs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %NULL will be returned.
- *
- * If tracefs is not enabled in the kernel, the value -%ENODEV will be
- * returned.
+ * This function creates a dentry that represents a file in the eventfs_inode
+ * directory. The inode.i_private pointer will point to @data in the open()
+ * call.
*/
static struct dentry *create_file(const char *name, umode_t mode,
+ struct eventfs_attr *attr,
struct dentry *parent, void *data,
const struct file_operations *fop)
{
@@ -118,6 +272,7 @@ static struct dentry *create_file(const char *name, umode_t mode,
if (WARN_ON_ONCE(!S_ISREG(mode)))
return NULL;
+ WARN_ON_ONCE(!parent);
dentry = eventfs_start_creating(name, parent);
if (IS_ERR(dentry))
@@ -127,7 +282,10 @@ static struct dentry *create_file(const char *name, umode_t mode,
if (unlikely(!inode))
return eventfs_failed_creating(dentry);
- inode->i_mode = mode;
+ /* If the user updated the directory's attributes, use them */
+ update_inode_attr(dentry, inode, attr, mode);
+
+ inode->i_op = &eventfs_file_inode_operations;
inode->i_fop = fop;
inode->i_private = data;
@@ -140,28 +298,19 @@ static struct dentry *create_file(const char *name, umode_t mode,
/**
* create_dir - create a dir in the tracefs filesystem
- * @name: the name of the file to create.
+ * @ei: the eventfs_inode that represents the directory to create
* @parent: parent dentry for this file.
- * @data: something that the caller will want to get to later on.
- *
- * This is the basic "create a dir" function for eventfs. It allows for a
- * wide range of flexibility in creating a dir.
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the tracefs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %NULL will be returned.
*
- * If tracefs is not enabled in the kernel, the value -%ENODEV will be
- * returned.
+ * This function will create a dentry for a directory represented by
+ * an eventfs_inode.
*/
-static struct dentry *create_dir(const char *name, struct dentry *parent, void *data)
+static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent)
{
struct tracefs_inode *ti;
struct dentry *dentry;
struct inode *inode;
- dentry = eventfs_start_creating(name, parent);
+ dentry = eventfs_start_creating(ei->name, parent);
if (IS_ERR(dentry))
return dentry;
@@ -169,10 +318,12 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void *
if (unlikely(!inode))
return eventfs_failed_creating(dentry);
- inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ /* If the user updated the directory's attributes, use them */
+ update_inode_attr(dentry, inode, &ei->attr,
+ S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO);
+
inode->i_op = &eventfs_root_dir_inode_operations;
inode->i_fop = &eventfs_file_operations;
- inode->i_private = data;
ti = get_tracefs(inode);
ti->flags |= TRACEFS_EVENT_INODE;
@@ -184,117 +335,192 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void *
return eventfs_end_creating(dentry);
}
+static void free_ei(struct eventfs_inode *ei)
+{
+ kfree_const(ei->name);
+ kfree(ei->d_children);
+ kfree(ei->entry_attrs);
+ kfree(ei);
+}
+
/**
- * eventfs_set_ef_status_free - set the ef->status to free
+ * eventfs_set_ei_status_free - remove the dentry reference from an eventfs_inode
* @ti: the tracefs_inode of the dentry
- * @dentry: dentry who's status to be freed
+ * @dentry: dentry which has the reference to remove.
*
- * eventfs_set_ef_status_free will be called if no more
- * references remain
+ * Remove the association between a dentry and an eventfs_inode.
*/
-void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry)
+void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry)
{
- struct tracefs_inode *ti_parent;
struct eventfs_inode *ei;
- struct eventfs_file *ef, *tmp;
-
- /* The top level events directory may be freed by this */
- if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) {
- LIST_HEAD(ef_del_list);
+ int i;
- mutex_lock(&eventfs_mutex);
+ mutex_lock(&eventfs_mutex);
- ei = ti->private;
+ ei = dentry->d_fsdata;
+ if (!ei)
+ goto out;
- /* Record all the top level files */
- list_for_each_entry_srcu(ef, &ei->e_top_files, list,
- lockdep_is_held(&eventfs_mutex)) {
- list_add_tail(&ef->del_list, &ef_del_list);
+ /* This could belong to one of the files of the ei */
+ if (ei->dentry != dentry) {
+ for (i = 0; i < ei->nr_entries; i++) {
+ if (ei->d_children[i] == dentry)
+ break;
}
+ if (WARN_ON_ONCE(i == ei->nr_entries))
+ goto out;
+ ei->d_children[i] = NULL;
+ } else if (ei->is_freed) {
+ free_ei(ei);
+ } else {
+ ei->dentry = NULL;
+ }
- /* Nothing should access this, but just in case! */
- ti->private = NULL;
+ dentry->d_fsdata = NULL;
+ out:
+ mutex_unlock(&eventfs_mutex);
+}
- mutex_unlock(&eventfs_mutex);
+/**
+ * create_file_dentry - create a dentry for a file of an eventfs_inode
+ * @ei: the eventfs_inode that the file will be created under
+ * @idx: the index into the d_children[] of the @ei
+ * @parent: The parent dentry of the created file.
+ * @name: The name of the file to create
+ * @mode: The mode of the file.
+ * @data: The data to use to set the inode of the file with on open()
+ * @fops: The fops of the file to be created.
+ * @lookup: True if called by the lookup routine, in which case dput() the created dentry.
+ *
+ * Create a dentry for a file of an eventfs_inode @ei and place it into the
+ * address located at @e_dentry. If the @e_dentry already has a dentry, then
+ * just do a dget() on it and return. Otherwise create the dentry and attach it.
+ */
+static struct dentry *
+create_file_dentry(struct eventfs_inode *ei, int idx,
+ struct dentry *parent, const char *name, umode_t mode, void *data,
+ const struct file_operations *fops, bool lookup)
+{
+ struct eventfs_attr *attr = NULL;
+ struct dentry **e_dentry = &ei->d_children[idx];
+ struct dentry *dentry;
- /* Now safely free the top level files and their children */
- list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) {
- list_del(&ef->del_list);
- eventfs_remove(ef);
- }
+ WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
- kfree(ei);
- return;
+ mutex_lock(&eventfs_mutex);
+ if (ei->is_freed) {
+ mutex_unlock(&eventfs_mutex);
+ return NULL;
+ }
+ /* If the e_dentry already has a dentry, use it */
+ if (*e_dentry) {
+ /* lookup does not need to up the ref count */
+ if (!lookup)
+ dget(*e_dentry);
+ mutex_unlock(&eventfs_mutex);
+ return *e_dentry;
}
- mutex_lock(&eventfs_mutex);
+ /* ei->entry_attrs are protected by SRCU */
+ if (ei->entry_attrs)
+ attr = &ei->entry_attrs[idx];
- ti_parent = get_tracefs(dentry->d_parent->d_inode);
- if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE))
- goto out;
+ mutex_unlock(&eventfs_mutex);
- ef = dentry->d_fsdata;
- if (!ef)
- goto out;
+ dentry = create_file(name, mode, attr, parent, data, fops);
- /*
- * If ef was freed, then the LSB bit is set for d_fsdata.
- * But this should not happen, as it should still have a
- * ref count that prevents it. Warn in case it does.
- */
- if (WARN_ON_ONCE((unsigned long)ef & 1))
- goto out;
+ mutex_lock(&eventfs_mutex);
- dentry->d_fsdata = NULL;
- ef->dentry = NULL;
-out:
+ if (IS_ERR_OR_NULL(dentry)) {
+ /*
+ * When the mutex was released, something else could have
+ * created the dentry for this e_dentry. In which case
+ * use that one.
+ *
+ * If ei->is_freed is set, the e_dentry is currently on its
+ * way to being freed, don't return it. If e_dentry is NULL
+ * it means it was already freed.
+ */
+ if (ei->is_freed)
+ dentry = NULL;
+ else
+ dentry = *e_dentry;
+ /* The lookup does not need to up the dentry refcount */
+ if (dentry && !lookup)
+ dget(dentry);
+ mutex_unlock(&eventfs_mutex);
+ return dentry;
+ }
+
+ if (!*e_dentry && !ei->is_freed) {
+ *e_dentry = dentry;
+ dentry->d_fsdata = ei;
+ } else {
+ /*
+ * Should never happen unless we get here due to being freed.
+ * Otherwise it means two dentries exist with the same name.
+ */
+ WARN_ON_ONCE(!ei->is_freed);
+ dentry = NULL;
+ }
mutex_unlock(&eventfs_mutex);
+
+ if (lookup)
+ dput(dentry);
+
+ return dentry;
}
/**
* eventfs_post_create_dir - post create dir routine
- * @ef: eventfs_file of recently created dir
+ * @ei: eventfs_inode of recently created dir
*
* Map the meta-data of files within an eventfs dir to their parent dentry
*/
-static void eventfs_post_create_dir(struct eventfs_file *ef)
+static void eventfs_post_create_dir(struct eventfs_inode *ei)
{
- struct eventfs_file *ef_child;
+ struct eventfs_inode *ei_child;
struct tracefs_inode *ti;
+ lockdep_assert_held(&eventfs_mutex);
+
/* srcu lock already held */
/* fill parent-child relation */
- list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list,
+ list_for_each_entry_srcu(ei_child, &ei->children, list,
srcu_read_lock_held(&eventfs_srcu)) {
- ef_child->d_parent = ef->dentry;
+ ei_child->d_parent = ei->dentry;
}
- ti = get_tracefs(ef->dentry->d_inode);
- ti->private = ef->ei;
+ ti = get_tracefs(ei->dentry->d_inode);
+ ti->private = ei;
}
/**
- * create_dentry - helper function to create dentry
- * @ef: eventfs_file of file or directory to create
- * @parent: parent dentry
- * @lookup: true if called from lookup routine
+ * create_dir_dentry - Create a directory dentry for the eventfs_inode
+ * @pei: The eventfs_inode parent of ei.
+ * @ei: The eventfs_inode to create the directory for
+ * @parent: The dentry of the parent of this directory
+ * @lookup: True if this is called by the lookup code
*
- * Used to create a dentry for file/dir, executes post dentry creation routine
+ * This creates and attaches a directory dentry to the eventfs_inode @ei.
*/
static struct dentry *
-create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup)
+create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei,
+ struct dentry *parent, bool lookup)
{
- bool invalidate = false;
- struct dentry *dentry;
+ struct dentry *dentry = NULL;
+
+ WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
mutex_lock(&eventfs_mutex);
- if (ef->is_freed) {
+ if (pei->is_freed || ei->is_freed) {
mutex_unlock(&eventfs_mutex);
return NULL;
}
- if (ef->dentry) {
- dentry = ef->dentry;
- /* On dir open, up the ref count */
+ if (ei->dentry) {
+ /* If the dentry already has a dentry, use it */
+ dentry = ei->dentry;
+ /* lookup does not need to up the ref count */
if (!lookup)
dget(dentry);
mutex_unlock(&eventfs_mutex);
@@ -302,97 +528,134 @@ create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup)
}
mutex_unlock(&eventfs_mutex);
- if (!lookup)
- inode_lock(parent->d_inode);
-
- if (ef->ei)
- dentry = create_dir(ef->name, parent, ef->data);
- else
- dentry = create_file(ef->name, ef->mode, parent,
- ef->data, ef->fop);
-
- if (!lookup)
- inode_unlock(parent->d_inode);
+ dentry = create_dir(ei, parent);
mutex_lock(&eventfs_mutex);
- if (IS_ERR_OR_NULL(dentry)) {
- /* If the ef was already updated get it */
- dentry = ef->dentry;
+
+ if (IS_ERR_OR_NULL(dentry) && !ei->is_freed) {
+ /*
+ * When the mutex was released, something else could have
+ * created the dentry for this e_dentry. In which case
+ * use that one.
+ *
+ * If ei->is_freed is set, the e_dentry is currently on its
+ * way to being freed.
+ */
+ dentry = ei->dentry;
if (dentry && !lookup)
dget(dentry);
mutex_unlock(&eventfs_mutex);
return dentry;
}
- if (!ef->dentry && !ef->is_freed) {
- ef->dentry = dentry;
- if (ef->ei)
- eventfs_post_create_dir(ef);
- dentry->d_fsdata = ef;
+ if (!ei->dentry && !ei->is_freed) {
+ ei->dentry = dentry;
+ eventfs_post_create_dir(ei);
+ dentry->d_fsdata = ei;
} else {
- /* A race here, should try again (unless freed) */
- invalidate = true;
-
/*
* Should never happen unless we get here due to being freed.
* Otherwise it means two dentries exist with the same name.
*/
- WARN_ON_ONCE(!ef->is_freed);
+ WARN_ON_ONCE(!ei->is_freed);
+ dentry = NULL;
}
mutex_unlock(&eventfs_mutex);
- if (invalidate)
- d_invalidate(dentry);
- if (lookup || invalidate)
+ if (lookup)
dput(dentry);
- return invalidate ? NULL : dentry;
-}
-
-static bool match_event_file(struct eventfs_file *ef, const char *name)
-{
- bool ret;
-
- mutex_lock(&eventfs_mutex);
- ret = !ef->is_freed && strcmp(ef->name, name) == 0;
- mutex_unlock(&eventfs_mutex);
-
- return ret;
+ return dentry;
}
/**
* eventfs_root_lookup - lookup routine to create file/dir
* @dir: in which a lookup is being done
* @dentry: file/dir dentry
- * @flags: to pass as flags parameter to simple lookup
+ * @flags: Just passed to simple_lookup()
*
- * Used to create a dynamic file/dir within @dir. Use the eventfs_inode
- * list of meta data to find the information needed to create the file/dir.
+ * Used to create a dynamic file/dir within @dir. Searches the @ei
+ * list; if @dentry is found, go ahead and create the file/dir.
*/
+
static struct dentry *eventfs_root_lookup(struct inode *dir,
struct dentry *dentry,
unsigned int flags)
{
+ const struct file_operations *fops;
+ const struct eventfs_entry *entry;
+ struct eventfs_inode *ei_child;
struct tracefs_inode *ti;
struct eventfs_inode *ei;
- struct eventfs_file *ef;
+ struct dentry *ei_dentry = NULL;
struct dentry *ret = NULL;
+ const char *name = dentry->d_name.name;
+ bool created = false;
+ umode_t mode;
+ void *data;
int idx;
+ int i;
+ int r;
ti = get_tracefs(dir);
if (!(ti->flags & TRACEFS_EVENT_INODE))
return NULL;
- ei = ti->private;
+ /* Grab srcu to prevent the ei from going away */
idx = srcu_read_lock(&eventfs_srcu);
- list_for_each_entry_srcu(ef, &ei->e_top_files, list,
+
+ /*
+	 * Grab the eventfs_mutex to get a consistent value from ti->private.
+ */
+ mutex_lock(&eventfs_mutex);
+ ei = READ_ONCE(ti->private);
+ if (ei && !ei->is_freed)
+ ei_dentry = READ_ONCE(ei->dentry);
+ mutex_unlock(&eventfs_mutex);
+
+ if (!ei || !ei_dentry)
+ goto out;
+
+ data = ei->data;
+
+ list_for_each_entry_srcu(ei_child, &ei->children, list,
srcu_read_lock_held(&eventfs_srcu)) {
- if (!match_event_file(ef, dentry->d_name.name))
+ if (strcmp(ei_child->name, name) != 0)
continue;
ret = simple_lookup(dir, dentry, flags);
- create_dentry(ef, ef->d_parent, true);
+ if (IS_ERR(ret))
+ goto out;
+ create_dir_dentry(ei, ei_child, ei_dentry, true);
+ created = true;
break;
}
+
+ if (created)
+ goto out;
+
+ for (i = 0; i < ei->nr_entries; i++) {
+ entry = &ei->entries[i];
+ if (strcmp(name, entry->name) == 0) {
+ void *cdata = data;
+ mutex_lock(&eventfs_mutex);
+ /* If ei->is_freed, then the event itself may be too */
+ if (!ei->is_freed)
+ r = entry->callback(name, &mode, &cdata, &fops);
+ else
+ r = -1;
+ mutex_unlock(&eventfs_mutex);
+ if (r <= 0)
+ continue;
+ ret = simple_lookup(dir, dentry, flags);
+ if (IS_ERR(ret))
+ goto out;
+ create_file_dentry(ei, i, ei_dentry, name, mode, cdata,
+ fops, true);
+ break;
+ }
+ }
+ out:
srcu_read_unlock(&eventfs_srcu, idx);
return ret;
}
@@ -432,29 +695,48 @@ static int eventfs_release(struct inode *inode, struct file *file)
return dcache_dir_close(inode, file);
}
+static int add_dentries(struct dentry ***dentries, struct dentry *d, int cnt)
+{
+ struct dentry **tmp;
+
+ tmp = krealloc(*dentries, sizeof(d) * (cnt + 2), GFP_NOFS);
+ if (!tmp)
+ return -1;
+ tmp[cnt] = d;
+ tmp[cnt + 1] = NULL;
+ *dentries = tmp;
+ return 0;
+}
+
/**
* dcache_dir_open_wrapper - eventfs open wrapper
* @inode: not used
- * @file: dir to be opened (to create its child)
+ * @file: dir to be opened (to create its children)
*
- * Used to dynamically create the file/dir within @file. @file is really a
- * directory and all the files/dirs of the children within @file will be
- * created. If any of the files/dirs have already been created, their
- * reference count will be incremented.
+ * Used to dynamically create the files/dirs within @file. All of the
+ * files/dirs will be created. If any have already been created, their
+ * references will be increased.
*/
static int dcache_dir_open_wrapper(struct inode *inode, struct file *file)
{
+ const struct file_operations *fops;
+ const struct eventfs_entry *entry;
+ struct eventfs_inode *ei_child;
struct tracefs_inode *ti;
struct eventfs_inode *ei;
- struct eventfs_file *ef;
struct dentry_list *dlist;
struct dentry **dentries = NULL;
- struct dentry *dentry = file_dentry(file);
+ struct dentry *parent = file_dentry(file);
struct dentry *d;
struct inode *f_inode = file_inode(file);
+ const char *name = parent->d_name.name;
+ umode_t mode;
+ void *data;
int cnt = 0;
int idx;
int ret;
+ int i;
+ int r;
ti = get_tracefs(f_inode);
if (!(ti->flags & TRACEFS_EVENT_INODE))
@@ -463,27 +745,60 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file)
if (WARN_ON_ONCE(file->private_data))
return -EINVAL;
+ idx = srcu_read_lock(&eventfs_srcu);
+
+ mutex_lock(&eventfs_mutex);
+ ei = READ_ONCE(ti->private);
+ mutex_unlock(&eventfs_mutex);
+
+ if (!ei) {
+ srcu_read_unlock(&eventfs_srcu, idx);
+ return -EINVAL;
+ }
+
+
+ data = ei->data;
+
dlist = kmalloc(sizeof(*dlist), GFP_KERNEL);
- if (!dlist)
+ if (!dlist) {
+ srcu_read_unlock(&eventfs_srcu, idx);
return -ENOMEM;
+ }
- ei = ti->private;
- idx = srcu_read_lock(&eventfs_srcu);
- list_for_each_entry_srcu(ef, &ei->e_top_files, list,
+ inode_lock(parent->d_inode);
+ list_for_each_entry_srcu(ei_child, &ei->children, list,
srcu_read_lock_held(&eventfs_srcu)) {
- d = create_dentry(ef, dentry, false);
+ d = create_dir_dentry(ei, ei_child, parent, false);
if (d) {
- struct dentry **tmp;
+ ret = add_dentries(&dentries, d, cnt);
+ if (ret < 0)
+ break;
+ cnt++;
+ }
+ }
- tmp = krealloc(dentries, sizeof(d) * (cnt + 2), GFP_KERNEL);
- if (!tmp)
+ for (i = 0; i < ei->nr_entries; i++) {
+ void *cdata = data;
+ entry = &ei->entries[i];
+ name = entry->name;
+ mutex_lock(&eventfs_mutex);
+ /* If ei->is_freed, then the event itself may be too */
+ if (!ei->is_freed)
+ r = entry->callback(name, &mode, &cdata, &fops);
+ else
+ r = -1;
+ mutex_unlock(&eventfs_mutex);
+ if (r <= 0)
+ continue;
+ d = create_file_dentry(ei, i, parent, name, mode, cdata, fops, false);
+ if (d) {
+ ret = add_dentries(&dentries, d, cnt);
+ if (ret < 0)
break;
- tmp[cnt] = d;
- tmp[cnt + 1] = NULL;
cnt++;
- dentries = tmp;
}
}
+ inode_unlock(parent->d_inode);
srcu_read_unlock(&eventfs_srcu, idx);
ret = dcache_dir_open(inode, file);
@@ -514,287 +829,253 @@ static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx)
}
/**
- * eventfs_prepare_ef - helper function to prepare eventfs_file
- * @name: the name of the file/directory to create.
- * @mode: the permission that the file should have.
- * @fop: struct file_operations that should be used for this file/directory.
- * @iop: struct inode_operations that should be used for this file/directory.
- * @data: something that the caller will want to get to later on. The
- * inode.i_private pointer will point to this value on the open() call.
+ * eventfs_create_dir - Create the eventfs_inode for this directory
+ * @name: The name of the directory to create.
+ * @parent: The eventfs_inode of the parent directory.
+ * @entries: A list of entries that represent the files under this directory
+ * @size: The number of @entries
+ * @data: The default data to pass to the files (an entry may override it).
+ *
+ * This function creates the descriptor to represent a directory in the
+ * eventfs. This descriptor is an eventfs_inode, and it is returned to be
+ * used to create other children underneath.
*
- * This function allocates and fills the eventfs_file structure.
+ * The @entries is an array of eventfs_entry structures which has:
+ * const char *name
+ * eventfs_callback callback;
+ *
+ * The name is the name of the file, and the callback is a pointer to a function
+ * that will be called when the file is referenced (either by lookup or by
+ * reading a directory). The callback is of the prototype:
+ *
+ * int callback(const char *name, umode_t *mode, void **data,
+ * const struct file_operations **fops);
+ *
+ * When a file needs to be created, this callback will be called with
+ * name = the name of the file being created (so that the same callback
+ * may be used for multiple files).
+ * mode = a place to set the file's mode
+ * data = A pointer to @data, and the callback may replace it, which will
+ * cause the file created to pass the new data to the open() call.
+ * fops = the fops to use for the created file.
+ *
+ * NB. @callback is called while holding internal locks of the eventfs
+ * system. The callback must not call any code that might also call into
+ * the tracefs or eventfs system or it will risk creating a deadlock.
*/
-static struct eventfs_file *eventfs_prepare_ef(const char *name, umode_t mode,
- const struct file_operations *fop,
- const struct inode_operations *iop,
- void *data)
+struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
+ const struct eventfs_entry *entries,
+ int size, void *data)
{
- struct eventfs_file *ef;
+ struct eventfs_inode *ei;
- ef = kzalloc(sizeof(*ef), GFP_KERNEL);
- if (!ef)
+ if (!parent)
+ return ERR_PTR(-EINVAL);
+
+ ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+ if (!ei)
return ERR_PTR(-ENOMEM);
- ef->name = kstrdup(name, GFP_KERNEL);
- if (!ef->name) {
- kfree(ef);
+ ei->name = kstrdup_const(name, GFP_KERNEL);
+ if (!ei->name) {
+ kfree(ei);
return ERR_PTR(-ENOMEM);
}
- if (S_ISDIR(mode)) {
- ef->ei = kzalloc(sizeof(*ef->ei), GFP_KERNEL);
- if (!ef->ei) {
- kfree(ef->name);
- kfree(ef);
+ if (size) {
+ ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL);
+ if (!ei->d_children) {
+ kfree_const(ei->name);
+ kfree(ei);
return ERR_PTR(-ENOMEM);
}
- INIT_LIST_HEAD(&ef->ei->e_top_files);
- } else {
- ef->ei = NULL;
}
- ef->iop = iop;
- ef->fop = fop;
- ef->mode = mode;
- ef->data = data;
- return ef;
+ ei->entries = entries;
+ ei->nr_entries = size;
+ ei->data = data;
+ INIT_LIST_HEAD(&ei->children);
+ INIT_LIST_HEAD(&ei->list);
+
+ mutex_lock(&eventfs_mutex);
+ if (!parent->is_freed) {
+ list_add_tail(&ei->list, &parent->children);
+ ei->d_parent = parent->dentry;
+ }
+ mutex_unlock(&eventfs_mutex);
+
+ /* Was the parent freed? */
+ if (list_empty(&ei->list)) {
+ free_ei(ei);
+ ei = NULL;
+ }
+ return ei;
}
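As a concrete reading of the callback contract documented above: a return value greater than zero asks eventfs to create the file with the mode and fops the callback filled in, while zero or a negative value skips the entry, matching the r <= 0 checks in the lookup and open paths. A minimal sketch follows, assuming the usual kernel headers; my_event_fops and my_events_callback are hypothetical names, not part of this patch:

/* Hypothetical fops supplied by the subsystem using eventfs. */
static const struct file_operations my_event_fops;

static int my_events_callback(const char *name, umode_t *mode, void **data,
			      const struct file_operations **fops)
{
	/* The same callback may serve several entries; dispatch on @name. */
	if (strcmp(name, "enable") == 0) {
		*mode = 0644;
		*fops = &my_event_fops;
		return 1;	/* > 0: create this file */
	}
	return 0;		/* <= 0: skip this entry */
}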
/**
- * eventfs_create_events_dir - create the trace event structure
- * @name: the name of the directory to create.
- * @parent: parent dentry for this file. This should be a directory dentry
- * if set. If this parameter is NULL, then the directory will be
- * created in the root of the tracefs filesystem.
+ * eventfs_create_events_dir - create the top level events directory
+ * @name: The name of the top level directory to create.
+ * @parent: Parent dentry for this file in the tracefs directory.
+ * @entries: A list of entries that represent the files under this directory
+ * @size: The number of @entries
+ * @data: The default data to pass to the files (an entry may override it).
*
* This function creates the top of the trace event directory.
+ *
+ * See eventfs_create_dir() for use of @entries.
*/
-struct dentry *eventfs_create_events_dir(const char *name,
- struct dentry *parent)
+struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
+ const struct eventfs_entry *entries,
+ int size, void *data)
{
struct dentry *dentry = tracefs_start_creating(name, parent);
struct eventfs_inode *ei;
struct tracefs_inode *ti;
struct inode *inode;
+ kuid_t uid;
+ kgid_t gid;
if (security_locked_down(LOCKDOWN_TRACEFS))
return NULL;
if (IS_ERR(dentry))
- return dentry;
+ return ERR_CAST(dentry);
ei = kzalloc(sizeof(*ei), GFP_KERNEL);
if (!ei)
- return ERR_PTR(-ENOMEM);
+ goto fail_ei;
+
inode = tracefs_get_inode(dentry->d_sb);
- if (unlikely(!inode)) {
- kfree(ei);
- tracefs_failed_creating(dentry);
- return ERR_PTR(-ENOMEM);
+ if (unlikely(!inode))
+ goto fail;
+
+ if (size) {
+ ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL);
+ if (!ei->d_children)
+ goto fail;
}
- INIT_LIST_HEAD(&ei->e_top_files);
+ ei->dentry = dentry;
+ ei->entries = entries;
+ ei->nr_entries = size;
+ ei->is_events = 1;
+ ei->data = data;
+ ei->name = kstrdup_const(name, GFP_KERNEL);
+ if (!ei->name)
+ goto fail;
+
+ /* Save the ownership of this directory */
+ uid = d_inode(dentry->d_parent)->i_uid;
+ gid = d_inode(dentry->d_parent)->i_gid;
+
+ /* This is used as the default ownership of the files and directories */
+ ei->attr.uid = uid;
+ ei->attr.gid = gid;
+
+ INIT_LIST_HEAD(&ei->children);
+ INIT_LIST_HEAD(&ei->list);
ti = get_tracefs(inode);
ti->flags |= TRACEFS_EVENT_INODE | TRACEFS_EVENT_TOP_INODE;
ti->private = ei;
inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ inode->i_uid = uid;
+ inode->i_gid = gid;
inode->i_op = &eventfs_root_dir_inode_operations;
inode->i_fop = &eventfs_file_operations;
+ dentry->d_fsdata = ei;
+
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
d_instantiate(dentry, inode);
inc_nlink(dentry->d_parent->d_inode);
fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
- return tracefs_end_creating(dentry);
-}
-
-/**
- * eventfs_add_subsystem_dir - add eventfs subsystem_dir to list to create later
- * @name: the name of the file to create.
- * @parent: parent dentry for this dir.
- *
- * This function adds eventfs subsystem dir to list.
- * And all these dirs are created on the fly when they are looked up,
- * and the dentry and inodes will be removed when they are done.
- */
-struct eventfs_file *eventfs_add_subsystem_dir(const char *name,
- struct dentry *parent)
-{
- struct tracefs_inode *ti_parent;
- struct eventfs_inode *ei_parent;
- struct eventfs_file *ef;
+ tracefs_end_creating(dentry);
- if (security_locked_down(LOCKDOWN_TRACEFS))
- return NULL;
+ return ei;
- if (!parent)
- return ERR_PTR(-EINVAL);
-
- ti_parent = get_tracefs(parent->d_inode);
- ei_parent = ti_parent->private;
-
- ef = eventfs_prepare_ef(name, S_IFDIR, NULL, NULL, NULL);
- if (IS_ERR(ef))
- return ef;
-
- mutex_lock(&eventfs_mutex);
- list_add_tail(&ef->list, &ei_parent->e_top_files);
- ef->d_parent = parent;
- mutex_unlock(&eventfs_mutex);
- return ef;
+ fail:
+ kfree(ei->d_children);
+ kfree(ei);
+ fail_ei:
+ tracefs_failed_creating(dentry);
+ return ERR_PTR(-ENOMEM);
}
-/**
- * eventfs_add_dir - add eventfs dir to list to create later
- * @name: the name of the file to create.
- * @ef_parent: parent eventfs_file for this dir.
- *
- * This function adds eventfs dir to list.
- * And all these dirs are created on the fly when they are looked up,
- * and the dentry and inodes will be removed when they are done.
- */
-struct eventfs_file *eventfs_add_dir(const char *name,
- struct eventfs_file *ef_parent)
-{
- struct eventfs_file *ef;
-
- if (security_locked_down(LOCKDOWN_TRACEFS))
- return NULL;
-
- if (!ef_parent)
- return ERR_PTR(-EINVAL);
-
- ef = eventfs_prepare_ef(name, S_IFDIR, NULL, NULL, NULL);
- if (IS_ERR(ef))
- return ef;
+static LLIST_HEAD(free_list);
- mutex_lock(&eventfs_mutex);
- list_add_tail(&ef->list, &ef_parent->ei->e_top_files);
- ef->d_parent = ef_parent->dentry;
- mutex_unlock(&eventfs_mutex);
- return ef;
-}
-
-/**
- * eventfs_add_events_file - add the data needed to create a file for later reference
- * @name: the name of the file to create.
- * @mode: the permission that the file should have.
- * @parent: parent dentry for this file.
- * @data: something that the caller will want to get to later on.
- * @fop: struct file_operations that should be used for this file.
- *
- * This function is used to add the information needed to create a
- * dentry/inode within the top level events directory. The file created
- * will have the @mode permissions. The @data will be used to fill the
- * inode.i_private when the open() call is done. The dentry and inodes are
- * all created when they are referenced, and removed when they are no
- * longer referenced.
- */
-int eventfs_add_events_file(const char *name, umode_t mode,
- struct dentry *parent, void *data,
- const struct file_operations *fop)
+static void eventfs_workfn(struct work_struct *work)
{
- struct tracefs_inode *ti;
- struct eventfs_inode *ei;
- struct eventfs_file *ef;
-
- if (security_locked_down(LOCKDOWN_TRACEFS))
- return -ENODEV;
-
- if (!parent)
- return -EINVAL;
-
- if (!(mode & S_IFMT))
- mode |= S_IFREG;
-
- if (!parent->d_inode)
- return -EINVAL;
-
- ti = get_tracefs(parent->d_inode);
- if (!(ti->flags & TRACEFS_EVENT_INODE))
- return -EINVAL;
-
- ei = ti->private;
- ef = eventfs_prepare_ef(name, mode, fop, NULL, data);
-
- if (IS_ERR(ef))
- return -ENOMEM;
-
- mutex_lock(&eventfs_mutex);
- list_add_tail(&ef->list, &ei->e_top_files);
- ef->d_parent = parent;
- mutex_unlock(&eventfs_mutex);
- return 0;
+ struct eventfs_inode *ei, *tmp;
+ struct llist_node *llnode;
+
+ llnode = llist_del_all(&free_list);
+ llist_for_each_entry_safe(ei, tmp, llnode, llist) {
+ /* This dput() matches the dget() from unhook_dentry() */
+ for (int i = 0; i < ei->nr_entries; i++) {
+ if (ei->d_children[i])
+ dput(ei->d_children[i]);
+ }
+ /* We should only get here if the ei had a dentry */
+ if (!WARN_ON_ONCE(!ei->dentry))
+ dput(ei->dentry);
+ }
}
-/**
- * eventfs_add_file - add eventfs file to list to create later
- * @name: the name of the file to create.
- * @mode: the permission that the file should have.
- * @ef_parent: parent eventfs_file for this file.
- * @data: something that the caller will want to get to later on.
- * @fop: struct file_operations that should be used for this file.
- *
- * This function is used to add the information needed to create a
- * file within a subdirectory of the events directory. The file created
- * will have the @mode permissions. The @data will be used to fill the
- * inode.i_private when the open() call is done. The dentry and inodes are
- * all created when they are referenced, and removed when they are no
- * longer referenced.
- */
-int eventfs_add_file(const char *name, umode_t mode,
- struct eventfs_file *ef_parent,
- void *data,
- const struct file_operations *fop)
-{
- struct eventfs_file *ef;
-
- if (security_locked_down(LOCKDOWN_TRACEFS))
- return -ENODEV;
+static DECLARE_WORK(eventfs_work, eventfs_workfn);
- if (!ef_parent)
- return -EINVAL;
+static void free_rcu_ei(struct rcu_head *head)
+{
+ struct eventfs_inode *ei = container_of(head, struct eventfs_inode, rcu);
- if (!(mode & S_IFMT))
- mode |= S_IFREG;
+ if (ei->dentry) {
+ /* Do not free the ei until all references to the dentry are gone */
+ if (llist_add(&ei->llist, &free_list))
+ queue_work(system_unbound_wq, &eventfs_work);
+ return;
+ }
- ef = eventfs_prepare_ef(name, mode, fop, NULL, data);
- if (IS_ERR(ef))
- return -ENOMEM;
+ /* If the ei doesn't have a dentry, neither should its children */
+ for (int i = 0; i < ei->nr_entries; i++) {
+ WARN_ON_ONCE(ei->d_children[i]);
+ }
- mutex_lock(&eventfs_mutex);
- list_add_tail(&ef->list, &ef_parent->ei->e_top_files);
- ef->d_parent = ef_parent->dentry;
- mutex_unlock(&eventfs_mutex);
- return 0;
+ free_ei(ei);
}
-static void free_ef(struct rcu_head *head)
+static void unhook_dentry(struct dentry *dentry)
{
- struct eventfs_file *ef = container_of(head, struct eventfs_file, rcu);
+ if (!dentry)
+ return;
+ /*
+ * Need to add a reference to the dentry that is expected by
+ * simple_recursive_removal(), which will include a dput().
+ */
+ dget(dentry);
- kfree(ef->name);
- kfree(ef->ei);
- kfree(ef);
+ /*
+ * Also add a reference for the dput() in eventfs_workfn().
+ * That is required as that dput() will free the ei after
+ * the SRCU grace period is over.
+ */
+ dget(dentry);
}
/**
* eventfs_remove_rec - remove eventfs dir or file from list
- * @ef: eventfs_file to be removed.
- * @head: to create list of eventfs_file to be deleted
- * @level: to check recursion depth
+ * @ei: eventfs_inode to be removed.
+ * @level: prevent recursion from going more than 3 levels deep.
*
- * The helper function eventfs_remove_rec() is used to clean up and free the
- * associated data from eventfs for both of the added functions.
+ * This function recursively removes eventfs_inodes which
+ * contain information about files and/or directories.
*/
-static void eventfs_remove_rec(struct eventfs_file *ef, struct list_head *head, int level)
+static void eventfs_remove_rec(struct eventfs_inode *ei, int level)
{
- struct eventfs_file *ef_child;
+ struct eventfs_inode *ei_child;
- if (!ef)
+ if (!ei)
return;
/*
* Check recursion depth. It should never be greater than 3:
@@ -806,100 +1087,76 @@ static void eventfs_remove_rec(struct eventfs_file *ef, struct list_head *head,
if (WARN_ON_ONCE(level > 3))
return;
- if (ef->ei) {
- /* search for nested folders or files */
- list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list,
- lockdep_is_held(&eventfs_mutex)) {
- eventfs_remove_rec(ef_child, head, level + 1);
+ /* search for nested folders or files */
+ list_for_each_entry_srcu(ei_child, &ei->children, list,
+ lockdep_is_held(&eventfs_mutex)) {
+ /* Children only have dentry if parent does */
+ WARN_ON_ONCE(ei_child->dentry && !ei->dentry);
+ eventfs_remove_rec(ei_child, level + 1);
+ }
+
+ ei->is_freed = 1;
+
+ for (int i = 0; i < ei->nr_entries; i++) {
+ if (ei->d_children[i]) {
+ /* Children only have dentry if parent does */
+ WARN_ON_ONCE(!ei->dentry);
+ unhook_dentry(ei->d_children[i]);
}
}
- list_del_rcu(&ef->list);
- list_add_tail(&ef->del_list, head);
+ unhook_dentry(ei->dentry);
+
+ list_del_rcu(&ei->list);
+ call_srcu(&eventfs_srcu, &ei->rcu, free_rcu_ei);
}
/**
- * eventfs_remove - remove eventfs dir or file from list
- * @ef: eventfs_file to be removed.
+ * eventfs_remove_dir - remove eventfs dir or file from list
+ * @ei: eventfs_inode to be removed.
*
 * This function acquires the eventfs_mutex lock and calls eventfs_remove_rec()
*/
-void eventfs_remove(struct eventfs_file *ef)
+void eventfs_remove_dir(struct eventfs_inode *ei)
{
- struct eventfs_file *tmp;
- LIST_HEAD(ef_del_list);
- struct dentry *dentry_list = NULL;
struct dentry *dentry;
- if (!ef)
+ if (!ei)
return;
mutex_lock(&eventfs_mutex);
- eventfs_remove_rec(ef, &ef_del_list, 0);
- list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) {
- if (ef->dentry) {
- unsigned long ptr = (unsigned long)dentry_list;
-
- /* Keep the dentry from being freed yet */
- dget(ef->dentry);
-
- /*
- * Paranoid: The dget() above should prevent the dentry
- * from being freed and calling eventfs_set_ef_status_free().
- * But just in case, set the link list LSB pointer to 1
- * and have eventfs_set_ef_status_free() check that to
- * make sure that if it does happen, it will not think
- * the d_fsdata is an event_file.
- *
- * For this to work, no event_file should be allocated
- * on a odd space, as the ef should always be allocated
- * to be at least word aligned. Check for that too.
- */
- WARN_ON_ONCE(ptr & 1);
-
- ef->dentry->d_fsdata = (void *)(ptr | 1);
- dentry_list = ef->dentry;
- ef->dentry = NULL;
- }
- call_srcu(&eventfs_srcu, &ef->rcu, free_ef);
- }
+ dentry = ei->dentry;
+ eventfs_remove_rec(ei, 0);
mutex_unlock(&eventfs_mutex);
- while (dentry_list) {
- unsigned long ptr;
-
- dentry = dentry_list;
- ptr = (unsigned long)dentry->d_fsdata & ~1UL;
- dentry_list = (struct dentry *)ptr;
- dentry->d_fsdata = NULL;
- d_invalidate(dentry);
- mutex_lock(&eventfs_mutex);
- /* dentry should now have at least a single reference */
- WARN_ONCE((int)d_count(dentry) < 1,
- "dentry %p less than one reference (%d) after invalidate\n",
- dentry, d_count(dentry));
- mutex_unlock(&eventfs_mutex);
- dput(dentry);
- }
+ /*
+ * If any of the ei children has a dentry, then the ei itself
+ * must have a dentry.
+ */
+ if (dentry)
+ simple_recursive_removal(dentry, NULL);
}
/**
- * eventfs_remove_events_dir - remove eventfs dir or file from list
- * @dentry: events's dentry to be removed.
+ * eventfs_remove_events_dir - remove the top level eventfs directory
+ * @ei: the eventfs_inode returned by eventfs_create_events_dir().
*
- * This function remove events main directory
+ * This function removes the events main directory
*/
-void eventfs_remove_events_dir(struct dentry *dentry)
+void eventfs_remove_events_dir(struct eventfs_inode *ei)
{
- struct tracefs_inode *ti;
-
- if (!dentry || !dentry->d_inode)
- return;
+ struct dentry *dentry;
- ti = get_tracefs(dentry->d_inode);
- if (!ti || !(ti->flags & TRACEFS_EVENT_INODE))
- return;
+ dentry = ei->dentry;
+ eventfs_remove_dir(ei);
- d_invalidate(dentry);
+ /*
+ * Matches the dget() done by tracefs_start_creating()
+ * in eventfs_create_events_dir() when the dentry was
+ * created. In other words, it's a normal dentry that
+ * sticks around while the other ei->dentry are created
+ * and destroyed dynamically.
+ */
dput(dentry);
}
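Putting the new API together: a subsystem now creates the events directory once, describes its files with an eventfs_entry table, and tears everything down with a single call. A hedged usage sketch follows; apart from the eventfs_* functions and struct eventfs_entry introduced by this series, every identifier (including the my_events_callback sketched earlier) is invented for illustration:

static const struct eventfs_entry my_entries[] = {
	{ .name = "enable", .callback = my_events_callback },
};

static struct eventfs_inode *my_events;

static int my_subsystem_init(struct dentry *tracefs_parent, void *data)
{
	my_events = eventfs_create_events_dir("events", tracefs_parent,
					      my_entries,
					      ARRAY_SIZE(my_entries), data);
	if (IS_ERR_OR_NULL(my_events))
		return -ENOMEM;
	return 0;
}

static void my_subsystem_exit(void)
{
	/* Recursively removes the directory and its children. */
	eventfs_remove_events_dir(my_events);
}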
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 891653ba9cf3..bc86ffdb103b 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -152,7 +152,7 @@ struct inode *tracefs_get_inode(struct super_block *sb)
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
return inode;
}
@@ -210,14 +210,24 @@ repeat:
next = this_parent->d_subdirs.next;
resume:
while (next != &this_parent->d_subdirs) {
+ struct tracefs_inode *ti;
struct list_head *tmp = next;
struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
next = tmp->next;
+ /* Note, getdents() can add a cursor dentry with no inode */
+ if (!dentry->d_inode)
+ continue;
+
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
change_gid(dentry, gid);
+ /* If this is the events directory, update that too */
+ ti = get_tracefs(dentry->d_inode);
+ if (ti && (ti->flags & TRACEFS_EVENT_INODE))
+ eventfs_update_gid(dentry, gid);
+
if (!list_empty(&dentry->d_subdirs)) {
spin_unlock(&this_parent->d_lock);
spin_release(&dentry->d_lock.dep_map, _RET_IP_);
@@ -385,7 +395,7 @@ static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode)
ti = get_tracefs(inode);
if (ti && ti->flags & TRACEFS_EVENT_INODE)
- eventfs_set_ef_status_free(ti, dentry);
+ eventfs_set_ei_status_free(ti, dentry);
iput(inode);
}
@@ -509,20 +519,15 @@ struct dentry *eventfs_start_creating(const char *name, struct dentry *parent)
struct dentry *dentry;
int error;
+ /* Must always have a parent. */
+ if (WARN_ON_ONCE(!parent))
+ return ERR_PTR(-EINVAL);
+
error = simple_pin_fs(&trace_fs_type, &tracefs_mount,
&tracefs_mount_count);
if (error)
return ERR_PTR(error);
- /*
- * If the parent is not specified, we create it in the root.
- * We need the root dentry to do this, which is in the super
- * block. A pointer to that is in the struct vfsmount that we
- * have around.
- */
- if (!parent)
- parent = tracefs_mount->mnt_root;
-
if (unlikely(IS_DEADDIR(parent->d_inode)))
dentry = ERR_PTR(-ENOENT);
else
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index 4f2e49e2197b..42bdeb471a07 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -13,6 +13,59 @@ struct tracefs_inode {
struct inode vfs_inode;
};
+/*
+ * struct eventfs_attr - cache the mode and ownership of an eventfs entry
+ * @mode: saved mode plus flags of what is saved
+ * @uid: saved uid if changed
+ * @gid: saved gid if changed
+ */
+struct eventfs_attr {
+ int mode;
+ kuid_t uid;
+ kgid_t gid;
+};
+
+/*
+ * struct eventfs_inode - hold the properties of the eventfs directories.
+ * @list: link list into the parent directory
+ * @entries: the array of entries representing the files in the directory
+ * @name: the name of the directory to create
+ * @children: link list into the child eventfs_inode
+ * @dentry: the dentry of the directory
+ * @d_parent: pointer to the parent's dentry
+ * @d_children: The array of dentries to represent the files when created
+ * @entry_attrs: Saved mode and ownership of the @d_children
+ * @attr: Saved mode and ownership of eventfs_inode itself
+ * @data: The private data to pass to the callbacks
+ * @is_freed: Flag set if the eventfs is on its way to be freed
+ * Note if is_freed is set, then dentry is corrupted.
+ * @nr_entries: The number of items in @entries
+ */
+struct eventfs_inode {
+ struct list_head list;
+ const struct eventfs_entry *entries;
+ const char *name;
+ struct list_head children;
+ struct dentry *dentry; /* Check is_freed to access */
+ struct dentry *d_parent;
+ struct dentry **d_children;
+ struct eventfs_attr *entry_attrs;
+ struct eventfs_attr attr;
+ void *data;
+ /*
+ * Union - used for deletion
+ * @llist: for calling dput() if needed after RCU
+ * @rcu: eventfs_inode to delete in RCU
+ */
+ union {
+ struct llist_node llist;
+ struct rcu_head rcu;
+ };
+ unsigned int is_freed:1;
+ unsigned int is_events:1;
+ unsigned int nr_entries:30;
+};
+
static inline struct tracefs_inode *get_tracefs(const struct inode *inode)
{
return container_of(inode, struct tracefs_inode, vfs_inode);
@@ -25,6 +78,7 @@ struct inode *tracefs_get_inode(struct super_block *sb);
struct dentry *eventfs_start_creating(const char *name, struct dentry *parent);
struct dentry *eventfs_failed_creating(struct dentry *dentry);
struct dentry *eventfs_end_creating(struct dentry *dentry);
-void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry);
+void eventfs_update_gid(struct dentry *dentry, kgid_t gid);
+void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry);
#endif /* _TRACEFS_INTERNAL_H */
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c
index e564d5ff8781..0d561ecb6869 100644
--- a/fs/ubifs/auth.c
+++ b/fs/ubifs/auth.c
@@ -9,10 +9,9 @@
* This file implements various helper functions for UBIFS authentication support
*/
-#include <linux/crypto.h>
#include <linux/verification.h>
#include <crypto/hash.h>
-#include <crypto/algapi.h>
+#include <crypto/utils.h>
#include <keys/user-type.h>
#include <keys/asymmetric-type.h>
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 3125e76376ee..921f9033d0d2 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -88,8 +88,7 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
}
const struct fscrypt_operations ubifs_crypt_operations = {
- .flags = FS_CFLG_OWN_PAGES,
- .key_prefix = "ubifs:",
+ .legacy_key_prefix = "ubifs:",
.get_context = ubifs_crypt_get_context,
.set_context = ubifs_crypt_set_context,
.empty_dir = ubifs_crypt_empty_dir,
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index eef9e527d9ff..d013c5b3f1ed 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -237,14 +237,14 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
pr_err("\tuid %u\n", (unsigned int)i_uid_read(inode));
pr_err("\tgid %u\n", (unsigned int)i_gid_read(inode));
pr_err("\tatime %u.%u\n",
- (unsigned int)inode->i_atime.tv_sec,
- (unsigned int)inode->i_atime.tv_nsec);
+ (unsigned int) inode_get_atime_sec(inode),
+ (unsigned int) inode_get_atime_nsec(inode));
pr_err("\tmtime %u.%u\n",
- (unsigned int)inode->i_mtime.tv_sec,
- (unsigned int)inode->i_mtime.tv_nsec);
+ (unsigned int) inode_get_mtime_sec(inode),
+ (unsigned int) inode_get_mtime_nsec(inode));
pr_err("\tctime %u.%u\n",
- (unsigned int) inode_get_ctime(inode).tv_sec,
- (unsigned int) inode_get_ctime(inode).tv_nsec);
+ (unsigned int) inode_get_ctime_sec(inode),
+ (unsigned int) inode_get_ctime_nsec(inode));
pr_err("\tcreat_sqnum %llu\n", ui->creat_sqnum);
pr_err("\txattr_size %u\n", ui->xattr_size);
pr_err("\txattr_cnt %u\n", ui->xattr_cnt);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 2f48c58d47cd..3b13c648d490 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -96,7 +96,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
inode->i_flags |= S_NOCMTIME;
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mapping->nrpages = 0;
if (!is_xattr) {
@@ -324,7 +324,8 @@ static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir,
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
@@ -724,7 +725,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
struct inode *inode = d_inode(old_dentry);
struct ubifs_inode *ui = ubifs_inode(inode);
struct ubifs_inode *dir_ui = ubifs_inode(dir);
- int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+ int err, sz_change;
struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2,
.dirtied_ino_d = ALIGN(ui->data_len, 8) };
struct fscrypt_name nm;
@@ -748,6 +749,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
if (err)
return err;
+ sz_change = CALC_DENT_SIZE(fname_len(&nm));
+
err = dbg_check_synced_i_size(c, inode);
if (err)
goto out_fname;
@@ -767,7 +770,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
inode_set_ctime_current(inode);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
@@ -841,7 +845,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
drop_nlink(inode);
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
@@ -944,7 +949,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
drop_nlink(dir);
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
@@ -1018,7 +1024,8 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
inc_nlink(dir);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err) {
ubifs_err(c, "cannot create directory, error %d", err);
@@ -1109,7 +1116,8 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir,
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
@@ -1209,7 +1217,8 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir,
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index e5382f0b2587..2d2b39f843ce 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1088,9 +1088,9 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr)
if (attr->ia_valid & ATTR_GID)
inode->i_gid = attr->ia_gid;
if (attr->ia_valid & ATTR_ATIME)
- inode->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(inode, attr->ia_atime);
if (attr->ia_valid & ATTR_MTIME)
- inode->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
if (attr->ia_valid & ATTR_CTIME)
inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (attr->ia_valid & ATTR_MODE) {
@@ -1192,7 +1192,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
mutex_lock(&ui->ui_mutex);
ui->ui_size = inode->i_size;
/* Truncation changes inode [mc]time */
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
/* Other attributes may be changed at the same time as well */
do_attr_changes(inode, attr);
err = ubifs_jnl_truncate(c, inode, old_size, new_size);
@@ -1239,7 +1239,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
mutex_lock(&ui->ui_mutex);
if (attr->ia_valid & ATTR_SIZE) {
/* Truncation changes inode [mc]time */
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
/* 'truncate_setsize()' changed @i_size, update @ui_size */
ui->ui_size = inode->i_size;
}
@@ -1365,9 +1365,9 @@ static inline int mctime_update_needed(const struct inode *inode,
const struct timespec64 *now)
{
struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 mtime = inode_get_mtime(inode);
- if (!timespec64_equal(&inode->i_mtime, now) ||
- !timespec64_equal(&ctime, now))
+ if (!timespec64_equal(&mtime, now) || !timespec64_equal(&ctime, now))
return 1;
return 0;
}
@@ -1375,6 +1375,9 @@ static inline int mctime_update_needed(const struct inode *inode,
/**
* ubifs_update_time - update time of inode.
* @inode: inode to update
+ * @time: timespec structure holding the current time value
+ * @flags: control flags that determine which time fields of
+ * @inode to update
*
* This function updates time of the inode.
*/
@@ -1429,7 +1432,7 @@ static int update_mctime(struct inode *inode)
return err;
mutex_lock(&ui->ui_mutex);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
@@ -1567,7 +1570,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
struct ubifs_inode *ui = ubifs_inode(inode);
mutex_lock(&ui->ui_mutex);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
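The ubifs, udf and ufs hunks above and below repeat one mechanical conversion: direct loads and stores of inode->i_atime/i_mtime become calls to the timespec accessor helpers. A condensed illustration of the pattern, using only helpers that appear in this patch and invented function names:

/* Before: fields written and read directly. */
static void touch_mtime_old(struct inode *inode)
{
	inode->i_mtime = inode_set_ctime_current(inode);
}

/* After: accessor helpers, as used throughout this patch. */
static void touch_mtime_new(struct inode *inode)
{
	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
}

static time64_t sum_times_new(struct inode *inode)
{
	return inode_get_atime_sec(inode) + inode_get_mtime_sec(inode);
}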
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index ffc9beee7be6..f0a5538c84b0 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -452,12 +452,12 @@ static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino,
ino->ch.node_type = UBIFS_INO_NODE;
ino_key_init_flash(c, &ino->key, inode->i_ino);
ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum);
- ino->atime_sec = cpu_to_le64(inode->i_atime.tv_sec);
- ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
- ino->ctime_sec = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- ino->ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
- ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec);
- ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ino->atime_sec = cpu_to_le64(inode_get_atime_sec(inode));
+ ino->atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
+ ino->ctime_sec = cpu_to_le64(inode_get_ctime_sec(inode));
+ ino->ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+ ino->mtime_sec = cpu_to_le64(inode_get_mtime_sec(inode));
+ ino->mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
ino->uid = cpu_to_le32(i_uid_read(inode));
ino->gid = cpu_to_le32(i_gid_read(inode));
ino->mode = cpu_to_le32(inode->i_mode);
@@ -1607,6 +1607,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
ubifs_err(c, "bad data node (block %u, inode %lu)",
blk, inode->i_ino);
ubifs_dump_node(c, dn, dn_size);
+ err = -EUCLEAN;
goto out_free;
}
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 4211e4456b1e..c59d47fe7939 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -23,7 +23,6 @@
#include "ubifs.h"
#include <linux/list_sort.h>
#include <crypto/hash.h>
-#include <crypto/algapi.h>
/**
* struct replay_entry - replay list entry.
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index b08fb28d16b5..09e270d6ed02 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -54,11 +54,7 @@ module_param_cb(default_version, &ubifs_default_version_ops, &ubifs_default_vers
static struct kmem_cache *ubifs_inode_slab;
/* UBIFS TNC shrinker description */
-static struct shrinker ubifs_shrinker_info = {
- .scan_objects = ubifs_shrink_scan,
- .count_objects = ubifs_shrink_count,
- .seeks = DEFAULT_SEEKS,
-};
+static struct shrinker *ubifs_shrinker_info;
/**
* validate_inode - validate inode.
@@ -142,10 +138,10 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
set_nlink(inode, le32_to_cpu(ino->nlink));
i_uid_write(inode, le32_to_cpu(ino->uid));
i_gid_write(inode, le32_to_cpu(ino->gid));
- inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec);
- inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec);
- inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec);
- inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec);
+ inode_set_atime(inode, (int64_t)le64_to_cpu(ino->atime_sec),
+ le32_to_cpu(ino->atime_nsec));
+ inode_set_mtime(inode, (int64_t)le64_to_cpu(ino->mtime_sec),
+ le32_to_cpu(ino->mtime_nsec));
inode_set_ctime(inode, (int64_t)le64_to_cpu(ino->ctime_sec),
le32_to_cpu(ino->ctime_nsec));
inode->i_mode = le32_to_cpu(ino->mode);
@@ -923,8 +919,10 @@ static void free_buds(struct ubifs_info *c)
{
struct ubifs_bud *bud, *n;
- rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb)
+ rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb) {
+ kfree(bud->log_hash);
kfree(bud);
+ }
}
/**
@@ -1193,6 +1191,7 @@ static void destroy_journal(struct ubifs_info *c)
bud = list_entry(c->old_buds.next, struct ubifs_bud, list);
list_del(&bud->list);
+ kfree(bud->log_hash);
kfree(bud);
}
ubifs_destroy_idx_gc(c);
@@ -2373,7 +2372,7 @@ static void inode_slab_ctor(void *obj)
static int __init ubifs_init(void)
{
- int err;
+ int err = -ENOMEM;
BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24);
@@ -2439,10 +2438,15 @@ static int __init ubifs_init(void)
if (!ubifs_inode_slab)
return -ENOMEM;
- err = register_shrinker(&ubifs_shrinker_info, "ubifs-slab");
- if (err)
+ ubifs_shrinker_info = shrinker_alloc(0, "ubifs-slab");
+ if (!ubifs_shrinker_info)
goto out_slab;
+ ubifs_shrinker_info->count_objects = ubifs_shrink_count;
+ ubifs_shrinker_info->scan_objects = ubifs_shrink_scan;
+
+ shrinker_register(ubifs_shrinker_info);
+
err = ubifs_compressors_init();
if (err)
goto out_shrinker;
@@ -2467,7 +2471,7 @@ out_dbg:
dbg_debugfs_exit();
ubifs_compressors_exit();
out_shrinker:
- unregister_shrinker(&ubifs_shrinker_info);
+ shrinker_free(ubifs_shrinker_info);
out_slab:
kmem_cache_destroy(ubifs_inode_slab);
return err;
@@ -2483,7 +2487,7 @@ static void __exit ubifs_exit(void)
dbg_debugfs_exit();
ubifs_sysfs_exit();
ubifs_compressors_exit();
- unregister_shrinker(&ubifs_shrinker_info);
+ shrinker_free(ubifs_shrinker_info);
/*
* Make sure all delayed rcu free inodes are flushed before we
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 6b7d95b65f4b..f4728e65d1bd 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -65,6 +65,7 @@ static void do_insert_old_idx(struct ubifs_info *c,
else {
ubifs_err(c, "old idx added twice!");
kfree(old_idx);
+ return;
}
}
rb_link_node(&old_idx->rb, parent, p);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index ebb3ad6b5e7e..3916dc4f30ca 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -31,7 +31,7 @@
#include <linux/completion.h>
#include <crypto/hash_info.h>
#include <crypto/hash.h>
-#include <crypto/algapi.h>
+#include <crypto/utils.h>
#include <linux/fscrypt.h>
@@ -2043,7 +2043,7 @@ ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf,
size_t size);
#ifdef CONFIG_UBIFS_FS_XATTR
-extern const struct xattr_handler *ubifs_xattr_handlers[];
+extern const struct xattr_handler * const ubifs_xattr_handlers[];
ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size);
void ubifs_evict_xattr_inode(struct ubifs_info *c, ino_t xattr_inum);
int ubifs_purge_xattrs(struct inode *host);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 406c82eab513..0847db521984 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -735,7 +735,7 @@ static const struct xattr_handler ubifs_security_xattr_handler = {
};
#endif
-const struct xattr_handler *ubifs_xattr_handlers[] = {
+const struct xattr_handler * const ubifs_xattr_handlers[] = {
&ubifs_user_xattr_handler,
&ubifs_trusted_xattr_handler,
#ifdef CONFIG_UBIFS_FS_SECURITY
diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index de17a97e8667..415b050b977d 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -471,7 +471,7 @@ struct fileIdentDesc {
uint8_t lengthFileIdent;
struct long_ad icb;
__le16 lengthOfImpUse;
- uint8_t impUse[];
+ /* uint8_t impUse[]; */
/* uint8_t fileIdent[]; */
/* uint8_t padding[]; */
} __packed;
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 6b558cbbeb6b..5f1f969f4134 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -100,8 +100,8 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode)
iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
else
iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
- iinfo->i_crtime = inode->i_mtime;
+ simple_inode_init_ts(inode);
+ iinfo->i_crtime = inode_get_mtime(inode);
if (unlikely(insert_inode_locked(inode) < 0)) {
make_bad_inode(inode);
iput(inode);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index a17a6184cc39..d8493449d4c5 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1296,7 +1296,7 @@ set_size:
goto out_unlock;
}
update_time:
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
if (IS_SYNC(inode))
udf_sync_inode(inode);
else
@@ -1327,7 +1327,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode)
int bs = inode->i_sb->s_blocksize;
int ret = -EIO;
uint32_t uid, gid;
- struct timespec64 ctime;
+ struct timespec64 ts;
reread:
if (iloc->partitionReferenceNum >= sbi->s_partitions) {
@@ -1504,10 +1504,12 @@ reread:
inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
(inode->i_sb->s_blocksize_bits - 9);
- udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime);
- udf_disk_stamp_to_time(&inode->i_mtime, fe->modificationTime);
- udf_disk_stamp_to_time(&ctime, fe->attrTime);
- inode_set_ctime_to_ts(inode, ctime);
+ udf_disk_stamp_to_time(&ts, fe->accessTime);
+ inode_set_atime_to_ts(inode, ts);
+ udf_disk_stamp_to_time(&ts, fe->modificationTime);
+ inode_set_mtime_to_ts(inode, ts);
+ udf_disk_stamp_to_time(&ts, fe->attrTime);
+ inode_set_ctime_to_ts(inode, ts);
iinfo->i_unique = le64_to_cpu(fe->uniqueID);
iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr);
@@ -1519,11 +1521,13 @@ reread:
inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) <<
(inode->i_sb->s_blocksize_bits - 9);
- udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime);
- udf_disk_stamp_to_time(&inode->i_mtime, efe->modificationTime);
+ udf_disk_stamp_to_time(&ts, efe->accessTime);
+ inode_set_atime_to_ts(inode, ts);
+ udf_disk_stamp_to_time(&ts, efe->modificationTime);
+ inode_set_mtime_to_ts(inode, ts);
+ udf_disk_stamp_to_time(&ts, efe->attrTime);
+ inode_set_ctime_to_ts(inode, ts);
udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime);
- udf_disk_stamp_to_time(&ctime, efe->attrTime);
- inode_set_ctime_to_ts(inode, ctime);
iinfo->i_unique = le64_to_cpu(efe->uniqueID);
iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr);
@@ -1798,8 +1802,8 @@ static int udf_update_inode(struct inode *inode, int do_sync)
inode->i_sb->s_blocksize - sizeof(struct fileEntry));
fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded);
- udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
- udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
+ udf_time_to_disk_stamp(&fe->accessTime, inode_get_atime(inode));
+ udf_time_to_disk_stamp(&fe->modificationTime, inode_get_mtime(inode));
udf_time_to_disk_stamp(&fe->attrTime, inode_get_ctime(inode));
memset(&(fe->impIdent), 0, sizeof(struct regid));
strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER);
@@ -1829,12 +1833,14 @@ static int udf_update_inode(struct inode *inode, int do_sync)
cpu_to_le32(inode->i_sb->s_blocksize);
}
- udf_adjust_time(iinfo, inode->i_atime);
- udf_adjust_time(iinfo, inode->i_mtime);
+ udf_adjust_time(iinfo, inode_get_atime(inode));
+ udf_adjust_time(iinfo, inode_get_mtime(inode));
udf_adjust_time(iinfo, inode_get_ctime(inode));
- udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime);
- udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime);
+ udf_time_to_disk_stamp(&efe->accessTime,
+ inode_get_atime(inode));
+ udf_time_to_disk_stamp(&efe->modificationTime,
+ inode_get_mtime(inode));
udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
udf_time_to_disk_stamp(&efe->attrTime, inode_get_ctime(inode));
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index ae55ab8859b6..3508ac484da3 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -365,7 +365,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
*(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse =
cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL);
udf_fiiter_write_fi(&iter, NULL);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
udf_fiiter_release(&iter);
udf_add_fid_counter(dir->i_sb, false, 1);
@@ -471,7 +471,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
udf_fiiter_release(&iter);
udf_add_fid_counter(dir->i_sb, true, 1);
inc_nlink(dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
d_instantiate_new(dentry, inode);
@@ -523,8 +523,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_size = 0;
inode_dec_link_count(dir);
udf_add_fid_counter(dir->i_sb, true, -1);
- dir->i_mtime = inode_set_ctime_to_ts(dir,
- inode_set_ctime_current(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
mark_inode_dirty(dir);
ret = 0;
end_rmdir:
@@ -555,7 +555,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
set_nlink(inode, 1);
}
udf_fiiter_delete_entry(&iter);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
inode_dec_link_count(inode);
udf_add_fid_counter(dir->i_sb, false, -1);
@@ -748,7 +748,7 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
udf_add_fid_counter(dir->i_sb, false, 1);
inode_set_ctime_current(inode);
mark_inode_dirty(inode);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
ihold(inode);
d_instantiate(dentry, inode);
@@ -866,8 +866,8 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir,
udf_add_fid_counter(old_dir->i_sb, S_ISDIR(new_inode->i_mode),
-1);
}
- old_dir->i_mtime = inode_set_ctime_current(old_dir);
- new_dir->i_mtime = inode_set_ctime_current(new_dir);
+ inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
+ inode_set_mtime_to_ts(new_dir, inode_set_ctime_current(new_dir));
mark_inode_dirty(old_dir);
mark_inode_dirty(new_dir);
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 9af6ff7f9747..f9a60bc1abcf 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -86,7 +86,7 @@ struct udf_virtual_data {
struct udf_bitmap {
__u32 s_extPosition;
int s_nr_groups;
- struct buffer_head *s_block_bitmap[];
+ struct buffer_head *s_block_bitmap[] __counted_by(s_nr_groups);
};
struct udf_part_map {
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 2436e3f82147..53c11be2b2c1 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -240,6 +240,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
unsigned int count, sector_t oldb,
sector_t newb, struct page *locked_page)
{
+ struct folio *folio, *locked_folio = page_folio(locked_page);
const unsigned blks_per_page =
1 << (PAGE_SHIFT - inode->i_blkbits);
const unsigned mask = blks_per_page - 1;
@@ -247,42 +248,39 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
pgoff_t index, cur_index, last_index;
unsigned pos, j, lblock;
sector_t end, i;
- struct page *page;
struct buffer_head *head, *bh;
UFSD("ENTER, ino %lu, count %u, oldb %llu, newb %llu\n",
inode->i_ino, count,
(unsigned long long)oldb, (unsigned long long)newb);
- BUG_ON(!locked_page);
- BUG_ON(!PageLocked(locked_page));
+ BUG_ON(!folio_test_locked(locked_folio));
- cur_index = locked_page->index;
+ cur_index = locked_folio->index;
end = count + beg;
last_index = end >> (PAGE_SHIFT - inode->i_blkbits);
for (i = beg; i < end; i = (i | mask) + 1) {
index = i >> (PAGE_SHIFT - inode->i_blkbits);
if (likely(cur_index != index)) {
- page = ufs_get_locked_page(mapping, index);
- if (!page)/* it was truncated */
+ folio = ufs_get_locked_folio(mapping, index);
+ if (!folio) /* it was truncated */
continue;
- if (IS_ERR(page)) {/* or EIO */
+ if (IS_ERR(folio)) {/* or EIO */
ufs_error(inode->i_sb, __func__,
"read of page %llu failed\n",
(unsigned long long)index);
continue;
}
} else
- page = locked_page;
+ folio = locked_folio;
- head = page_buffers(page);
+ head = folio_buffers(folio);
bh = head;
pos = i & mask;
for (j = 0; j < pos; ++j)
bh = bh->b_this_page;
-
if (unlikely(index == last_index))
lblock = end & mask;
else
@@ -313,7 +311,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
} while (bh != head);
if (likely(cur_index != index))
- ufs_put_locked_page(page);
+ ufs_put_locked_folio(folio);
}
UFSD("EXIT\n");
}
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index fd57f03b6c93..27c85d92d1dc 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -107,7 +107,7 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
ufs_commit_chunk(page, pos, len);
ufs_put_page(page);
if (update_times)
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
ufs_handle_dirsync(dir);
}
@@ -397,7 +397,7 @@ got_it:
ufs_set_de_type(sb, de, inode->i_mode);
ufs_commit_chunk(page, pos, rec_len);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
err = ufs_handle_dirsync(dir);
@@ -539,7 +539,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
pde->d_reclen = cpu_to_fs16(sb, to - from);
dir->d_ino = 0;
ufs_commit_chunk(page, pos, to - from);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
err = ufs_handle_dirsync(inode);
out:
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index a1e7bd9d1f98..73531827ecee 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -292,7 +292,7 @@ cg_found:
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_blocks = 0;
inode->i_generation = 0;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
ufsi->i_flags = UFS_I(dir)->i_flags;
ufsi->i_lastfrag = 0;
ufsi->i_shadow = 0;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 21a4779a2de5..ebce93b08281 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -579,13 +579,15 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
i_gid_write(inode, ufs_get_inode_gid(sb, ufs_inode));
inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size);
- inode->i_atime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec);
+ inode_set_atime(inode,
+ (signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec),
+ 0);
inode_set_ctime(inode,
(signed)fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec),
0);
- inode->i_mtime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec);
- inode->i_mtime.tv_nsec = 0;
- inode->i_atime.tv_nsec = 0;
+ inode_set_mtime(inode,
+ (signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec),
+ 0);
inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks);
inode->i_generation = fs32_to_cpu(sb, ufs_inode->ui_gen);
ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags);
@@ -626,12 +628,12 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
i_gid_write(inode, fs32_to_cpu(sb, ufs2_inode->ui_gid));
inode->i_size = fs64_to_cpu(sb, ufs2_inode->ui_size);
- inode->i_atime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_atime);
+ inode_set_atime(inode, fs64_to_cpu(sb, ufs2_inode->ui_atime),
+ fs32_to_cpu(sb, ufs2_inode->ui_atimensec));
inode_set_ctime(inode, fs64_to_cpu(sb, ufs2_inode->ui_ctime),
fs32_to_cpu(sb, ufs2_inode->ui_ctimensec));
- inode->i_mtime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_mtime);
- inode->i_atime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_atimensec);
- inode->i_mtime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_mtimensec);
+ inode_set_mtime(inode, fs64_to_cpu(sb, ufs2_inode->ui_mtime),
+ fs32_to_cpu(sb, ufs2_inode->ui_mtimensec));
inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks);
inode->i_generation = fs32_to_cpu(sb, ufs2_inode->ui_gen);
ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags);
@@ -725,12 +727,14 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode)
ufs_set_inode_gid(sb, ufs_inode, i_gid_read(inode));
ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size);
- ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec);
+ ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb,
+ inode_get_atime_sec(inode));
ufs_inode->ui_atime.tv_usec = 0;
ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb,
- inode_get_ctime(inode).tv_sec);
+ inode_get_ctime_sec(inode));
ufs_inode->ui_ctime.tv_usec = 0;
- ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb, inode->i_mtime.tv_sec);
+ ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb,
+ inode_get_mtime_sec(inode));
ufs_inode->ui_mtime.tv_usec = 0;
ufs_inode->ui_blocks = cpu_to_fs32(sb, inode->i_blocks);
ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags);
@@ -770,13 +774,15 @@ static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode)
ufs_inode->ui_gid = cpu_to_fs32(sb, i_gid_read(inode));
ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size);
- ufs_inode->ui_atime = cpu_to_fs64(sb, inode->i_atime.tv_sec);
- ufs_inode->ui_atimensec = cpu_to_fs32(sb, inode->i_atime.tv_nsec);
- ufs_inode->ui_ctime = cpu_to_fs64(sb, inode_get_ctime(inode).tv_sec);
+ ufs_inode->ui_atime = cpu_to_fs64(sb, inode_get_atime_sec(inode));
+ ufs_inode->ui_atimensec = cpu_to_fs32(sb,
+ inode_get_atime_nsec(inode));
+ ufs_inode->ui_ctime = cpu_to_fs64(sb, inode_get_ctime_sec(inode));
ufs_inode->ui_ctimensec = cpu_to_fs32(sb,
- inode_get_ctime(inode).tv_nsec);
- ufs_inode->ui_mtime = cpu_to_fs64(sb, inode->i_mtime.tv_sec);
- ufs_inode->ui_mtimensec = cpu_to_fs32(sb, inode->i_mtime.tv_nsec);
+ inode_get_ctime_nsec(inode));
+ ufs_inode->ui_mtime = cpu_to_fs64(sb, inode_get_mtime_sec(inode));
+ ufs_inode->ui_mtimensec = cpu_to_fs32(sb,
+ inode_get_mtime_nsec(inode));
ufs_inode->ui_blocks = cpu_to_fs64(sb, inode->i_blocks);
ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags);
@@ -1057,7 +1063,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size)
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
unsigned i, end;
sector_t lastfrag;
- struct page *lastpage;
+ struct folio *folio;
struct buffer_head *bh;
u64 phys64;
@@ -1068,18 +1074,17 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size)
lastfrag--;
- lastpage = ufs_get_locked_page(mapping, lastfrag >>
+ folio = ufs_get_locked_folio(mapping, lastfrag >>
(PAGE_SHIFT - inode->i_blkbits));
- if (IS_ERR(lastpage)) {
- err = -EIO;
- goto out;
- }
-
- end = lastfrag & ((1 << (PAGE_SHIFT - inode->i_blkbits)) - 1);
- bh = page_buffers(lastpage);
- for (i = 0; i < end; ++i)
- bh = bh->b_this_page;
+ if (IS_ERR(folio)) {
+ err = -EIO;
+ goto out;
+ }
+ end = lastfrag & ((1 << (PAGE_SHIFT - inode->i_blkbits)) - 1);
+ bh = folio_buffers(folio);
+ for (i = 0; i < end; ++i)
+ bh = bh->b_this_page;
err = ufs_getfrag_block(inode, lastfrag, bh, 1);
@@ -1095,7 +1100,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size)
*/
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
- set_page_dirty(lastpage);
+ folio_mark_dirty(folio);
}
if (lastfrag >= UFS_IND_FRAGMENT) {
@@ -1113,7 +1118,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size)
}
}
out_unlock:
- ufs_put_locked_page(lastpage);
+ ufs_put_locked_folio(folio);
out:
return err;
}
@@ -1208,7 +1213,7 @@ static int ufs_truncate(struct inode *inode, loff_t size)
truncate_setsize(inode, size);
ufs_truncate_blocks(inode);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
out:
UFSD("EXIT: err %d\n", err);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 23377c1baed9..a480810cd4e3 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -137,6 +137,7 @@ static struct dentry *ufs_get_parent(struct dentry *child)
}
static const struct export_operations ufs_export_ops = {
+ .encode_fh = generic_encode_ino32_fh,
.fh_to_dentry = ufs_fh_to_dentry,
.fh_to_parent = ufs_fh_to_parent,
.get_parent = ufs_get_parent,
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 08ddf41eaaad..2acf191eb89e 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -230,42 +230,40 @@ ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev
}
/**
- * ufs_get_locked_page() - locate, pin and lock a pagecache page, if not exist
+ * ufs_get_locked_folio() - locate, pin and lock a pagecache folio; if it does not exist,
* read it from disk.
* @mapping: the address_space to search
* @index: the page index
*
- * Locates the desired pagecache page, if not exist we'll read it,
+ * Locates the desired pagecache folio; if it does not exist, reads it,
* locks it, increments its reference
* count and returns its address.
*
*/
-
-struct page *ufs_get_locked_page(struct address_space *mapping,
+struct folio *ufs_get_locked_folio(struct address_space *mapping,
pgoff_t index)
{
struct inode *inode = mapping->host;
- struct page *page = find_lock_page(mapping, index);
- if (!page) {
- page = read_mapping_page(mapping, index, NULL);
+ struct folio *folio = filemap_lock_folio(mapping, index);
+ if (IS_ERR(folio)) {
+ folio = read_mapping_folio(mapping, index, NULL);
- if (IS_ERR(page)) {
- printk(KERN_ERR "ufs_change_blocknr: "
- "read_mapping_page error: ino %lu, index: %lu\n",
+ if (IS_ERR(folio)) {
+ printk(KERN_ERR "ufs_change_blocknr: read_mapping_folio error: ino %lu, index: %lu\n",
mapping->host->i_ino, index);
- return page;
+ return folio;
}
- lock_page(page);
+ folio_lock(folio);
- if (unlikely(page->mapping == NULL)) {
+ if (unlikely(folio->mapping == NULL)) {
/* Truncate got there first */
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return NULL;
}
}
- if (!page_has_buffers(page))
- create_empty_buffers(page, 1 << inode->i_blkbits, 0);
- return page;
+ if (!folio_buffers(folio))
+ create_empty_buffers(folio, 1 << inode->i_blkbits, 0);
+ return folio;
}
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 89247193d96d..0ecd2ed792f5 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -273,15 +273,13 @@ extern void _ubh_ubhcpymem_(struct ufs_sb_private_info *, unsigned char *, struc
extern void _ubh_memcpyubh_(struct ufs_sb_private_info *, struct ufs_buffer_head *, unsigned char *, unsigned);
/* These functions work with cache pages */
-extern struct page *ufs_get_locked_page(struct address_space *mapping,
- pgoff_t index);
-static inline void ufs_put_locked_page(struct page *page)
+struct folio *ufs_get_locked_folio(struct address_space *mapping, pgoff_t index);
+static inline void ufs_put_locked_folio(struct folio *folio)
{
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
-
/*
* macros and inline function to get important structures from ufs_sb_private_info
*/
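
The ufs hunks above convert the locked-page helper to folios without changing the calling convention: look the folio up or read it in, walk its buffer ring, then unlock and drop the reference via ufs_put_locked_folio(). The fragment below is a minimal caller sketch, not part of the patch; ufs_demo_dirty_frag() is a hypothetical name, and it assumes the fs/ufs context (util.h plus the usual buffer-head headers) so that the helpers introduced in this diff are in scope.

/* Illustrative only: dirty the buffer that maps @frag (hypothetical helper). */
static int ufs_demo_dirty_frag(struct inode *inode, unsigned int frag)
{
	struct address_space *mapping = inode->i_mapping;
	struct folio *folio;
	struct buffer_head *bh;
	unsigned int i, end;

	folio = ufs_get_locked_folio(mapping,
			frag >> (PAGE_SHIFT - inode->i_blkbits));
	if (IS_ERR(folio))
		return -EIO;
	if (!folio)		/* raced with truncate, nothing to do */
		return 0;

	/* Walk the buffer ring to the fragment inside this folio. */
	end = frag & ((1 << (PAGE_SHIFT - inode->i_blkbits)) - 1);
	bh = folio_buffers(folio);
	for (i = 0; i < end; ++i)
		bh = bh->b_this_page;

	mark_buffer_dirty(bh);
	folio_mark_dirty(folio);

	ufs_put_locked_folio(folio);	/* folio_unlock() + folio_put() */
	return 0;
}
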
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 56eaae9dac1a..e8af40b05549 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -49,7 +49,7 @@ static struct ctl_table vm_userfaultfd_table[] = {
};
#endif
-static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
+static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
/*
* Start with fault_pending_wqh and fault_wqh so they're more likely
@@ -123,6 +123,11 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
return ctx->features & UFFD_FEATURE_INITIALIZED;
}
+static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
+{
+ return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
+}
+
/*
* Whether WP_UNPOPULATED is enabled on the uffd context. It is only
* meaningful when userfaultfd_wp()==true on the vma and when it's
@@ -922,20 +927,15 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
continue;
}
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end,
- new_flags, vma->anon_vma,
- vma->vm_file, vma->vm_pgoff,
- vma_policy(vma),
- NULL_VM_UFFD_CTX, anon_vma_name(vma));
- if (prev) {
- vma = prev;
- } else {
- prev = vma;
- }
+ vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
+ vma->vm_end, new_flags,
+ NULL_VM_UFFD_CTX);
vma_start_write(vma);
userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+
+ prev = vma;
}
mmap_write_unlock(mm);
mmput(mm);
@@ -1325,7 +1325,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
bool basic_ioctls;
unsigned long start, end, vma_end;
struct vma_iterator vmi;
- pgoff_t pgoff;
+ bool wp_async = userfaultfd_wp_async_ctx(ctx);
user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1399,7 +1399,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
/* check not compatible vmas */
ret = -EINVAL;
- if (!vma_can_userfault(cur, vm_flags))
+ if (!vma_can_userfault(cur, vm_flags, wp_async))
goto out_unlock;
/*
@@ -1460,7 +1460,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
for_each_vma_range(vmi, vma, end) {
cond_resched();
- BUG_ON(!vma_can_userfault(vma, vm_flags));
+ BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
vma->vm_userfaultfd_ctx.ctx != ctx);
WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
@@ -1478,28 +1478,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vma_end = min(end, vma->vm_end);
new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
- pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
- vma->anon_vma, vma->vm_file, pgoff,
- vma_policy(vma),
- ((struct vm_userfaultfd_ctx){ ctx }),
- anon_vma_name(vma));
- if (prev) {
- /* vma_merge() invalidated the mas */
- vma = prev;
- goto next;
- }
- if (vma->vm_start < start) {
- ret = split_vma(&vmi, vma, start, 1);
- if (ret)
- break;
- }
- if (vma->vm_end > end) {
- ret = split_vma(&vmi, vma, end, 0);
- if (ret)
- break;
+ vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
+ new_flags,
+ (struct vm_userfaultfd_ctx){ctx});
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ break;
}
- next:
+
/*
* In the vma_merge() successful mprotect-like case 8:
* the next vma was merged into the current one and
@@ -1561,7 +1547,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
unsigned long start, end, vma_end;
const void __user *buf = (void __user *)arg;
struct vma_iterator vmi;
- pgoff_t pgoff;
+ bool wp_async = userfaultfd_wp_async_ctx(ctx);
ret = -EFAULT;
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
@@ -1615,7 +1601,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
* provides for more strict behavior to notice
* unregistration errors.
*/
- if (!vma_can_userfault(cur, cur->vm_flags))
+ if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
goto out_unlock;
found = true;
@@ -1631,7 +1617,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
for_each_vma_range(vmi, vma, end) {
cond_resched();
- BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
+ BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));
/*
* Nothing to do: this vma is already registered into this
@@ -1664,26 +1650,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
uffd_wp_range(vma, start, vma_end - start, false);
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
- vma->anon_vma, vma->vm_file, pgoff,
- vma_policy(vma),
- NULL_VM_UFFD_CTX, anon_vma_name(vma));
- if (prev) {
- vma = prev;
- goto next;
- }
- if (vma->vm_start < start) {
- ret = split_vma(&vmi, vma, start, 1);
- if (ret)
- break;
- }
- if (vma->vm_end > end) {
- ret = split_vma(&vmi, vma, end, 0);
- if (ret)
- break;
+ vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
+ new_flags, NULL_VM_UFFD_CTX);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ break;
}
- next:
+
/*
* In the vma_merge() successful mprotect-like case 8:
* the next vma was merged into the current one and
@@ -2018,6 +1991,11 @@ out:
return ret;
}
+bool userfaultfd_wp_async(struct vm_area_struct *vma)
+{
+ return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx);
+}
+
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
/*
@@ -2051,6 +2029,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
ret = -EPERM;
if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
goto err_out;
+
+ /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
+ if (features & UFFD_FEATURE_WP_ASYNC)
+ features |= UFFD_FEATURE_WP_UNPOPULATED;
+
/* report all available features and ioctls to userland */
uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
@@ -2063,6 +2046,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
#ifndef CONFIG_PTE_MARKER_UFFD_WP
uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
+ uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
#endif
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
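
On the userfaultfd side, UFFD_FEATURE_WP_ASYNC is negotiated through the normal UFFDIO_API handshake; as the hunk above shows, the kernel folds in UFFD_FEATURE_WP_UNPOPULATED automatically and hides the bit when CONFIG_PTE_MARKER_UFFD_WP is not built. A hedged userspace sketch of that handshake follows; open_uffd_wp_async() is a made-up helper, it assumes uapi headers new enough to define UFFD_FEATURE_WP_ASYNC, and it trims most error reporting.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int open_uffd_wp_async(void)
{
	struct uffdio_api api = {
		.api = UFFD_API,
		.features = UFFD_FEATURE_WP_ASYNC,	/* kernel also enables WP_UNPOPULATED */
	};
	int fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	if (fd < 0)
		return -1;
	if (ioctl(fd, UFFDIO_API, &api) < 0) {
		close(fd);
		return -1;
	}
	/* On return, api.features echoes what this kernel actually supports. */
	if (!(api.features & UFFD_FEATURE_WP_ASYNC))
		fprintf(stderr, "WP_ASYNC not available on this kernel\n");
	return fd;
}
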
diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c
index 83f20dd15522..72ac9320e6a3 100644
--- a/fs/vboxsf/utils.c
+++ b/fs/vboxsf/utils.c
@@ -126,12 +126,12 @@ int vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
do_div(allocated, 512);
inode->i_blocks = allocated;
- inode->i_atime = ns_to_timespec64(
- info->access_time.ns_relative_to_unix_epoch);
+ inode_set_atime_to_ts(inode,
+ ns_to_timespec64(info->access_time.ns_relative_to_unix_epoch));
inode_set_ctime_to_ts(inode,
ns_to_timespec64(info->change_time.ns_relative_to_unix_epoch));
- inode->i_mtime = ns_to_timespec64(
- info->modification_time.ns_relative_to_unix_epoch);
+ inode_set_mtime_to_ts(inode,
+ ns_to_timespec64(info->modification_time.ns_relative_to_unix_epoch));
return 0;
}
@@ -194,7 +194,7 @@ int vboxsf_inode_revalidate(struct dentry *dentry)
struct vboxsf_sbi *sbi;
struct vboxsf_inode *sf_i;
struct shfl_fsobjinfo info;
- struct timespec64 prev_mtime;
+ struct timespec64 mtime, prev_mtime;
struct inode *inode;
int err;
@@ -202,7 +202,7 @@ int vboxsf_inode_revalidate(struct dentry *dentry)
return -EINVAL;
inode = d_inode(dentry);
- prev_mtime = inode->i_mtime;
+ prev_mtime = inode_get_mtime(inode);
sf_i = VBOXSF_I(inode);
sbi = VBOXSF_SBI(dentry->d_sb);
if (!sf_i->force_restat) {
@@ -225,7 +225,8 @@ int vboxsf_inode_revalidate(struct dentry *dentry)
* page-cache for it. Note this also gets triggered by our own writes,
* this is unavoidable.
*/
- if (timespec64_compare(&inode->i_mtime, &prev_mtime) > 0)
+ mtime = inode_get_mtime(inode);
+ if (timespec64_compare(&mtime, &prev_mtime) > 0)
invalidate_inode_pages2(inode->i_mapping);
return 0;
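
The vboxsf hunks are part of the same tree-wide conversion seen in the ufs and xfs pieces of this diff: direct reads and writes of inode->i_atime/i_mtime become accessor calls so the VFS is free to repack the timestamp fields. A small sketch of the resulting pattern; demo_mtime_changed() is a hypothetical helper, shown only to illustrate the accessors used above, and it assumes the usual in-kernel fs headers.

/* Sketch: detect a server-side mtime change using the accessor API. */
static bool demo_mtime_changed(struct inode *inode, struct timespec64 new_mtime)
{
	struct timespec64 old_mtime = inode_get_mtime(inode);

	inode_set_mtime_to_ts(inode, new_mtime);
	inode_set_ctime_current(inode);

	return timespec64_compare(&new_mtime, &old_mtime) > 0;
}
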
diff --git a/fs/xattr.c b/fs/xattr.c
index efd4736bc94b..09d927603433 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -56,7 +56,7 @@ strcmp_prefix(const char *a, const char *a_prefix)
static const struct xattr_handler *
xattr_resolve_name(struct inode *inode, const char **name)
{
- const struct xattr_handler **handlers = inode->i_sb->s_xattr;
+ const struct xattr_handler * const *handlers = inode->i_sb->s_xattr;
const struct xattr_handler *handler;
if (!(inode->i_opflags & IOP_XATTR)) {
@@ -162,7 +162,7 @@ xattr_permission(struct mnt_idmap *idmap, struct inode *inode,
int
xattr_supports_user_prefix(struct inode *inode)
{
- const struct xattr_handler **handlers = inode->i_sb->s_xattr;
+ const struct xattr_handler * const *handlers = inode->i_sb->s_xattr;
const struct xattr_handler *handler;
if (!(inode->i_opflags & IOP_XATTR)) {
@@ -999,7 +999,7 @@ int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name)
ssize_t
generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
- const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
+ const struct xattr_handler *handler, * const *handlers = dentry->d_sb->s_xattr;
ssize_t remaining_size = buffer_size;
int err = 0;
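
The fs/xattr.c change is purely about const-correctness: super_block->s_xattr now points at an array of const handler pointers, so it has to be traversed through a "const struct xattr_handler * const *" cursor rather than a plain double pointer. The standalone example below (ordinary userspace C, not kernel code, with a toy handler struct) shows why the extra const on the inner pointer matters.

#include <stdio.h>

struct handler { const char *prefix; };

static const struct handler user_handler = { "user." };

/* A const table of const pointers, like a filesystem's s_xattr array. */
static const struct handler *const handlers[] = { &user_handler, NULL };

int main(void)
{
	/* Correct cursor type: neither the pointers nor the table may change. */
	const struct handler *const *h;

	for (h = handlers; *h; h++)
		printf("%s\n", (*h)->prefix);

	/*
	 * "const struct handler **h2 = handlers;" would warn: that type
	 * would permit h2[0] = ..., silently discarding the table's const.
	 */
	return 0;
}
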
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index ed0bc8cbc703..567fb37274d3 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -147,7 +147,7 @@ config XFS_ONLINE_SCRUB_STATS
bool "XFS online metadata check usage data collection"
default y
depends on XFS_ONLINE_SCRUB
- select XFS_DEBUG
+ select DEBUG_FS
help
If you say Y here, the kernel will gather usage data about
the online metadata check subsystem. This includes the number
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 3069194527dd..100ab5931b31 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2275,16 +2275,37 @@ xfs_alloc_min_freelist(
ASSERT(mp->m_alloc_maxlevels > 0);
+ /*
+ * For a btree shorter than the maximum height, the worst case is that
+ * every level gets split and a new level is added, then while inserting
+ * another entry to refill the AGFL, every level under the old root gets
+ * split again. This is:
+ *
+ * (full height split reservation) + (AGFL refill split height)
+ * = (current height + 1) + (current height - 1)
+ * = (new height) + (new height - 2)
+ * = 2 * new height - 2
+ *
+ * For a btree of maximum height, the worst case is that every level
+ * under the root gets split, then while inserting another entry to
+ * refill the AGFL, every level under the root gets split again. This is
+ * also:
+ *
+ * 2 * (current height - 1)
+ * = 2 * (new height - 1)
+ * = 2 * new height - 2
+ */
+
/* space needed by-bno freespace btree */
min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
- mp->m_alloc_maxlevels);
+ mp->m_alloc_maxlevels) * 2 - 2;
/* space needed by-size freespace btree */
min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
- mp->m_alloc_maxlevels);
+ mp->m_alloc_maxlevels) * 2 - 2;
/* space needed reverse mapping used space btree */
if (xfs_has_rmapbt(mp))
min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
- mp->m_rmap_maxlevels);
+ mp->m_rmap_maxlevels) * 2 - 2;
return min_free;
}
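
The new comment in xfs_alloc_min_freelist() argues that both the growing-tree and max-height cases collapse to the same reservation, 2 * new_height - 2 blocks per free-space btree, which is exactly what the min_t(...) * 2 - 2 expressions compute. A standalone arithmetic check with made-up heights (not kernel code):

#include <assert.h>
#include <stdio.h>

/* Worst-case freelist blocks for one btree, per the comment above. */
static unsigned int min_free_per_btree(unsigned int cur_height,
				       unsigned int max_height)
{
	/* Growing tree: full split plus new root, then AGFL refill splits. */
	if (cur_height < max_height)
		return (cur_height + 1) + (cur_height - 1);
	/* Tree already at max height: root cannot split, only levels below. */
	return 2 * (cur_height - 1);
}

int main(void)
{
	/* Example: current height 3, max height 5 -> new height 4. */
	assert(min_free_per_btree(3, 5) == 2 * 4 - 2);	/* 6 */
	/* Example: already at max height 5 -> new height stays 5. */
	assert(min_free_per_btree(5, 5) == 2 * 5 - 2);	/* 8 */
	printf("both cases equal 2 * new_height - 2\n");
	return 0;
}
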
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 30c931b38853..be62acffad6c 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -21,7 +21,7 @@
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
@@ -2989,7 +2989,7 @@ xfs_bmap_extsize_align(
* If realtime, and the result isn't a multiple of the realtime
* extent size we need to remove blocks until it is.
*/
- if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) {
+ if (rt && (temp = xfs_extlen_to_rtxmod(mp, align_alen))) {
/*
* We're not covering the original request, or
* we won't be able to once we fix the length.
@@ -3016,7 +3016,7 @@ xfs_bmap_extsize_align(
else {
align_alen -= orig_off - align_off;
align_off = orig_off;
- align_alen -= align_alen % mp->m_sb.sb_rextsize;
+ align_alen -= xfs_extlen_to_rtxmod(mp, align_alen);
}
/*
* Result doesn't cover the request, fail it.
@@ -4826,12 +4826,8 @@ xfs_bmap_del_extent_delay(
ASSERT(got->br_startoff <= del->br_startoff);
ASSERT(got_endoff >= del_endoff);
- if (isrt) {
- uint64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount);
-
- do_div(rtexts, mp->m_sb.sb_rextsize);
- xfs_mod_frextents(mp, rtexts);
- }
+ if (isrt)
+ xfs_mod_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount));
/*
* Update the inode delalloc counter now and wait to update the
@@ -5057,33 +5053,20 @@ xfs_bmap_del_extent_real(
flags = XFS_ILOG_CORE;
if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
- xfs_filblks_t len;
- xfs_extlen_t mod;
-
- len = div_u64_rem(del->br_blockcount, mp->m_sb.sb_rextsize,
- &mod);
- ASSERT(mod == 0);
-
if (!(bflags & XFS_BMAPI_REMAP)) {
- xfs_fsblock_t bno;
-
- bno = div_u64_rem(del->br_startblock,
- mp->m_sb.sb_rextsize, &mod);
- ASSERT(mod == 0);
-
- error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
+ error = xfs_rtfree_blocks(tp, del->br_startblock,
+ del->br_blockcount);
if (error)
goto done;
}
do_fx = 0;
- nblks = len * mp->m_sb.sb_rextsize;
qfield = XFS_TRANS_DQ_RTBCOUNT;
} else {
do_fx = 1;
- nblks = del->br_blockcount;
qfield = XFS_TRANS_DQ_BCOUNT;
}
+ nblks = del->br_blockcount;
del_endblock = del->br_startblock + del->br_blockcount;
if (cur) {
@@ -5289,7 +5272,6 @@ __xfs_bunmapi(
int tmp_logflags; /* partial logging flags */
int wasdel; /* was a delayed alloc extent */
int whichfork; /* data or attribute fork */
- xfs_fsblock_t sum;
xfs_filblks_t len = *rlen; /* length to unmap in file */
xfs_fileoff_t end;
struct xfs_iext_cursor icur;
@@ -5384,8 +5366,8 @@ __xfs_bunmapi(
if (!isrt)
goto delete;
- sum = del.br_startblock + del.br_blockcount;
- div_u64_rem(sum, mp->m_sb.sb_rextsize, &mod);
+ mod = xfs_rtb_to_rtxoff(mp,
+ del.br_startblock + del.br_blockcount);
if (mod) {
/*
* Realtime extent not lined up at the end.
@@ -5432,7 +5414,8 @@ __xfs_bunmapi(
goto error0;
goto nodelete;
}
- div_u64_rem(del.br_startblock, mp->m_sb.sb_rextsize, &mod);
+
+ mod = xfs_rtb_to_rtxoff(mp, del.br_startblock);
if (mod) {
xfs_extlen_t off = mp->m_sb.sb_rextsize - mod;
@@ -6209,8 +6192,8 @@ xfs_bmap_validate_extent(
return __this_address;
if (XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK) {
- if (!xfs_verify_rtext(mp, irec->br_startblock,
- irec->br_blockcount))
+ if (!xfs_verify_rtbext(mp, irec->br_startblock,
+ irec->br_blockcount))
return __this_address;
} else {
if (!xfs_verify_fsbext(mp, irec->br_startblock,
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index bcfb6a4203cd..f71679ce23b9 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -245,21 +245,18 @@ xfs_defer_create_intents(
return ret;
}
-/* Abort all the intents that were committed. */
STATIC void
-xfs_defer_trans_abort(
- struct xfs_trans *tp,
- struct list_head *dop_pending)
+xfs_defer_pending_abort(
+ struct xfs_mount *mp,
+ struct list_head *dop_list)
{
struct xfs_defer_pending *dfp;
const struct xfs_defer_op_type *ops;
- trace_xfs_defer_trans_abort(tp, _RET_IP_);
-
/* Abort intent items that don't have a done item. */
- list_for_each_entry(dfp, dop_pending, dfp_list) {
+ list_for_each_entry(dfp, dop_list, dfp_list) {
ops = defer_op_types[dfp->dfp_type];
- trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
+ trace_xfs_defer_pending_abort(mp, dfp);
if (dfp->dfp_intent && !dfp->dfp_done) {
ops->abort_intent(dfp->dfp_intent);
dfp->dfp_intent = NULL;
@@ -267,6 +264,16 @@ xfs_defer_trans_abort(
}
}
+/* Abort all the intents that were committed. */
+STATIC void
+xfs_defer_trans_abort(
+ struct xfs_trans *tp,
+ struct list_head *dop_pending)
+{
+ trace_xfs_defer_trans_abort(tp, _RET_IP_);
+ xfs_defer_pending_abort(tp->t_mountp, dop_pending);
+}
+
/*
* Capture resources that the caller said not to release ("held") when the
* transaction commits. Caller is responsible for zero-initializing @dres.
@@ -756,12 +763,13 @@ xfs_defer_ops_capture(
/* Release all resources that we used to capture deferred ops. */
void
-xfs_defer_ops_capture_free(
+xfs_defer_ops_capture_abort(
struct xfs_mount *mp,
struct xfs_defer_capture *dfc)
{
unsigned short i;
+ xfs_defer_pending_abort(mp, &dfc->dfc_dfops);
xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
@@ -802,7 +810,7 @@ xfs_defer_ops_capture_and_commit(
/* Commit the transaction and add the capture structure to the list. */
error = xfs_trans_commit(tp);
if (error) {
- xfs_defer_ops_capture_free(mp, dfc);
+ xfs_defer_ops_capture_abort(mp, dfc);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 114a3a4930a3..8788ad5f6a73 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -121,7 +121,7 @@ int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
struct list_head *capture_list);
void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp,
struct xfs_defer_resources *dres);
-void xfs_defer_ops_capture_free(struct xfs_mount *mp,
+void xfs_defer_ops_capture_abort(struct xfs_mount *mp,
struct xfs_defer_capture *d);
void xfs_defer_resources_rele(struct xfs_defer_resources *dres);
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 371dc07233e0..9a88aba1589f 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -98,7 +98,7 @@ typedef struct xfs_sb {
uint32_t sb_blocksize; /* logical block size, bytes */
xfs_rfsblock_t sb_dblocks; /* number of data blocks */
xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */
- xfs_rtblock_t sb_rextents; /* number of realtime extents */
+ xfs_rtbxlen_t sb_rextents; /* number of realtime extents */
uuid_t sb_uuid; /* user-visible file system unique id */
xfs_fsblock_t sb_logstart; /* starting block of log if internal */
xfs_ino_t sb_rootino; /* root inode number */
@@ -691,6 +691,22 @@ struct xfs_agfl {
xfs_daddr_to_agno(mp, (d) + (len) - 1)))
/*
+ * Realtime bitmap information is accessed by the word, which is currently
+ * stored in host-endian format.
+ */
+union xfs_rtword_raw {
+ __u32 old;
+};
+
+/*
+ * Realtime summary counts are accessed by the word, which is currently
+ * stored in host-endian format.
+ */
+union xfs_suminfo_raw {
+ __u32 old;
+};
+
+/*
* XFS Timestamps
* ==============
*
@@ -1142,24 +1158,10 @@ static inline bool xfs_dinode_has_large_extent_counts(
#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize)
#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask)
-#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize)
-#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask)
/*
- * RT Summary and bit manipulation macros.
+ * RT bit manipulation macros.
*/
-#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
-#define XFS_SUMOFFSTOBLOCK(mp,s) \
- (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
-#define XFS_SUMPTR(mp,bp,so) \
- ((xfs_suminfo_t *)((bp)->b_addr + \
- (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
-
-#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log)
-#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log)
-#define XFS_BITTOWORD(mp,bi) \
- ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
-
#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b))
#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b))
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index a35781577cad..137a65bda95d 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -220,8 +220,10 @@ xfs_inode_from_disk(
* a time before epoch is converted to a time long after epoch
* on 64 bit systems.
*/
- inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime);
- inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime);
+ inode_set_atime_to_ts(inode,
+ xfs_inode_from_disk_ts(from, from->di_atime));
+ inode_set_mtime_to_ts(inode,
+ xfs_inode_from_disk_ts(from, from->di_mtime));
inode_set_ctime_to_ts(inode,
xfs_inode_from_disk_ts(from, from->di_ctime));
@@ -315,8 +317,8 @@ xfs_inode_to_disk(
to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff);
to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16);
- to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime);
- to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime);
+ to->di_atime = xfs_inode_to_disk_ts(ip, inode_get_atime(inode));
+ to->di_mtime = xfs_inode_to_disk_ts(ip, inode_get_mtime(inode));
to->di_ctime = xfs_inode_to_disk_ts(ip, inode_get_ctime(inode));
to->di_nlink = cpu_to_be32(inode->i_nlink);
to->di_gen = cpu_to_be32(inode->i_generation);
@@ -508,6 +510,9 @@ xfs_dinode_verify(
if (mode && nextents + naextents > nblocks)
return __this_address;
+ if (nextents + naextents == 0 && nblocks != 0)
+ return __this_address;
+
if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents)
return __this_address;
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index fa180ab66b73..c269d704314d 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -16,6 +16,7 @@
#include "xfs_trans.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
+#include "xfs_rtbitmap.h"
/*
* Realtime allocator bitmap functions shared with userspace.
@@ -46,25 +47,69 @@ const struct xfs_buf_ops xfs_rtbuf_ops = {
.verify_write = xfs_rtbuf_verify_write,
};
+/* Release cached rt bitmap and summary buffers. */
+void
+xfs_rtbuf_cache_relse(
+ struct xfs_rtalloc_args *args)
+{
+ if (args->rbmbp) {
+ xfs_trans_brelse(args->tp, args->rbmbp);
+ args->rbmbp = NULL;
+ args->rbmoff = NULLFILEOFF;
+ }
+ if (args->sumbp) {
+ xfs_trans_brelse(args->tp, args->sumbp);
+ args->sumbp = NULL;
+ args->sumoff = NULLFILEOFF;
+ }
+}
+
/*
* Get a buffer for the bitmap or summary file block specified.
* The buffer is returned read and locked.
*/
int
xfs_rtbuf_get(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t block, /* block number in bitmap or summary */
- int issum, /* is summary not bitmap */
- struct xfs_buf **bpp) /* output: buffer for the block */
+ struct xfs_rtalloc_args *args,
+ xfs_fileoff_t block, /* block number in bitmap or summary */
+ int issum) /* is summary not bitmap */
{
- struct xfs_buf *bp; /* block buffer, result */
- xfs_inode_t *ip; /* bitmap or summary inode */
- xfs_bmbt_irec_t map;
- int nmap = 1;
- int error; /* error value */
+ struct xfs_mount *mp = args->mp;
+ struct xfs_buf **cbpp; /* cached block buffer */
+ xfs_fileoff_t *coffp; /* cached block number */
+ struct xfs_buf *bp; /* block buffer, result */
+ struct xfs_inode *ip; /* bitmap or summary inode */
+ struct xfs_bmbt_irec map;
+ enum xfs_blft type;
+ int nmap = 1;
+ int error;
- ip = issum ? mp->m_rsumip : mp->m_rbmip;
+ if (issum) {
+ cbpp = &args->sumbp;
+ coffp = &args->sumoff;
+ ip = mp->m_rsumip;
+ type = XFS_BLFT_RTSUMMARY_BUF;
+ } else {
+ cbpp = &args->rbmbp;
+ coffp = &args->rbmoff;
+ ip = mp->m_rbmip;
+ type = XFS_BLFT_RTBITMAP_BUF;
+ }
+
+ /*
+ * If we have a cached buffer, and the block number matches, use that.
+ */
+ if (*cbpp && *coffp == block)
+ return 0;
+
+ /*
+ * Otherwise we have to get the buffer. If there was an old
+ * one, get rid of it first.
+ */
+ if (*cbpp) {
+ xfs_trans_brelse(args->tp, *cbpp);
+ *cbpp = NULL;
+ }
error = xfs_bmapi_read(ip, block, 1, &map, &nmap, 0);
if (error)
@@ -74,15 +119,15 @@ xfs_rtbuf_get(
return -EFSCORRUPTED;
ASSERT(map.br_startblock != NULLFSBLOCK);
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ error = xfs_trans_read_buf(mp, args->tp, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map.br_startblock),
mp->m_bsize, 0, &bp, &xfs_rtbuf_ops);
if (error)
return error;
- xfs_trans_buf_set_type(tp, bp, issum ? XFS_BLFT_RTSUMMARY_BUF
- : XFS_BLFT_RTBITMAP_BUF);
- *bpp = bp;
+ xfs_trans_buf_set_type(args->tp, bp, type);
+ *cbpp = bp;
+ *coffp = block;
return 0;
}
@@ -92,47 +137,44 @@ xfs_rtbuf_get(
*/
int
xfs_rtfind_back(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to look at */
- xfs_rtblock_t limit, /* last block to look at */
- xfs_rtblock_t *rtblock) /* out: start block found */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext to look at */
+ xfs_rtxnum_t limit, /* last rtext to look at */
+ xfs_rtxnum_t *rtx) /* out: start rtext found */
{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- struct xfs_buf *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtblock_t firstbit; /* first useful bit in the word */
- xfs_rtblock_t i; /* current bit number rel. to start */
- xfs_rtblock_t len; /* length of inspected area */
- xfs_rtword_t mask; /* mask of relevant bits for value */
- xfs_rtword_t want; /* mask for "good" values */
- xfs_rtword_t wdiff; /* difference from wanted value */
- int word; /* word number in the buffer */
+ struct xfs_mount *mp = args->mp;
+ int bit; /* bit number in the word */
+ xfs_fileoff_t block; /* bitmap block number */
+ int error; /* error value */
+ xfs_rtxnum_t firstbit; /* first useful bit in the word */
+ xfs_rtxnum_t i; /* current bit number rel. to start */
+ xfs_rtxnum_t len; /* length of inspected area */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t want; /* mask for "good" values */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ xfs_rtword_t incore;
+ unsigned int word; /* word number in the buffer */
/*
* Compute and read in starting bitmap block for starting block.
*/
- block = XFS_BITTOBLOCK(mp, start);
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
+ block = xfs_rtx_to_rbmblock(mp, start);
+ error = xfs_rtbitmap_read_buf(args, block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
+
/*
* Get the first word's index & point to it.
*/
- word = XFS_BITTOWORD(mp, start);
- b = &bufp[word];
+ word = xfs_rtx_to_rbmword(mp, start);
bit = (int)(start & (XFS_NBWORD - 1));
len = start - limit + 1;
/*
* Compute match value, based on the bit at start: if 1 (free)
* then all-ones, else all-zeroes.
*/
- want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+ incore = xfs_rtbitmap_getword(args, word);
+ want = (incore & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
/*
* If the starting position is not word-aligned, deal with the
* partial word.
@@ -149,13 +191,12 @@ xfs_rtfind_back(
* Calculate the difference between the value there
* and what we're looking for.
*/
- if ((wdiff = (*b ^ want) & mask)) {
+ if ((wdiff = (incore ^ want) & mask)) {
/*
* Different. Mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
i = bit - XFS_RTHIBIT(wdiff);
- *rtblock = start - i + 1;
+ *rtx = start - i + 1;
return 0;
}
i = bit - firstbit + 1;
@@ -167,19 +208,11 @@ xfs_rtfind_back(
/*
* If done with this block, get the previous one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, --block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
- word = XFS_BLOCKWMASK(mp);
- b = &bufp[word];
- } else {
- /*
- * Go on to the previous word in the buffer.
- */
- b--;
+
+ word = mp->m_blockwsize - 1;
}
} else {
/*
@@ -195,13 +228,13 @@ xfs_rtfind_back(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = *b ^ want)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = incore ^ want)) {
/*
* Different, mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
- *rtblock = start - i + 1;
+ *rtx = start - i + 1;
return 0;
}
i += XFS_NBWORD;
@@ -213,19 +246,11 @@ xfs_rtfind_back(
/*
* If done with this block, get the previous one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, --block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
- word = XFS_BLOCKWMASK(mp);
- b = &bufp[word];
- } else {
- /*
- * Go on to the previous word in the buffer.
- */
- b--;
+
+ word = mp->m_blockwsize - 1;
}
}
/*
@@ -242,13 +267,13 @@ xfs_rtfind_back(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = (*b ^ want) & mask)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = (incore ^ want) & mask)) {
/*
* Different, mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
- *rtblock = start - i + 1;
+ *rtx = start - i + 1;
return 0;
} else
i = len;
@@ -256,8 +281,7 @@ xfs_rtfind_back(
/*
* No match, return that we scanned the whole area.
*/
- xfs_trans_brelse(tp, bp);
- *rtblock = start - i + 1;
+ *rtx = start - i + 1;
return 0;
}
@@ -267,47 +291,44 @@ xfs_rtfind_back(
*/
int
xfs_rtfind_forw(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to look at */
- xfs_rtblock_t limit, /* last block to look at */
- xfs_rtblock_t *rtblock) /* out: start block found */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext to look at */
+ xfs_rtxnum_t limit, /* last rtext to look at */
+ xfs_rtxnum_t *rtx) /* out: start rtext found */
{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- struct xfs_buf *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtblock_t i; /* current bit number rel. to start */
- xfs_rtblock_t lastbit; /* last useful bit in the word */
- xfs_rtblock_t len; /* length of inspected area */
- xfs_rtword_t mask; /* mask of relevant bits for value */
- xfs_rtword_t want; /* mask for "good" values */
- xfs_rtword_t wdiff; /* difference from wanted value */
- int word; /* word number in the buffer */
+ struct xfs_mount *mp = args->mp;
+ int bit; /* bit number in the word */
+ xfs_fileoff_t block; /* bitmap block number */
+ int error;
+ xfs_rtxnum_t i; /* current bit number rel. to start */
+ xfs_rtxnum_t lastbit;/* last useful bit in the word */
+ xfs_rtxnum_t len; /* length of inspected area */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t want; /* mask for "good" values */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ xfs_rtword_t incore;
+ unsigned int word; /* word number in the buffer */
/*
* Compute and read in starting bitmap block for starting block.
*/
- block = XFS_BITTOBLOCK(mp, start);
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
+ block = xfs_rtx_to_rbmblock(mp, start);
+ error = xfs_rtbitmap_read_buf(args, block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
+
/*
* Get the first word's index & point to it.
*/
- word = XFS_BITTOWORD(mp, start);
- b = &bufp[word];
+ word = xfs_rtx_to_rbmword(mp, start);
bit = (int)(start & (XFS_NBWORD - 1));
len = limit - start + 1;
/*
* Compute match value, based on the bit at start: if 1 (free)
* then all-ones, else all-zeroes.
*/
- want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+ incore = xfs_rtbitmap_getword(args, word);
+ want = (incore & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
/*
* If the starting position is not word-aligned, deal with the
* partial word.
@@ -323,13 +344,12 @@ xfs_rtfind_forw(
* Calculate the difference between the value there
* and what we're looking for.
*/
- if ((wdiff = (*b ^ want) & mask)) {
+ if ((wdiff = (incore ^ want) & mask)) {
/*
* Different. Mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
i = XFS_RTLOBIT(wdiff) - bit;
- *rtblock = start + i - 1;
+ *rtx = start + i - 1;
return 0;
}
i = lastbit - bit;
@@ -337,22 +357,15 @@ xfs_rtfind_forw(
* Go on to next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* If done with this block, get the next one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- b = bufp = bp->b_addr;
+
word = 0;
- } else {
- /*
- * Go on to the previous word in the buffer.
- */
- b++;
}
} else {
/*
@@ -368,13 +381,13 @@ xfs_rtfind_forw(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = *b ^ want)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = incore ^ want)) {
/*
* Different, mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
i += XFS_RTLOBIT(wdiff);
- *rtblock = start + i - 1;
+ *rtx = start + i - 1;
return 0;
}
i += XFS_NBWORD;
@@ -382,22 +395,15 @@ xfs_rtfind_forw(
* Go on to next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* If done with this block, get the next one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- b = bufp = bp->b_addr;
+
word = 0;
- } else {
- /*
- * Go on to the next word in the buffer.
- */
- b++;
}
}
/*
@@ -412,13 +418,13 @@ xfs_rtfind_forw(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = (*b ^ want) & mask)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = (incore ^ want) & mask)) {
/*
* Different, mark where we are and return.
*/
- xfs_trans_brelse(tp, bp);
i += XFS_RTLOBIT(wdiff);
- *rtblock = start + i - 1;
+ *rtx = start + i - 1;
return 0;
} else
i = len;
@@ -426,11 +432,25 @@ xfs_rtfind_forw(
/*
* No match, return that we scanned the whole area.
*/
- xfs_trans_brelse(tp, bp);
- *rtblock = start + i - 1;
+ *rtx = start + i - 1;
return 0;
}
+/* Log rtsummary counter at @infoword. */
+static inline void
+xfs_trans_log_rtsummary(
+ struct xfs_rtalloc_args *args,
+ unsigned int infoword)
+{
+ struct xfs_buf *bp = args->sumbp;
+ size_t first, last;
+
+ first = (void *)xfs_rsumblock_infoptr(args, infoword) - bp->b_addr;
+ last = first + sizeof(xfs_suminfo_t) - 1;
+
+ xfs_trans_log_buf(args->tp, bp, first, last);
+}
+
/*
* Read and/or modify the summary information for a given extent size,
* bitmap block combination.
@@ -442,86 +462,77 @@ xfs_rtfind_forw(
*/
int
xfs_rtmodify_summary_int(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- int log, /* log2 of extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- int delta, /* change to make to summary info */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_suminfo_t *sum) /* out: summary info for this block */
+ struct xfs_rtalloc_args *args,
+ int log, /* log2 of extent size */
+ xfs_fileoff_t bbno, /* bitmap block number */
+ int delta, /* change to make to summary info */
+ xfs_suminfo_t *sum) /* out: summary info for this block */
{
- struct xfs_buf *bp; /* buffer for the summary block */
- int error; /* error value */
- xfs_fsblock_t sb; /* summary fsblock */
- int so; /* index into the summary file */
- xfs_suminfo_t *sp; /* pointer to returned data */
+ struct xfs_mount *mp = args->mp;
+ int error;
+ xfs_fileoff_t sb; /* summary fsblock */
+ xfs_rtsumoff_t so; /* index into the summary file */
+ unsigned int infoword;
/*
* Compute entry number in the summary file.
*/
- so = XFS_SUMOFFS(mp, log, bbno);
+ so = xfs_rtsumoffs(mp, log, bbno);
/*
* Compute the block number in the summary file.
*/
- sb = XFS_SUMOFFSTOBLOCK(mp, so);
- /*
- * If we have an old buffer, and the block number matches, use that.
- */
- if (*rbpp && *rsb == sb)
- bp = *rbpp;
- /*
- * Otherwise we have to get the buffer.
- */
- else {
- /*
- * If there was an old one, get rid of it first.
- */
- if (*rbpp)
- xfs_trans_brelse(tp, *rbpp);
- error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
- if (error) {
- return error;
- }
- /*
- * Remember this buffer and block for the next call.
- */
- *rbpp = bp;
- *rsb = sb;
- }
+ sb = xfs_rtsumoffs_to_block(mp, so);
+
+ error = xfs_rtsummary_read_buf(args, sb);
+ if (error)
+ return error;
+
/*
* Point to the summary information, modify/log it, and/or copy it out.
*/
- sp = XFS_SUMPTR(mp, bp, so);
+ infoword = xfs_rtsumoffs_to_infoword(mp, so);
if (delta) {
- uint first = (uint)((char *)sp - (char *)bp->b_addr);
+ xfs_suminfo_t val = xfs_suminfo_add(args, infoword, delta);
- *sp += delta;
if (mp->m_rsum_cache) {
- if (*sp == 0 && log == mp->m_rsum_cache[bbno])
- mp->m_rsum_cache[bbno]++;
- if (*sp != 0 && log < mp->m_rsum_cache[bbno])
+ if (val == 0 && log + 1 == mp->m_rsum_cache[bbno])
mp->m_rsum_cache[bbno] = log;
+ if (val != 0 && log >= mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno] = log + 1;
}
- xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1);
+ xfs_trans_log_rtsummary(args, infoword);
+ if (sum)
+ *sum = val;
+ } else if (sum) {
+ *sum = xfs_suminfo_get(args, infoword);
}
- if (sum)
- *sum = *sp;
return 0;
}
int
xfs_rtmodify_summary(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- int log, /* log2 of extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- int delta, /* change to make to summary info */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb) /* in/out: summary block number */
+ struct xfs_rtalloc_args *args,
+ int log, /* log2 of extent size */
+ xfs_fileoff_t bbno, /* bitmap block number */
+ int delta) /* change to make to summary info */
{
- return xfs_rtmodify_summary_int(mp, tp, log, bbno,
- delta, rbpp, rsb, NULL);
+ return xfs_rtmodify_summary_int(args, log, bbno, delta, NULL);
+}
+
+/* Log rtbitmap block from the word @from to the byte before @next. */
+static inline void
+xfs_trans_log_rtbitmap(
+ struct xfs_rtalloc_args *args,
+ unsigned int from,
+ unsigned int next)
+{
+ struct xfs_buf *bp = args->rbmbp;
+ size_t first, last;
+
+ first = (void *)xfs_rbmblock_wordptr(args, from) - bp->b_addr;
+ last = ((void *)xfs_rbmblock_wordptr(args, next) - 1) - bp->b_addr;
+
+ xfs_trans_log_buf(args->tp, bp, first, last);
}
/*
@@ -530,41 +541,37 @@ xfs_rtmodify_summary(
*/
int
xfs_rtmodify_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to modify */
- xfs_extlen_t len, /* length of extent to modify */
- int val) /* 1 for free, 0 for allocated */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext to modify */
+ xfs_rtxlen_t len, /* length of extent to modify */
+ int val) /* 1 for free, 0 for allocated */
{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- struct xfs_buf *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtword_t *first; /* first used word in the buffer */
- int i; /* current bit number rel. to start */
- int lastbit; /* last useful bit in word */
- xfs_rtword_t mask; /* mask o frelevant bits for value */
- int word; /* word number in the buffer */
+ struct xfs_mount *mp = args->mp;
+ int bit; /* bit number in the word */
+ xfs_fileoff_t block; /* bitmap block number */
+ int error;
+ int i; /* current bit number rel. to start */
+ int lastbit; /* last useful bit in word */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t incore;
+ unsigned int firstword; /* first word used in the buffer */
+ unsigned int word; /* word number in the buffer */
/*
* Compute starting bitmap block number.
*/
- block = XFS_BITTOBLOCK(mp, start);
+ block = xfs_rtx_to_rbmblock(mp, start);
/*
* Read the bitmap block, and point to its data.
*/
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
+
/*
* Compute the starting word's address, and starting bit.
*/
- word = XFS_BITTOWORD(mp, start);
- first = b = &bufp[word];
+ firstword = word = xfs_rtx_to_rbmword(mp, start);
bit = (int)(start & (XFS_NBWORD - 1));
/*
* 0 (allocated) => all zeroes; 1 (free) => all ones.
@@ -583,34 +590,28 @@ xfs_rtmodify_range(
/*
* Set/clear the active bits.
*/
+ incore = xfs_rtbitmap_getword(args, word);
if (val)
- *b |= mask;
+ incore |= mask;
else
- *b &= ~mask;
+ incore &= ~mask;
+ xfs_rtbitmap_setword(args, word, incore);
i = lastbit - bit;
/*
* Go on to the next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* Log the changed part of this block.
* Get the next one.
*/
- xfs_trans_log_buf(tp, bp,
- (uint)((char *)first - (char *)bufp),
- (uint)((char *)b - (char *)bufp));
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ xfs_trans_log_rtbitmap(args, firstword, word);
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- first = b = bufp = bp->b_addr;
- word = 0;
- } else {
- /*
- * Go on to the next word in the buffer
- */
- b++;
+
+ firstword = word = 0;
}
} else {
/*
@@ -626,31 +627,23 @@ xfs_rtmodify_range(
/*
* Set the word value correctly.
*/
- *b = val;
+ xfs_rtbitmap_setword(args, word, val);
i += XFS_NBWORD;
/*
* Go on to the next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* Log the changed part of this block.
* Get the next one.
*/
- xfs_trans_log_buf(tp, bp,
- (uint)((char *)first - (char *)bufp),
- (uint)((char *)b - (char *)bufp));
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ xfs_trans_log_rtbitmap(args, firstword, word);
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- first = b = bufp = bp->b_addr;
- word = 0;
- } else {
- /*
- * Go on to the next word in the buffer
- */
- b++;
+
+ firstword = word = 0;
}
}
/*
@@ -665,18 +658,19 @@ xfs_rtmodify_range(
/*
* Set/clear the active bits.
*/
+ incore = xfs_rtbitmap_getword(args, word);
if (val)
- *b |= mask;
+ incore |= mask;
else
- *b &= ~mask;
- b++;
+ incore &= ~mask;
+ xfs_rtbitmap_setword(args, word, incore);
+ word++;
}
/*
* Log any remaining changed bytes.
*/
- if (b > first)
- xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
- (uint)((char *)b - (char *)bufp - 1));
+ if (word > firstword)
+ xfs_trans_log_rtbitmap(args, firstword, word);
return 0;
}
@@ -686,23 +680,21 @@ xfs_rtmodify_range(
*/
int
xfs_rtfree_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to free */
- xfs_extlen_t len, /* length to free */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb) /* in/out: summary block number */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext to free */
+ xfs_rtxlen_t len) /* length to free */
{
- xfs_rtblock_t end; /* end of the freed extent */
- int error; /* error value */
- xfs_rtblock_t postblock; /* first block freed > end */
- xfs_rtblock_t preblock; /* first block freed < start */
+ struct xfs_mount *mp = args->mp;
+ xfs_rtxnum_t end; /* end of the freed extent */
+ int error; /* error value */
+ xfs_rtxnum_t postblock; /* first rtext freed > end */
+ xfs_rtxnum_t preblock; /* first rtext freed < start */
end = start + len - 1;
/*
* Modify the bitmap to mark this extent freed.
*/
- error = xfs_rtmodify_range(mp, tp, start, len, 1);
+ error = xfs_rtmodify_range(args, start, len, 1);
if (error) {
return error;
}
@@ -711,15 +703,15 @@ xfs_rtfree_range(
* We need to find the beginning and end of the extent so we can
* properly update the summary.
*/
- error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+ error = xfs_rtfind_back(args, start, 0, &preblock);
if (error) {
return error;
}
/*
* Find the next allocated block (end of allocated extent).
*/
- error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
- &postblock);
+ error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1,
+ &postblock);
if (error)
return error;
/*
@@ -727,9 +719,9 @@ xfs_rtfree_range(
* old extent, add summary data for them to be allocated.
*/
if (preblock < start) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(start - preblock),
- XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+ error = xfs_rtmodify_summary(args,
+ XFS_RTBLOCKLOG(start - preblock),
+ xfs_rtx_to_rbmblock(mp, preblock), -1);
if (error) {
return error;
}
@@ -739,9 +731,9 @@ xfs_rtfree_range(
* old extent, add summary data for them to be allocated.
*/
if (postblock > end) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock - end),
- XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
+ error = xfs_rtmodify_summary(args,
+ XFS_RTBLOCKLOG(postblock - end),
+ xfs_rtx_to_rbmblock(mp, end + 1), -1);
if (error) {
return error;
}
@@ -750,10 +742,9 @@ xfs_rtfree_range(
* Increment the summary information corresponding to the entire
* (new) free extent.
*/
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock + 1 - preblock),
- XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
- return error;
+ return xfs_rtmodify_summary(args,
+ XFS_RTBLOCKLOG(postblock + 1 - preblock),
+ xfs_rtx_to_rbmblock(mp, preblock), 1);
}
/*
@@ -762,43 +753,39 @@ xfs_rtfree_range(
*/
int
xfs_rtcheck_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block number of extent */
- xfs_extlen_t len, /* length of extent */
- int val, /* 1 for free, 0 for allocated */
- xfs_rtblock_t *new, /* out: first block not matching */
- int *stat) /* out: 1 for matches, 0 for not */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext number of extent */
+ xfs_rtxlen_t len, /* length of extent */
+ int val, /* 1 for free, 0 for allocated */
+ xfs_rtxnum_t *new, /* out: first rtext not matching */
+ int *stat) /* out: 1 for matches, 0 for not */
{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- struct xfs_buf *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtblock_t i; /* current bit number rel. to start */
- xfs_rtblock_t lastbit; /* last useful bit in word */
- xfs_rtword_t mask; /* mask of relevant bits for value */
- xfs_rtword_t wdiff; /* difference from wanted value */
- int word; /* word number in the buffer */
+ struct xfs_mount *mp = args->mp;
+ int bit; /* bit number in the word */
+ xfs_fileoff_t block; /* bitmap block number */
+ int error;
+ xfs_rtxnum_t i; /* current bit number rel. to start */
+ xfs_rtxnum_t lastbit; /* last useful bit in word */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ xfs_rtword_t incore;
+ unsigned int word; /* word number in the buffer */
/*
* Compute starting bitmap block number
*/
- block = XFS_BITTOBLOCK(mp, start);
+ block = xfs_rtx_to_rbmblock(mp, start);
/*
* Read the bitmap block.
*/
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, block);
+ if (error)
return error;
- }
- bufp = bp->b_addr;
+
/*
* Compute the starting word's address, and starting bit.
*/
- word = XFS_BITTOWORD(mp, start);
- b = &bufp[word];
+ word = xfs_rtx_to_rbmword(mp, start);
bit = (int)(start & (XFS_NBWORD - 1));
/*
* 0 (allocated) => all zeroes; 1 (free) => all ones.
@@ -820,11 +807,11 @@ xfs_rtcheck_range(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = (*b ^ val) & mask)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = (incore ^ val) & mask)) {
/*
* Different, compute first wrong bit and return.
*/
- xfs_trans_brelse(tp, bp);
i = XFS_RTLOBIT(wdiff) - bit;
*new = start + i;
*stat = 0;
@@ -835,22 +822,15 @@ xfs_rtcheck_range(
* Go on to next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* If done with this block, get the next one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- b = bufp = bp->b_addr;
+
word = 0;
- } else {
- /*
- * Go on to the next word in the buffer.
- */
- b++;
}
} else {
/*
@@ -866,11 +846,11 @@ xfs_rtcheck_range(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = *b ^ val)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = incore ^ val)) {
/*
* Different, compute first wrong bit and return.
*/
- xfs_trans_brelse(tp, bp);
i += XFS_RTLOBIT(wdiff);
*new = start + i;
*stat = 0;
@@ -881,22 +861,15 @@ xfs_rtcheck_range(
* Go on to next block if that's where the next word is
* and we need the next word.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ if (++word == mp->m_blockwsize && i < len) {
/*
* If done with this block, get the next one.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
+ error = xfs_rtbitmap_read_buf(args, ++block);
+ if (error)
return error;
- }
- b = bufp = bp->b_addr;
+
word = 0;
- } else {
- /*
- * Go on to the next word in the buffer.
- */
- b++;
}
}
/*
@@ -911,11 +884,11 @@ xfs_rtcheck_range(
/*
* Compute difference between actual and desired value.
*/
- if ((wdiff = (*b ^ val) & mask)) {
+ incore = xfs_rtbitmap_getword(args, word);
+ if ((wdiff = (incore ^ val) & mask)) {
/*
* Different, compute first wrong bit and return.
*/
- xfs_trans_brelse(tp, bp);
i += XFS_RTLOBIT(wdiff);
*new = start + i;
*stat = 0;
@@ -926,7 +899,6 @@ xfs_rtcheck_range(
/*
* Successful, return.
*/
- xfs_trans_brelse(tp, bp);
*new = start + i;
*stat = 1;
return 0;
@@ -936,57 +908,57 @@ xfs_rtcheck_range(
/*
* Check that the given extent (block range) is allocated already.
*/
-STATIC int /* error */
+STATIC int
xfs_rtcheck_alloc_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number of extent */
- xfs_extlen_t len) /* length of extent */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext number of extent */
+ xfs_rtxlen_t len) /* length of extent */
{
- xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */
- int stat;
- int error;
+ xfs_rtxnum_t new; /* dummy for xfs_rtcheck_range */
+ int stat;
+ int error;
- error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat);
+ error = xfs_rtcheck_range(args, start, len, 0, &new, &stat);
if (error)
return error;
ASSERT(stat);
return 0;
}
#else
-#define xfs_rtcheck_alloc_range(m,t,b,l) (0)
+#define xfs_rtcheck_alloc_range(a,b,l) (0)
#endif
/*
* Free an extent in the realtime subvolume. Length is expressed in
* realtime extents, as is the block number.
*/
-int /* error */
+int
xfs_rtfree_extent(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to free */
- xfs_extlen_t len) /* length of extent freed */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_rtxnum_t start, /* starting rtext number to free */
+ xfs_rtxlen_t len) /* length of extent freed */
{
- int error; /* error value */
- xfs_mount_t *mp; /* file system mount structure */
- xfs_fsblock_t sb; /* summary file block number */
- struct xfs_buf *sumbp = NULL; /* summary file block buffer */
-
- mp = tp->t_mountp;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_rtalloc_args args = {
+ .mp = mp,
+ .tp = tp,
+ };
+ int error;
+ struct timespec64 atime;
ASSERT(mp->m_rbmip->i_itemp != NULL);
ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
- error = xfs_rtcheck_alloc_range(mp, tp, bno, len);
+ error = xfs_rtcheck_alloc_range(&args, start, len);
if (error)
return error;
/*
* Free the range of realtime blocks.
*/
- error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
- if (error) {
- return error;
- }
+ error = xfs_rtfree_range(&args, start, len);
+ if (error)
+ goto out;
+
/*
* Mark more blocks free in the superblock.
*/
@@ -999,10 +971,49 @@ xfs_rtfree_extent(
mp->m_sb.sb_rextents) {
if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM))
mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
- *(uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0;
+
+ atime = inode_get_atime(VFS_I(mp->m_rbmip));
+ atime.tv_sec = 0;
+ inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime);
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
}
- return 0;
+ error = 0;
+out:
+ xfs_rtbuf_cache_relse(&args);
+ return error;
+}
+
+/*
+ * Free some blocks in the realtime subvolume. rtbno and rtlen are in units of
+ * rt blocks, not rt extents; must be aligned to the rt extent size; and rtlen
+ * cannot exceed XFS_MAX_BMBT_EXTLEN.
+ */
+int
+xfs_rtfree_blocks(
+ struct xfs_trans *tp,
+ xfs_fsblock_t rtbno,
+ xfs_filblks_t rtlen)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_rtxnum_t start;
+ xfs_filblks_t len;
+ xfs_extlen_t mod;
+
+ ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN);
+
+ len = xfs_rtb_to_rtxrem(mp, rtlen, &mod);
+ if (mod) {
+ ASSERT(mod == 0);
+ return -EIO;
+ }
+
+ start = xfs_rtb_to_rtxrem(mp, rtbno, &mod);
+ if (mod) {
+ ASSERT(mod == 0);
+ return -EIO;
+ }
+
+ return xfs_rtfree_extent(tp, start, len);
}
/* Find all the free records within a given range. */
@@ -1015,10 +1026,14 @@ xfs_rtalloc_query_range(
xfs_rtalloc_query_range_fn fn,
void *priv)
{
+ struct xfs_rtalloc_args args = {
+ .mp = mp,
+ .tp = tp,
+ };
struct xfs_rtalloc_rec rec;
- xfs_rtblock_t rtstart;
- xfs_rtblock_t rtend;
- xfs_rtblock_t high_key;
+ xfs_rtxnum_t rtstart;
+ xfs_rtxnum_t rtend;
+ xfs_rtxnum_t high_key;
int is_free;
int error = 0;
@@ -1034,13 +1049,13 @@ xfs_rtalloc_query_range(
rtstart = low_rec->ar_startext;
while (rtstart <= high_key) {
/* Is the first block free? */
- error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend,
+ error = xfs_rtcheck_range(&args, rtstart, 1, 1, &rtend,
&is_free);
if (error)
break;
/* How long does the extent go for? */
- error = xfs_rtfind_forw(mp, tp, rtstart, high_key, &rtend);
+ error = xfs_rtfind_forw(&args, rtstart, high_key, &rtend);
if (error)
break;
@@ -1056,6 +1071,7 @@ xfs_rtalloc_query_range(
rtstart = rtend + 1;
}
+ xfs_rtbuf_cache_relse(&args);
return error;
}
@@ -1081,18 +1097,79 @@ int
xfs_rtalloc_extent_is_free(
struct xfs_mount *mp,
struct xfs_trans *tp,
- xfs_rtblock_t start,
- xfs_extlen_t len,
+ xfs_rtxnum_t start,
+ xfs_rtxlen_t len,
bool *is_free)
{
- xfs_rtblock_t end;
+ struct xfs_rtalloc_args args = {
+ .mp = mp,
+ .tp = tp,
+ };
+ xfs_rtxnum_t end;
int matches;
int error;
- error = xfs_rtcheck_range(mp, tp, start, len, 1, &end, &matches);
+ error = xfs_rtcheck_range(&args, start, len, 1, &end, &matches);
+ xfs_rtbuf_cache_relse(&args);
if (error)
return error;
*is_free = matches;
return 0;
}
+
+/*
+ * Compute the number of rtbitmap blocks needed to track the given number of rt
+ * extents.
+ */
+xfs_filblks_t
+xfs_rtbitmap_blockcount(
+ struct xfs_mount *mp,
+ xfs_rtbxlen_t rtextents)
+{
+ return howmany_64(rtextents, NBBY * mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Compute the number of rtbitmap words needed to populate every block of a
+ * bitmap that is large enough to track the given number of rt extents.
+ */
+unsigned long long
+xfs_rtbitmap_wordcount(
+ struct xfs_mount *mp,
+ xfs_rtbxlen_t rtextents)
+{
+ xfs_filblks_t blocks;
+
+ blocks = xfs_rtbitmap_blockcount(mp, rtextents);
+ return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG;
+}
+
+/* Compute the number of rtsummary blocks needed to track the given rt space. */
+xfs_filblks_t
+xfs_rtsummary_blockcount(
+ struct xfs_mount *mp,
+ unsigned int rsumlevels,
+ xfs_extlen_t rbmblocks)
+{
+ unsigned long long rsumwords;
+
+ rsumwords = (unsigned long long)rsumlevels * rbmblocks;
+ return XFS_B_TO_FSB(mp, rsumwords << XFS_WORDLOG);
+}
+
+/*
+ * Compute the number of rtsummary info words needed to populate every block of
+ * a summary file that is large enough to track the given rt space.
+ */
+unsigned long long
+xfs_rtsummary_wordcount(
+ struct xfs_mount *mp,
+ unsigned int rsumlevels,
+ xfs_extlen_t rbmblocks)
+{
+ xfs_filblks_t blocks;
+
+ blocks = xfs_rtsummary_blockcount(mp, rsumlevels, rbmblocks);
+ return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG;
+}
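The four sizing helpers above are plain arithmetic, so a standalone worked example may make them easier to check. The geometry below (4096-byte blocks, one million rt extents, 21 summary levels) is purely hypothetical, and the userspace stand-ins only mirror howmany_64(), XFS_FSB_TO_B() and XFS_B_TO_FSB():

/*
 * Worked example, not part of the patch: redo the rtbitmap/rtsummary
 * sizing math for a made-up geometry.  NBBY is 8 and XFS_WORDLOG is 2,
 * matching the definitions used by the helpers above.
 */
#include <stdio.h>
#include <stdint.h>

#define NBBY		8
#define XFS_WORDLOG	2

static uint64_t howmany64(uint64_t x, uint32_t y) { return (x + y - 1) / y; }

int main(void)
{
	uint32_t blocksize = 4096;	/* hypothetical sb_blocksize */
	uint64_t rextents = 1000000;	/* hypothetical sb_rextents */
	unsigned int rsumlevels = 21;	/* hypothetical m_rsumlevels */

	/* xfs_rtbitmap_blockcount(): one bit of bitmap per rt extent */
	uint64_t rbmblocks = howmany64(rextents, NBBY * blocksize);

	/* xfs_rtbitmap_wordcount(): those blocks expressed in 4-byte words */
	uint64_t rbmwords = (rbmblocks * blocksize) >> XFS_WORDLOG;

	/* xfs_rtsummary_blockcount(): one word per (level, bitmap block) */
	uint64_t rsumwords = (uint64_t)rsumlevels * rbmblocks;
	uint64_t rsumblocks = howmany64(rsumwords << XFS_WORDLOG, blocksize);

	/* prints: 31 bitmap blocks, 31744 words; 651 suminfo entries, 1 block */
	printf("%llu bitmap blocks, %llu bitmap words\n",
	       (unsigned long long)rbmblocks, (unsigned long long)rbmwords);
	printf("%llu suminfo entries, %llu summary blocks\n",
	       (unsigned long long)rsumwords, (unsigned long long)rsumblocks);
	return 0;
}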
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
new file mode 100644
index 000000000000..c0637057d69c
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -0,0 +1,383 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_RTBITMAP_H__
+#define __XFS_RTBITMAP_H__
+
+struct xfs_rtalloc_args {
+ struct xfs_mount *mp;
+ struct xfs_trans *tp;
+
+ struct xfs_buf *rbmbp; /* bitmap block buffer */
+ struct xfs_buf *sumbp; /* summary block buffer */
+
+ xfs_fileoff_t rbmoff; /* bitmap block number */
+ xfs_fileoff_t sumoff; /* summary block number */
+};
+
+static inline xfs_rtblock_t
+xfs_rtx_to_rtb(
+ struct xfs_mount *mp,
+ xfs_rtxnum_t rtx)
+{
+ if (mp->m_rtxblklog >= 0)
+ return rtx << mp->m_rtxblklog;
+
+ return rtx * mp->m_sb.sb_rextsize;
+}
+
+static inline xfs_extlen_t
+xfs_rtxlen_to_extlen(
+ struct xfs_mount *mp,
+ xfs_rtxlen_t rtxlen)
+{
+ if (mp->m_rtxblklog >= 0)
+ return rtxlen << mp->m_rtxblklog;
+
+ return rtxlen * mp->m_sb.sb_rextsize;
+}
+
+/* Compute the misalignment between an extent length and a realtime extent. */
+static inline unsigned int
+xfs_extlen_to_rtxmod(
+ struct xfs_mount *mp,
+ xfs_extlen_t len)
+{
+ if (mp->m_rtxblklog >= 0)
+ return len & mp->m_rtxblkmask;
+
+ return len % mp->m_sb.sb_rextsize;
+}
+
+static inline xfs_rtxlen_t
+xfs_extlen_to_rtxlen(
+ struct xfs_mount *mp,
+ xfs_extlen_t len)
+{
+ if (mp->m_rtxblklog >= 0)
+ return len >> mp->m_rtxblklog;
+
+ return len / mp->m_sb.sb_rextsize;
+}
+
+/* Convert an rt block number into an rt extent number. */
+static inline xfs_rtxnum_t
+xfs_rtb_to_rtx(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ if (likely(mp->m_rtxblklog >= 0))
+ return rtbno >> mp->m_rtxblklog;
+
+ return div_u64(rtbno, mp->m_sb.sb_rextsize);
+}
+
+/* Return the offset of an rt block number within an rt extent. */
+static inline xfs_extlen_t
+xfs_rtb_to_rtxoff(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ if (likely(mp->m_rtxblklog >= 0))
+ return rtbno & mp->m_rtxblkmask;
+
+ return do_div(rtbno, mp->m_sb.sb_rextsize);
+}
+
+/*
+ * Crack an rt block number into an rt extent number and an offset within that
+ * rt extent. Returns the rt extent number directly and the offset in @off.
+ */
+static inline xfs_rtxnum_t
+xfs_rtb_to_rtxrem(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno,
+ xfs_extlen_t *off)
+{
+ if (likely(mp->m_rtxblklog >= 0)) {
+ *off = rtbno & mp->m_rtxblkmask;
+ return rtbno >> mp->m_rtxblklog;
+ }
+
+ return div_u64_rem(rtbno, mp->m_sb.sb_rextsize, off);
+}
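A quick hypothetical illustration of the two code paths in the conversion helpers above: with sb_rextsize = 16 the mount code later in this series sets m_rtxblklog = 4 and m_rtxblkmask = 0xf, so shift and mask apply; with sb_rextsize = 12 the log is -1 and the div_u64_rem() fallback is used. The standalone rewrite below (not part of the patch) just demonstrates that both paths agree:

/*
 * Worked example, not part of the patch: crack an rt block number into
 * (rt extent, offset) the way xfs_rtb_to_rtxrem() does, for one
 * power-of-two and one non-power-of-two rt extent size.
 */
#include <stdio.h>
#include <stdint.h>

static void crack(uint64_t rtbno, uint32_t rextsize, int rtxblklog,
		  uint64_t rtxblkmask)
{
	uint64_t rtx, off;

	if (rtxblklog >= 0) {		/* shift/mask fast path */
		rtx = rtbno >> rtxblklog;
		off = rtbno & rtxblkmask;
	} else {			/* div_u64_rem() fallback */
		rtx = rtbno / rextsize;
		off = rtbno % rextsize;
	}
	printf("rtbno %llu, rextsize %u -> rtx %llu, off %llu\n",
	       (unsigned long long)rtbno, rextsize,
	       (unsigned long long)rtx, (unsigned long long)off);
}

int main(void)
{
	crack(1000003, 16, 4, 0xf);	/* rtx 62500, off 3 */
	crack(1000003, 12, -1, 0);	/* rtx 83333, off 7 */
	return 0;
}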
+
+/*
+ * Convert an rt block number into an rt extent number, rounding up to the next
+ * rt extent if the rt block is not aligned to an rt extent boundary.
+ */
+static inline xfs_rtxnum_t
+xfs_rtb_to_rtxup(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ if (likely(mp->m_rtxblklog >= 0)) {
+ if (rtbno & mp->m_rtxblkmask)
+ return (rtbno >> mp->m_rtxblklog) + 1;
+ return rtbno >> mp->m_rtxblklog;
+ }
+
+ if (do_div(rtbno, mp->m_sb.sb_rextsize))
+ rtbno++;
+ return rtbno;
+}
+
+/* Round this rtblock up to the nearest rt extent size. */
+static inline xfs_rtblock_t
+xfs_rtb_roundup_rtx(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ return roundup_64(rtbno, mp->m_sb.sb_rextsize);
+}
+
+/* Round this rtblock down to the nearest rt extent size. */
+static inline xfs_rtblock_t
+xfs_rtb_rounddown_rtx(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ return rounddown_64(rtbno, mp->m_sb.sb_rextsize);
+}
+
+/* Convert an rt extent number to a file block offset in the rt bitmap file. */
+static inline xfs_fileoff_t
+xfs_rtx_to_rbmblock(
+ struct xfs_mount *mp,
+ xfs_rtxnum_t rtx)
+{
+ return rtx >> mp->m_blkbit_log;
+}
+
+/* Convert an rt extent number to a word offset within an rt bitmap block. */
+static inline unsigned int
+xfs_rtx_to_rbmword(
+ struct xfs_mount *mp,
+ xfs_rtxnum_t rtx)
+{
+ return (rtx >> XFS_NBWORDLOG) & (mp->m_blockwsize - 1);
+}
+
+/* Convert a file block offset in the rt bitmap file to an rt extent number. */
+static inline xfs_rtxnum_t
+xfs_rbmblock_to_rtx(
+ struct xfs_mount *mp,
+ xfs_fileoff_t rbmoff)
+{
+ return rbmoff << mp->m_blkbit_log;
+}
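To make the bit addressing above concrete, take a hypothetical 4096-byte block size, so m_blkbit_log is 15 (32768 bits per block) and m_blockwsize is 1024 words per block. For rt extent number 1000003 (numbers illustrative only):

	xfs_rtx_to_rbmblock()  = 1000003 >> 15         = 30
	xfs_rtx_to_rbmword()   = (1000003 >> 5) & 1023 = 530
	bit within that word   = 1000003 & 31          = 3

so the allocation state of extent 1000003 lives in bit 3 of word 530 of rtbitmap block 30.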
+
+/* Return a pointer to a bitmap word within an rt bitmap block. */
+static inline union xfs_rtword_raw *
+xfs_rbmblock_wordptr(
+ struct xfs_rtalloc_args *args,
+ unsigned int index)
+{
+ union xfs_rtword_raw *words = args->rbmbp->b_addr;
+
+ return words + index;
+}
+
+/* Convert an ondisk bitmap word to its incore representation. */
+static inline xfs_rtword_t
+xfs_rtbitmap_getword(
+ struct xfs_rtalloc_args *args,
+ unsigned int index)
+{
+ union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index);
+
+ return word->old;
+}
+
+/* Set an ondisk bitmap word from an incore representation. */
+static inline void
+xfs_rtbitmap_setword(
+ struct xfs_rtalloc_args *args,
+ unsigned int index,
+ xfs_rtword_t value)
+{
+ union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index);
+
+ word->old = value;
+}
+
+/*
+ * Convert an rt extent length and rt bitmap block number to an xfs_suminfo_t
+ * offset within the rt summary file.
+ */
+static inline xfs_rtsumoff_t
+xfs_rtsumoffs(
+ struct xfs_mount *mp,
+ int log2_len,
+ xfs_fileoff_t rbmoff)
+{
+ return log2_len * mp->m_sb.sb_rbmblocks + rbmoff;
+}
+
+/*
+ * Convert an xfs_suminfo_t offset to a file block offset within the rt summary
+ * file.
+ */
+static inline xfs_fileoff_t
+xfs_rtsumoffs_to_block(
+ struct xfs_mount *mp,
+ xfs_rtsumoff_t rsumoff)
+{
+ return XFS_B_TO_FSBT(mp, rsumoff * sizeof(xfs_suminfo_t));
+}
+
+/*
+ * Convert an xfs_suminfo_t offset to an info word offset within an rt summary
+ * block.
+ */
+static inline unsigned int
+xfs_rtsumoffs_to_infoword(
+ struct xfs_mount *mp,
+ xfs_rtsumoff_t rsumoff)
+{
+ unsigned int mask = mp->m_blockmask >> XFS_SUMINFOLOG;
+
+ return rsumoff & mask;
+}
+
+/* Return a pointer to a summary info word within an rt summary block. */
+static inline union xfs_suminfo_raw *
+xfs_rsumblock_infoptr(
+ struct xfs_rtalloc_args *args,
+ unsigned int index)
+{
+ union xfs_suminfo_raw *info = args->sumbp->b_addr;
+
+ return info + index;
+}
+
+/* Get the current value of a summary counter. */
+static inline xfs_suminfo_t
+xfs_suminfo_get(
+ struct xfs_rtalloc_args *args,
+ unsigned int index)
+{
+ union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index);
+
+ return info->old;
+}
+
+/* Add to the current value of a summary counter and return the new value. */
+static inline xfs_suminfo_t
+xfs_suminfo_add(
+ struct xfs_rtalloc_args *args,
+ unsigned int index,
+ int delta)
+{
+ union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index);
+
+ info->old += delta;
+ return info->old;
+}
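The offset, block and word helpers above are meant to be composed; as a sketch only (not from this series), a read of one summary counter could look like the following, assuming the caller has filled in @args and already holds the rtsummary inode lock:

/*
 * Hypothetical helper, not part of the patch: look up the summary count
 * of free extents of length 2^@log starting in rtbitmap block @bbno.
 */
static int
example_read_suminfo(
	struct xfs_rtalloc_args	*args,
	int			log,
	xfs_fileoff_t		bbno,
	xfs_suminfo_t		*sum)
{
	xfs_rtsumoff_t		so = xfs_rtsumoffs(args->mp, log, bbno);
	int			error;

	/* pull in the summary block that holds this counter */
	error = xfs_rtsummary_read_buf(args,
			xfs_rtsumoffs_to_block(args->mp, so));
	if (error)
		return error;

	*sum = xfs_suminfo_get(args, xfs_rtsumoffs_to_infoword(args->mp, so));
	return 0;
}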
+
+/*
+ * Functions for walking free space rtextents in the realtime bitmap.
+ */
+struct xfs_rtalloc_rec {
+ xfs_rtxnum_t ar_startext;
+ xfs_rtbxlen_t ar_extcount;
+};
+
+typedef int (*xfs_rtalloc_query_range_fn)(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv);
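As another sketch that is not part of the patch, a trivial consumer of this callback type (paired with the xfs_rtalloc_query_all() prototype declared just below) might total up the free rt extents, much as the scrub and fsmap callers elsewhere in the series do; the caller is assumed to hold the rtbitmap ILOCK:

/* Hypothetical query callback: accumulate the free rtextent count. */
static int
example_count_free_rec(
	struct xfs_mount		*mp,
	struct xfs_trans		*tp,
	const struct xfs_rtalloc_rec	*rec,
	void				*priv)
{
	xfs_rtbxlen_t			*rtx_free = priv;

	*rtx_free += rec->ar_extcount;
	return 0;
}

/* Hypothetical caller: walk every free range in the rt bitmap. */
static int
example_count_free(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_rtbxlen_t		*rtx_free)
{
	*rtx_free = 0;
	return xfs_rtalloc_query_all(mp, tp, example_count_free_rec,
			rtx_free);
}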
+
+#ifdef CONFIG_XFS_RT
+void xfs_rtbuf_cache_relse(struct xfs_rtalloc_args *args);
+
+int xfs_rtbuf_get(struct xfs_rtalloc_args *args, xfs_fileoff_t block,
+ int issum);
+
+static inline int
+xfs_rtbitmap_read_buf(
+ struct xfs_rtalloc_args *args,
+ xfs_fileoff_t block)
+{
+ return xfs_rtbuf_get(args, block, 0);
+}
+
+static inline int
+xfs_rtsummary_read_buf(
+ struct xfs_rtalloc_args *args,
+ xfs_fileoff_t block)
+{
+ return xfs_rtbuf_get(args, block, 1);
+}
+
+int xfs_rtcheck_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
+ xfs_rtxlen_t len, int val, xfs_rtxnum_t *new, int *stat);
+int xfs_rtfind_back(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
+ xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock);
+int xfs_rtfind_forw(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
+ xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock);
+int xfs_rtmodify_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
+ xfs_rtxlen_t len, int val);
+int xfs_rtmodify_summary_int(struct xfs_rtalloc_args *args, int log,
+ xfs_fileoff_t bbno, int delta, xfs_suminfo_t *sum);
+int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log,
+ xfs_fileoff_t bbno, int delta);
+int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
+ xfs_rtxlen_t len);
+int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *low_rec,
+ const struct xfs_rtalloc_rec *high_rec,
+ xfs_rtalloc_query_range_fn fn, void *priv);
+int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv);
+int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtxnum_t start, xfs_rtxlen_t len,
+ bool *is_free);
+/*
+ * Free an extent in the realtime subvolume. Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int /* error */
+xfs_rtfree_extent(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_rtxnum_t start, /* starting rtext number to free */
+ xfs_rtxlen_t len); /* length of extent freed */
+
+/* Same as above, but in units of rt blocks. */
+int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno,
+ xfs_filblks_t rtlen);
+
+xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t
+ rtextents);
+unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp,
+ xfs_rtbxlen_t rtextents);
+
+xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp,
+ unsigned int rsumlevels, xfs_extlen_t rbmblocks);
+unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp,
+ unsigned int rsumlevels, xfs_extlen_t rbmblocks);
+#else /* CONFIG_XFS_RT */
+# define xfs_rtfree_extent(t,b,l) (-ENOSYS)
+# define xfs_rtfree_blocks(t,rb,rl) (-ENOSYS)
+# define xfs_rtalloc_query_range(m,t,l,h,f,p) (-ENOSYS)
+# define xfs_rtalloc_query_all(m,t,f,p) (-ENOSYS)
+# define xfs_rtbitmap_read_buf(a,b) (-ENOSYS)
+# define xfs_rtsummary_read_buf(a,b) (-ENOSYS)
+# define xfs_rtbuf_cache_relse(a) (0)
+# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS)
+static inline xfs_filblks_t
+xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents)
+{
+ /* shut up gcc */
+ return 0;
+}
+# define xfs_rtbitmap_wordcount(mp, r) (0)
+# define xfs_rtsummary_blockcount(mp, l, b) (0)
+# define xfs_rtsummary_wordcount(mp, l, b) (0)
+#endif /* CONFIG_XFS_RT */
+
+#endif /* __XFS_RTBITMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 6264daaab37b..1f74d0cd1618 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -975,6 +975,8 @@ xfs_sb_mount_common(
mp->m_blockmask = sbp->sb_blocksize - 1;
mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
mp->m_blockwmask = mp->m_blockwsize - 1;
+ mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize);
+ mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize);
mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index a5e14740ec9a..19134b23c10b 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -25,7 +25,7 @@ extern uint64_t xfs_sb_version_to_features(struct xfs_sb *sbp);
extern int xfs_update_secondary_sbs(struct xfs_mount *mp);
-#define XFS_FS_GEOM_MAX_STRUCT_VER (4)
+#define XFS_FS_GEOM_MAX_STRUCT_VER (5)
extern void xfs_fs_geometry(struct xfs_mount *mp, struct xfs_fsop_geom *geo,
int struct_version);
extern int xfs_sb_read_secondary(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 6b2296ff248a..70e97ea6eee7 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -65,7 +65,7 @@ xfs_trans_ichgtime(
tv = current_time(inode);
if (flags & XFS_ICHGTIME_MOD)
- inode->i_mtime = tv;
+ inode_set_mtime_to_ts(inode, tv);
if (flags & XFS_ICHGTIME_CHG)
inode_set_ctime_to_ts(inode, tv);
if (flags & XFS_ICHGTIME_CREATE)
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 5b2f27cbdb80..6cd45e8c118d 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -19,6 +19,7 @@
#include "xfs_trans.h"
#include "xfs_qm.h"
#include "xfs_trans_space.h"
+#include "xfs_rtbitmap.h"
#define _ALLOC true
#define _FREE false
@@ -217,11 +218,12 @@ xfs_rtalloc_block_count(
struct xfs_mount *mp,
unsigned int num_ops)
{
- unsigned int blksz = XFS_FSB_TO_B(mp, 1);
- unsigned int rtbmp_bytes;
+ unsigned int rtbmp_blocks;
+ xfs_rtxlen_t rtxlen;
- rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY;
- return (howmany(rtbmp_bytes, blksz) + 1) * num_ops;
+ rtxlen = xfs_extlen_to_rtxlen(mp, XFS_MAX_BMBT_EXTLEN);
+ rtbmp_blocks = xfs_rtbitmap_blockcount(mp, rtxlen);
+ return (rtbmp_blocks + 1) * num_ops;
}
/*
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index 5c2765934732..c299b16c9365 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -148,10 +148,10 @@ xfs_verify_rtbno(
/* Verify that a realtime device extent is fully contained inside the volume. */
bool
-xfs_verify_rtext(
+xfs_verify_rtbext(
struct xfs_mount *mp,
xfs_rtblock_t rtbno,
- xfs_rtblock_t len)
+ xfs_filblks_t len)
{
if (rtbno + len <= rtbno)
return false;
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 851220021484..533200c4ccc2 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -11,6 +11,7 @@ typedef uint32_t prid_t; /* project ID */
typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */
typedef uint32_t xfs_agino_t; /* inode # within allocation grp */
typedef uint32_t xfs_extlen_t; /* extent length in blocks */
+typedef uint32_t xfs_rtxlen_t; /* file extent length in rtextents */
typedef uint32_t xfs_agnumber_t; /* allocation group number */
typedef uint64_t xfs_extnum_t; /* # of extents in a file */
typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */
@@ -18,6 +19,7 @@ typedef int64_t xfs_fsize_t; /* bytes in a file */
typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */
typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */
+typedef uint32_t xfs_rtsumoff_t; /* offset of an rtsummary info word */
typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */
typedef int64_t xfs_lsn_t; /* log sequence number */
@@ -31,6 +33,8 @@ typedef uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
typedef uint64_t xfs_rtblock_t; /* extent (block) in realtime area */
typedef uint64_t xfs_fileoff_t; /* block number in a file */
typedef uint64_t xfs_filblks_t; /* number of blocks in a file */
+typedef uint64_t xfs_rtxnum_t; /* rtextent number */
+typedef uint64_t xfs_rtbxlen_t; /* rtbitmap extent length in rtextents */
typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
@@ -47,6 +51,7 @@ typedef void * xfs_failaddr_t;
#define NULLRFSBLOCK ((xfs_rfsblock_t)-1)
#define NULLRTBLOCK ((xfs_rtblock_t)-1)
#define NULLFILEOFF ((xfs_fileoff_t)-1)
+#define NULLRTEXTNO ((xfs_rtxnum_t)-1)
#define NULLAGBLOCK ((xfs_agblock_t)-1)
#define NULLAGNUMBER ((xfs_agnumber_t)-1)
@@ -145,6 +150,7 @@ typedef uint32_t xfs_dqid_t;
*/
#define XFS_NBBYLOG 3 /* log2(NBBY) */
#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */
+#define XFS_SUMINFOLOG 2 /* log2(sizeof(xfs_suminfo_t)) */
#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG)
#define XFS_NBWORD (1 << XFS_NBWORDLOG)
#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1)
@@ -229,8 +235,8 @@ bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
-bool xfs_verify_rtext(struct xfs_mount *mp, xfs_rtblock_t rtbno,
- xfs_rtblock_t len);
+bool xfs_verify_rtbext(struct xfs_mount *mp, xfs_rtblock_t rtbno,
+ xfs_filblks_t len);
bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount);
bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off);
void xfs_icount_range(struct xfs_mount *mp, unsigned long long *min,
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 75588915572e..06d8c1996a33 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -410,7 +410,7 @@ xchk_bmap_iextent(
/* Make sure the extent points to a valid place. */
if (info->is_rt &&
- !xfs_verify_rtext(mp, irec->br_startblock, irec->br_blockcount))
+ !xfs_verify_rtbext(mp, irec->br_startblock, irec->br_blockcount))
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
if (!info->is_rt &&
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 05be757668bb..5799e9a94f1f 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -16,7 +16,7 @@
#include "xfs_health.h"
#include "xfs_btree.h"
#include "xfs_ag.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "scrub/scrub.h"
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 59d7912fb75f..889f556bc98f 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -20,6 +20,7 @@
#include "xfs_reflink.h"
#include "xfs_rmap.h"
#include "xfs_bmap_util.h"
+#include "xfs_rtbitmap.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
@@ -225,7 +226,7 @@ xchk_inode_extsize(
*/
if ((flags & XFS_DIFLAG_RTINHERIT) &&
(flags & XFS_DIFLAG_EXTSZINHERIT) &&
- value % sc->mp->m_sb.sb_rextsize > 0)
+ xfs_extlen_to_rtxmod(sc->mp, value) > 0)
xchk_ino_set_warning(sc, ino);
}
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 008ddb599e13..41a1d89ae8e6 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -11,7 +11,7 @@
#include "xfs_mount.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "scrub/scrub.h"
@@ -48,12 +48,12 @@ xchk_rtbitmap_rec(
{
struct xfs_scrub *sc = priv;
xfs_rtblock_t startblock;
- xfs_rtblock_t blockcount;
+ xfs_filblks_t blockcount;
- startblock = rec->ar_startext * mp->m_sb.sb_rextsize;
- blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize;
+ startblock = xfs_rtx_to_rtb(mp, rec->ar_startext);
+ blockcount = xfs_rtx_to_rtb(mp, rec->ar_extcount);
- if (!xfs_verify_rtext(mp, startblock, blockcount))
+ if (!xfs_verify_rtbext(mp, startblock, blockcount))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
return 0;
}
@@ -128,26 +128,22 @@ out:
void
xchk_xref_is_used_rt_space(
struct xfs_scrub *sc,
- xfs_rtblock_t fsbno,
+ xfs_rtblock_t rtbno,
xfs_extlen_t len)
{
- xfs_rtblock_t startext;
- xfs_rtblock_t endext;
- xfs_rtblock_t extcount;
+ xfs_rtxnum_t startext;
+ xfs_rtxnum_t endext;
bool is_free;
int error;
if (xchk_skip_xref(sc->sm))
return;
- startext = fsbno;
- endext = fsbno + len - 1;
- do_div(startext, sc->mp->m_sb.sb_rextsize);
- do_div(endext, sc->mp->m_sb.sb_rextsize);
- extcount = endext - startext + 1;
+ startext = xfs_rtb_to_rtx(sc->mp, rtbno);
+ endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1);
xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
- error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount,
- &is_free);
+ error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext,
+ endext - startext + 1, &is_free);
if (!xchk_should_check_xref(sc, &error, NULL))
goto out_unlock;
if (is_free)
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index 437ed9acbb27..8b15c47408d0 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -13,7 +13,7 @@
#include "xfs_inode.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_bit.h"
#include "xfs_bmap.h"
#include "scrub/scrub.h"
@@ -81,34 +81,45 @@ typedef unsigned int xchk_rtsumoff_t;
static inline int
xfsum_load(
struct xfs_scrub *sc,
- xchk_rtsumoff_t sumoff,
- xfs_suminfo_t *info)
+ xfs_rtsumoff_t sumoff,
+ union xfs_suminfo_raw *rawinfo)
{
- return xfile_obj_load(sc->xfile, info, sizeof(xfs_suminfo_t),
+ return xfile_obj_load(sc->xfile, rawinfo,
+ sizeof(union xfs_suminfo_raw),
sumoff << XFS_WORDLOG);
}
static inline int
xfsum_store(
struct xfs_scrub *sc,
- xchk_rtsumoff_t sumoff,
- const xfs_suminfo_t info)
+ xfs_rtsumoff_t sumoff,
+ const union xfs_suminfo_raw rawinfo)
{
- return xfile_obj_store(sc->xfile, &info, sizeof(xfs_suminfo_t),
+ return xfile_obj_store(sc->xfile, &rawinfo,
+ sizeof(union xfs_suminfo_raw),
sumoff << XFS_WORDLOG);
}
static inline int
xfsum_copyout(
struct xfs_scrub *sc,
- xchk_rtsumoff_t sumoff,
- xfs_suminfo_t *info,
+ xfs_rtsumoff_t sumoff,
+ union xfs_suminfo_raw *rawinfo,
unsigned int nr_words)
{
- return xfile_obj_load(sc->xfile, info, nr_words << XFS_WORDLOG,
+ return xfile_obj_load(sc->xfile, rawinfo, nr_words << XFS_WORDLOG,
sumoff << XFS_WORDLOG);
}
+static inline xfs_suminfo_t
+xchk_rtsum_inc(
+ struct xfs_mount *mp,
+ union xfs_suminfo_raw *v)
+{
+ v->old += 1;
+ return v->old;
+}
+
/* Update the summary file to reflect the free extent that we've accumulated. */
STATIC int
xchk_rtsum_record_free(
@@ -121,23 +132,24 @@ xchk_rtsum_record_free(
xfs_fileoff_t rbmoff;
xfs_rtblock_t rtbno;
xfs_filblks_t rtlen;
- xchk_rtsumoff_t offs;
+ xfs_rtsumoff_t offs;
unsigned int lenlog;
- xfs_suminfo_t v = 0;
+ union xfs_suminfo_raw v;
+ xfs_suminfo_t value;
int error = 0;
if (xchk_should_terminate(sc, &error))
return error;
/* Compute the relevant location in the rtsum file. */
- rbmoff = XFS_BITTOBLOCK(mp, rec->ar_startext);
+ rbmoff = xfs_rtx_to_rbmblock(mp, rec->ar_startext);
lenlog = XFS_RTBLOCKLOG(rec->ar_extcount);
- offs = XFS_SUMOFFS(mp, lenlog, rbmoff);
+ offs = xfs_rtsumoffs(mp, lenlog, rbmoff);
- rtbno = rec->ar_startext * mp->m_sb.sb_rextsize;
- rtlen = rec->ar_extcount * mp->m_sb.sb_rextsize;
+ rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
+ rtlen = xfs_rtx_to_rtb(mp, rec->ar_extcount);
- if (!xfs_verify_rtext(mp, rtbno, rtlen)) {
+ if (!xfs_verify_rtbext(mp, rtbno, rtlen)) {
xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino);
return -EFSCORRUPTED;
}
@@ -147,9 +159,9 @@ xchk_rtsum_record_free(
if (error)
return error;
- v++;
+ value = xchk_rtsum_inc(sc->mp, &v);
trace_xchk_rtsum_record_free(mp, rec->ar_startext, rec->ar_extcount,
- lenlog, offs, v);
+ lenlog, offs, value);
return xfsum_store(sc, offs, v);
}
@@ -160,12 +172,11 @@ xchk_rtsum_compute(
struct xfs_scrub *sc)
{
struct xfs_mount *mp = sc->mp;
- unsigned long long rtbmp_bytes;
+ unsigned long long rtbmp_blocks;
/* If the bitmap size doesn't match the computed size, bail. */
- rtbmp_bytes = howmany_64(mp->m_sb.sb_rextents, NBBY);
- if (roundup_64(rtbmp_bytes, mp->m_sb.sb_blocksize) !=
- mp->m_rbmip->i_disk_size)
+ rtbmp_blocks = xfs_rtbitmap_blockcount(mp, mp->m_sb.sb_rextents);
+ if (XFS_FSB_TO_B(mp, rtbmp_blocks) != mp->m_rbmip->i_disk_size)
return -EFSCORRUPTED;
return xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtsum_record_free,
@@ -177,14 +188,18 @@ STATIC int
xchk_rtsum_compare(
struct xfs_scrub *sc)
{
+ struct xfs_rtalloc_args args = {
+ .mp = sc->mp,
+ .tp = sc->tp,
+ };
struct xfs_mount *mp = sc->mp;
- struct xfs_buf *bp;
struct xfs_bmbt_irec map;
xfs_fileoff_t off;
xchk_rtsumoff_t sumoff = 0;
int nmap;
for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) {
+ union xfs_suminfo_raw *ondisk_info;
int error = 0;
if (xchk_should_terminate(sc, &error))
@@ -205,22 +220,23 @@ xchk_rtsum_compare(
}
/* Read a block's worth of ondisk rtsummary file. */
- error = xfs_rtbuf_get(mp, sc->tp, off, 1, &bp);
+ error = xfs_rtsummary_read_buf(&args, off);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
return error;
/* Read a block's worth of computed rtsummary file. */
error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize);
if (error) {
- xfs_trans_brelse(sc->tp, bp);
+ xfs_rtbuf_cache_relse(&args);
return error;
}
- if (memcmp(bp->b_addr, sc->buf,
+ ondisk_info = xfs_rsumblock_infoptr(&args, 0);
+ if (memcmp(ondisk_info, sc->buf,
mp->m_blockwsize << XFS_WORDLOG) != 0)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
- xfs_trans_brelse(sc->tp, bp);
+ xfs_rtbuf_cache_relse(&args);
sumoff += mp->m_blockwsize;
}
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 46249e7b17e0..29afa4851235 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -13,6 +13,7 @@
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ag.h"
+#include "xfs_rtbitmap.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index cbd4d01e253c..4a8bc6f3c8f2 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -1036,17 +1036,18 @@ TRACE_EVENT(xfarray_sort_stats,
#ifdef CONFIG_XFS_RT
TRACE_EVENT(xchk_rtsum_record_free,
- TP_PROTO(struct xfs_mount *mp, xfs_rtblock_t start,
- uint64_t len, unsigned int log, loff_t pos, xfs_suminfo_t v),
- TP_ARGS(mp, start, len, log, pos, v),
+ TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start,
+ xfs_rtbxlen_t len, unsigned int log, loff_t pos,
+ xfs_suminfo_t value),
+ TP_ARGS(mp, start, len, log, pos, value),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(dev_t, rtdev)
- __field(xfs_rtblock_t, start)
+ __field(xfs_rtxnum_t, start)
__field(unsigned long long, len)
__field(unsigned int, log)
__field(loff_t, pos)
- __field(xfs_suminfo_t, v)
+ __field(xfs_suminfo_t, value)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
@@ -1055,7 +1056,7 @@ TRACE_EVENT(xchk_rtsum_record_free,
__entry->len = len;
__entry->log = log;
__entry->pos = pos;
- __entry->v = v;
+ __entry->value = value;
),
TP_printk("dev %d:%d rtdev %d:%d rtx 0x%llx rtxcount 0x%llx log %u rsumpos 0x%llx sumcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1064,7 +1065,7 @@ TRACE_EVENT(xchk_rtsum_record_free,
__entry->len,
__entry->log,
__entry->pos,
- __entry->v)
+ __entry->value)
);
#endif /* CONFIG_XFS_RT */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index fcefab687285..731260a5af6d 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -28,6 +28,7 @@
#include "xfs_icache.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"
+#include "xfs_rtbitmap.h"
/* Kernel only BMAP related definitions and functions */
@@ -75,28 +76,28 @@ xfs_bmap_rtalloc(
{
struct xfs_mount *mp = ap->ip->i_mount;
xfs_fileoff_t orig_offset = ap->offset;
- xfs_rtblock_t rtb;
- xfs_extlen_t prod = 0; /* product factor for allocators */
+ xfs_rtxnum_t rtx;
+ xfs_rtxlen_t prod = 0; /* product factor for allocators */
xfs_extlen_t mod = 0; /* product factor for allocators */
- xfs_extlen_t ralen = 0; /* realtime allocation length */
+ xfs_rtxlen_t ralen = 0; /* realtime allocation length */
xfs_extlen_t align; /* minimum allocation alignment */
xfs_extlen_t orig_length = ap->length;
xfs_extlen_t minlen = mp->m_sb.sb_rextsize;
- xfs_extlen_t raminlen;
+ xfs_rtxlen_t raminlen;
bool rtlocked = false;
bool ignore_locality = false;
int error;
align = xfs_get_extsz_hint(ap->ip);
retry:
- prod = align / mp->m_sb.sb_rextsize;
+ prod = xfs_extlen_to_rtxlen(mp, align);
error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
align, 1, ap->eof, 0,
ap->conv, &ap->offset, &ap->length);
if (error)
return error;
ASSERT(ap->length);
- ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
+ ASSERT(xfs_extlen_to_rtxmod(mp, ap->length) == 0);
/*
* If we shifted the file offset downward to satisfy an extent size
@@ -116,17 +117,14 @@ retry:
prod = 1;
/*
* Set ralen to be the actual requested length in rtextents.
- */
- ralen = ap->length / mp->m_sb.sb_rextsize;
- /*
+ *
* If the old value was close enough to XFS_BMBT_MAX_EXTLEN that
* we rounded up to it, cut it back so it's valid again.
* Note that if it's a really large request (bigger than
* XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't
* adjust the starting point to match it.
*/
- if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN)
- ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize;
+ ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN));
/*
* Lock out modifications to both the RT bitmap and summary inodes
@@ -144,12 +142,10 @@ retry:
* pick an extent that will space things out in the rt area.
*/
if (ap->eof && ap->offset == 0) {
- xfs_rtblock_t rtx; /* realtime extent no */
-
error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
if (error)
return error;
- ap->blkno = rtx * mp->m_sb.sb_rextsize;
+ ap->blkno = xfs_rtx_to_rtb(mp, rtx);
} else {
ap->blkno = 0;
}
@@ -160,20 +156,18 @@ retry:
* Realtime allocation, done through xfs_rtallocate_extent.
*/
if (ignore_locality)
- ap->blkno = 0;
+ rtx = 0;
else
- do_div(ap->blkno, mp->m_sb.sb_rextsize);
- rtb = ap->blkno;
- ap->length = ralen;
- raminlen = max_t(xfs_extlen_t, 1, minlen / mp->m_sb.sb_rextsize);
- error = xfs_rtallocate_extent(ap->tp, ap->blkno, raminlen, ap->length,
- &ralen, ap->wasdel, prod, &rtb);
+ rtx = xfs_rtb_to_rtx(mp, ap->blkno);
+ raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen));
+ error = xfs_rtallocate_extent(ap->tp, rtx, raminlen, ralen, &ralen,
+ ap->wasdel, prod, &rtx);
if (error)
return error;
- if (rtb != NULLRTBLOCK) {
- ap->blkno = rtb * mp->m_sb.sb_rextsize;
- ap->length = ralen * mp->m_sb.sb_rextsize;
+ if (rtx != NULLRTEXTNO) {
+ ap->blkno = xfs_rtx_to_rtb(mp, rtx);
+ ap->length = xfs_rtxlen_to_extlen(mp, ralen);
ap->ip->i_nblocks += ap->length;
xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
if (ap->wasdel)
@@ -690,7 +684,7 @@ xfs_can_free_eofblocks(
*/
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1)
- end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize);
+ end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb);
last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
if (last_fsb <= end_fsb)
return false;
@@ -780,12 +774,10 @@ xfs_alloc_file_space(
{
xfs_mount_t *mp = ip->i_mount;
xfs_off_t count;
- xfs_filblks_t allocated_fsb;
xfs_filblks_t allocatesize_fsb;
xfs_extlen_t extsz, temp;
xfs_fileoff_t startoffset_fsb;
xfs_fileoff_t endoffset_fsb;
- int nimaps;
int rt;
xfs_trans_t *tp;
xfs_bmbt_irec_t imaps[1], *imapp;
@@ -808,7 +800,6 @@ xfs_alloc_file_space(
count = len;
imapp = &imaps[0];
- nimaps = 1;
startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
allocatesize_fsb = endoffset_fsb - startoffset_fsb;
@@ -819,6 +810,7 @@ xfs_alloc_file_space(
while (allocatesize_fsb && !error) {
xfs_fileoff_t s, e;
unsigned int dblocks, rblocks, resblks;
+ int nimaps = 1;
/*
* Determine space reservations for data/realtime.
@@ -884,15 +876,19 @@ xfs_alloc_file_space(
if (error)
break;
- allocated_fsb = imapp->br_blockcount;
-
- if (nimaps == 0) {
- error = -ENOSPC;
- break;
+ /*
+ * If the allocator cannot find a single free extent large
+ * enough to cover the start block of the requested range,
+ * xfs_bmapi_write will return 0 but leave *nimaps set to 0.
+ *
+ * In that case we simply need to keep looping with the same
+ * startoffset_fsb so that one of the following allocations
+ * will eventually reach the requested range.
+ */
+ if (nimaps) {
+ startoffset_fsb += imapp->br_blockcount;
+ allocatesize_fsb -= imapp->br_blockcount;
}
-
- startoffset_fsb += allocated_fsb;
- allocatesize_fsb -= allocated_fsb;
}
return error;
@@ -989,10 +985,8 @@ xfs_free_file_space(
/* We can only free complete realtime extents. */
if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) {
- startoffset_fsb = roundup_64(startoffset_fsb,
- mp->m_sb.sb_rextsize);
- endoffset_fsb = rounddown_64(endoffset_fsb,
- mp->m_sb.sb_rextsize);
+ startoffset_fsb = xfs_rtb_roundup_rtx(mp, startoffset_fsb);
+ endoffset_fsb = xfs_rtb_rounddown_rtx(mp, endoffset_fsb);
}
/*
@@ -1644,7 +1638,7 @@ xfs_swap_extents(
uint64_t f;
int resblks = 0;
unsigned int flags = 0;
- struct timespec64 ctime;
+ struct timespec64 ctime, mtime;
/*
* Lock the inodes against other IO, page faults and truncate to
@@ -1758,10 +1752,11 @@ xfs_swap_extents(
* under it.
*/
ctime = inode_get_ctime(VFS_I(ip));
+ mtime = inode_get_mtime(VFS_I(ip));
if ((sbp->bs_ctime.tv_sec != ctime.tv_sec) ||
(sbp->bs_ctime.tv_nsec != ctime.tv_nsec) ||
- (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
- (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
+ (sbp->bs_mtime.tv_sec != mtime.tv_sec) ||
+ (sbp->bs_mtime.tv_nsec != mtime.tv_nsec)) {
error = -EBUSY;
goto out_trans_cancel;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index c1ece4a08ff4..545c7991b9b5 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1913,8 +1913,7 @@ xfs_buftarg_shrink_scan(
struct shrinker *shrink,
struct shrink_control *sc)
{
- struct xfs_buftarg *btp = container_of(shrink,
- struct xfs_buftarg, bt_shrinker);
+ struct xfs_buftarg *btp = shrink->private_data;
LIST_HEAD(dispose);
unsigned long freed;
@@ -1936,8 +1935,7 @@ xfs_buftarg_shrink_count(
struct shrinker *shrink,
struct shrink_control *sc)
{
- struct xfs_buftarg *btp = container_of(shrink,
- struct xfs_buftarg, bt_shrinker);
+ struct xfs_buftarg *btp = shrink->private_data;
return list_lru_shrink_count(&btp->bt_lru, sc);
}
@@ -1945,17 +1943,15 @@ void
xfs_free_buftarg(
struct xfs_buftarg *btp)
{
- struct block_device *bdev = btp->bt_bdev;
-
- unregister_shrinker(&btp->bt_shrinker);
+ shrinker_free(btp->bt_shrinker);
ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
percpu_counter_destroy(&btp->bt_io_count);
list_lru_destroy(&btp->bt_lru);
fs_put_dax(btp->bt_daxdev, btp->bt_mount);
/* the main block device is closed by kill_block_super */
- if (bdev != btp->bt_mount->m_super->s_bdev)
- blkdev_put(bdev, btp->bt_mount->m_super);
+ if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev)
+ bdev_release(btp->bt_bdev_handle);
kmem_free(btp);
}
@@ -1990,16 +1986,15 @@ xfs_setsize_buftarg(
*/
STATIC int
xfs_setsize_buftarg_early(
- xfs_buftarg_t *btp,
- struct block_device *bdev)
+ xfs_buftarg_t *btp)
{
- return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
+ return xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev));
}
struct xfs_buftarg *
xfs_alloc_buftarg(
struct xfs_mount *mp,
- struct block_device *bdev)
+ struct bdev_handle *bdev_handle)
{
xfs_buftarg_t *btp;
const struct dax_holder_operations *ops = NULL;
@@ -2010,9 +2005,10 @@ xfs_alloc_buftarg(
btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
btp->bt_mount = mp;
- btp->bt_dev = bdev->bd_dev;
- btp->bt_bdev = bdev;
- btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off,
+ btp->bt_bdev_handle = bdev_handle;
+ btp->bt_dev = bdev_handle->bdev->bd_dev;
+ btp->bt_bdev = bdev_handle->bdev;
+ btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
mp, ops);
/*
@@ -2022,7 +2018,7 @@ xfs_alloc_buftarg(
ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
DEFAULT_RATELIMIT_BURST);
- if (xfs_setsize_buftarg_early(btp, bdev))
+ if (xfs_setsize_buftarg_early(btp))
goto error_free;
if (list_lru_init(&btp->bt_lru))
@@ -2031,13 +2027,17 @@ xfs_alloc_buftarg(
if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
goto error_lru;
- btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
- btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
- btp->bt_shrinker.seeks = DEFAULT_SEEKS;
- btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
- if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s",
- mp->m_super->s_id))
+ btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s",
+ mp->m_super->s_id);
+ if (!btp->bt_shrinker)
goto error_pcpu;
+
+ btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count;
+ btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
+ btp->bt_shrinker->private_data = btp;
+
+ shrinker_register(btp->bt_shrinker);
+
return btp;
error_pcpu:
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index df8f47953bb4..c86e16419656 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -98,6 +98,7 @@ typedef unsigned int xfs_buf_flags_t;
*/
typedef struct xfs_buftarg {
dev_t bt_dev;
+ struct bdev_handle *bt_bdev_handle;
struct block_device *bt_bdev;
struct dax_device *bt_daxdev;
u64 bt_dax_part_off;
@@ -108,7 +109,7 @@ typedef struct xfs_buftarg {
size_t bt_logical_sectormask;
/* LRU control structures */
- struct shrinker bt_shrinker;
+ struct shrinker *bt_shrinker;
struct list_lru bt_lru;
struct percpu_counter bt_io_count;
@@ -364,7 +365,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
* Handling of buftargs.
*/
struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
- struct block_device *bdev);
+ struct bdev_handle *bdev_handle);
extern void xfs_free_buftarg(struct xfs_buftarg *);
extern void xfs_buftarg_wait(struct xfs_buftarg *);
extern void xfs_buftarg_drain(struct xfs_buftarg *);
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index ac6ba646624d..a013b87ab8d5 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -562,7 +562,8 @@ xfs_dquot_from_disk(
struct xfs_dquot *dqp,
struct xfs_buf *bp)
{
- struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset;
+ struct xfs_dqblk *dqb = xfs_buf_offset(bp, dqp->q_bufoffset);
+ struct xfs_disk_dquot *ddqp = &dqb->dd_diskdq;
/*
* Ensure that we got the type and ID we were looking for.
@@ -1250,7 +1251,7 @@ xfs_qm_dqflush(
}
/* Flush the incore dquot to the ondisk buffer. */
- dqblk = bp->b_addr + dqp->q_bufoffset;
+ dqblk = xfs_buf_offset(bp, dqp->q_bufoffset);
xfs_dquot_to_disk(&dqblk->dd_diskdq, dqp);
/*
diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c
index 8966ba842395..2c2720ce6923 100644
--- a/fs/xfs/xfs_dquot_item_recover.c
+++ b/fs/xfs/xfs_dquot_item_recover.c
@@ -19,6 +19,7 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
+#include "xfs_error.h"
STATIC void
xlog_recover_dquot_ra_pass2(
@@ -65,6 +66,7 @@ xlog_recover_dquot_commit_pass2(
{
struct xfs_mount *mp = log->l_mp;
struct xfs_buf *bp;
+ struct xfs_dqblk *dqb;
struct xfs_disk_dquot *ddq, *recddq;
struct xfs_dq_logformat *dq_f;
xfs_failaddr_t fa;
@@ -130,14 +132,14 @@ xlog_recover_dquot_commit_pass2(
return error;
ASSERT(bp);
- ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
+ dqb = xfs_buf_offset(bp, dq_f->qlf_boffset);
+ ddq = &dqb->dd_diskdq;
/*
* If the dquot has an LSN in it, recover the dquot only if it's less
* than the lsn of the transaction we are replaying.
*/
if (xfs_has_crc(mp)) {
- struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn);
if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
@@ -147,10 +149,23 @@ xlog_recover_dquot_commit_pass2(
memcpy(ddq, recddq, item->ri_buf[1].i_len);
if (xfs_has_crc(mp)) {
- xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
+ xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
+ /* Validate the recovered dquot. */
+ fa = xfs_dqblk_verify(log->l_mp, dqb, dq_f->qlf_id);
+ if (fa) {
+ XFS_CORRUPTION_ERROR("Bad dquot after recovery",
+ XFS_ERRLEVEL_LOW, mp, dqb,
+ sizeof(struct xfs_dqblk));
+ xfs_alert(mp,
+ "Metadata corruption detected at %pS, dquot 0x%x",
+ fa, dq_f->qlf_id);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+
ASSERT(dq_f->qlf_size == 2);
ASSERT(bp->b_mount == mp);
bp->b_flags |= _XBF_LOGRECOVERY;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 203700278ddb..e33e5e13b95f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -214,6 +214,43 @@ xfs_ilock_iocb(
return 0;
}
+static int
+xfs_ilock_iocb_for_write(
+ struct kiocb *iocb,
+ unsigned int *lock_mode)
+{
+ ssize_t ret;
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+
+ ret = xfs_ilock_iocb(iocb, *lock_mode);
+ if (ret)
+ return ret;
+
+ if (*lock_mode == XFS_IOLOCK_EXCL)
+ return 0;
+ if (!xfs_iflags_test(ip, XFS_IREMAPPING))
+ return 0;
+
+ xfs_iunlock(ip, *lock_mode);
+ *lock_mode = XFS_IOLOCK_EXCL;
+ return xfs_ilock_iocb(iocb, *lock_mode);
+}
+
+static unsigned int
+xfs_ilock_for_write_fault(
+ struct xfs_inode *ip)
+{
+ /* get a shared lock if no remapping in progress */
+ xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+ if (!xfs_iflags_test(ip, XFS_IREMAPPING))
+ return XFS_MMAPLOCK_SHARED;
+
+ /* wait for remapping to complete */
+ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ return XFS_MMAPLOCK_EXCL;
+}
+
STATIC ssize_t
xfs_file_dio_read(
struct kiocb *iocb,
@@ -551,7 +588,7 @@ xfs_file_dio_write_aligned(
unsigned int iolock = XFS_IOLOCK_SHARED;
ssize_t ret;
- ret = xfs_ilock_iocb(iocb, iolock);
+ ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
ret = xfs_file_write_checks(iocb, from, &iolock);
@@ -618,7 +655,7 @@ retry_exclusive:
flags = IOMAP_DIO_FORCE_WAIT;
}
- ret = xfs_ilock_iocb(iocb, iolock);
+ ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
@@ -1180,7 +1217,7 @@ xfs_file_remap_range(
if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
xfs_log_force_inode(dest);
out_unlock:
- xfs_iunlock2_io_mmap(src, dest);
+ xfs_iunlock2_remapping(src, dest);
if (ret)
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
return remapped > 0 ? remapped : ret;
@@ -1328,6 +1365,7 @@ __xfs_filemap_fault(
struct inode *inode = file_inode(vmf->vma->vm_file);
struct xfs_inode *ip = XFS_I(inode);
vm_fault_t ret;
+ unsigned int lock_mode = 0;
trace_xfs_filemap_fault(ip, order, write_fault);
@@ -1336,25 +1374,24 @@ __xfs_filemap_fault(
file_update_time(vmf->vma->vm_file);
}
+ if (IS_DAX(inode) || write_fault)
+ lock_mode = xfs_ilock_for_write_fault(XFS_I(inode));
+
if (IS_DAX(inode)) {
pfn_t pfn;
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, order, pfn);
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ } else if (write_fault) {
+ ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops);
} else {
- if (write_fault) {
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = iomap_page_mkwrite(vmf,
- &xfs_page_mkwrite_iomap_ops);
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- } else {
- ret = filemap_fault(vmf);
- }
+ ret = filemap_fault(vmf);
}
+ if (lock_mode)
+ xfs_iunlock(XFS_I(inode), lock_mode);
+
if (write_fault)
sb_end_pagefault(inode->i_sb);
return ret;
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 736e5545f584..5a72217f5feb 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -23,7 +23,7 @@
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_alloc_btree.h"
-#include "xfs_rtalloc.h"
+#include "xfs_rtbitmap.h"
#include "xfs_ag.h"
/* Convert an xfs_fsmap to an fsmap. */
@@ -483,11 +483,11 @@ xfs_getfsmap_rtdev_rtbitmap_helper(
xfs_rtblock_t rtbno;
xfs_daddr_t rec_daddr, len_daddr;
- rtbno = rec->ar_startext * mp->m_sb.sb_rextsize;
+ rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
rec_daddr = XFS_FSB_TO_BB(mp, rtbno);
irec.rm_startblock = rtbno;
- rtbno = rec->ar_extcount * mp->m_sb.sb_rextsize;
+ rtbno = xfs_rtx_to_rtb(mp, rec->ar_extcount);
len_daddr = XFS_FSB_TO_BB(mp, rtbno);
irec.rm_blockcount = rtbno;
@@ -514,7 +514,7 @@ xfs_getfsmap_rtdev_rtbitmap(
uint64_t eofs;
int error;
- eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rextents * mp->m_sb.sb_rextsize);
+ eofs = XFS_FSB_TO_BB(mp, xfs_rtx_to_rtb(mp, mp->m_sb.sb_rextents));
if (keys[0].fmr_physical >= eofs)
return 0;
start_rtb = XFS_BB_TO_FSBT(mp,
@@ -539,11 +539,8 @@ xfs_getfsmap_rtdev_rtbitmap(
* Set up query parameters to return free rtextents covering the range
* we want.
*/
- alow.ar_startext = start_rtb;
- ahigh.ar_startext = end_rtb;
- do_div(alow.ar_startext, mp->m_sb.sb_rextsize);
- if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize))
- ahigh.ar_startext++;
+ alow.ar_startext = xfs_rtb_to_rtx(mp, start_rtb);
+ ahigh.ar_startext = xfs_rtb_to_rtxup(mp, end_rtb);
error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh,
xfs_getfsmap_rtdev_rtbitmap_helper, info);
if (error)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 3c210ac83713..dba514a2c84d 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -2165,8 +2165,7 @@ xfs_inodegc_shrinker_count(
struct shrinker *shrink,
struct shrink_control *sc)
{
- struct xfs_mount *mp = container_of(shrink, struct xfs_mount,
- m_inodegc_shrinker);
+ struct xfs_mount *mp = shrink->private_data;
struct xfs_inodegc *gc;
int cpu;
@@ -2187,8 +2186,7 @@ xfs_inodegc_shrinker_scan(
struct shrinker *shrink,
struct shrink_control *sc)
{
- struct xfs_mount *mp = container_of(shrink, struct xfs_mount,
- m_inodegc_shrinker);
+ struct xfs_mount *mp = shrink->private_data;
struct xfs_inodegc *gc;
int cpu;
bool no_items = true;
@@ -2224,13 +2222,19 @@ int
xfs_inodegc_register_shrinker(
struct xfs_mount *mp)
{
- struct shrinker *shrink = &mp->m_inodegc_shrinker;
+ mp->m_inodegc_shrinker = shrinker_alloc(SHRINKER_NONSLAB,
+ "xfs-inodegc:%s",
+ mp->m_super->s_id);
+ if (!mp->m_inodegc_shrinker)
+ return -ENOMEM;
+
+ mp->m_inodegc_shrinker->count_objects = xfs_inodegc_shrinker_count;
+ mp->m_inodegc_shrinker->scan_objects = xfs_inodegc_shrinker_scan;
+ mp->m_inodegc_shrinker->seeks = 0;
+ mp->m_inodegc_shrinker->batch = XFS_INODEGC_SHRINKER_BATCH;
+ mp->m_inodegc_shrinker->private_data = mp;
- shrink->count_objects = xfs_inodegc_shrinker_count;
- shrink->scan_objects = xfs_inodegc_shrinker_scan;
- shrink->seeks = 0;
- shrink->flags = SHRINKER_NONSLAB;
- shrink->batch = XFS_INODEGC_SHRINKER_BATCH;
+ shrinker_register(mp->m_inodegc_shrinker);
- return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id);
+ return 0;
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 4d55f58d99b7..c0f1c89786c2 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -844,8 +844,8 @@ xfs_init_new_inode(
ASSERT(ip->i_nblocks == 0);
tv = inode_set_ctime_current(inode);
- inode->i_mtime = tv;
- inode->i_atime = tv;
+ inode_set_mtime_to_ts(inode, tv);
+ inode_set_atime_to_ts(inode, tv);
ip->i_extsize = 0;
ip->i_diflags = 0;
@@ -918,6 +918,13 @@ xfs_droplink(
xfs_trans_t *tp,
xfs_inode_t *ip)
{
+ if (VFS_I(ip)->i_nlink == 0) {
+ xfs_alert(ip->i_mount,
+ "%s: Attempt to drop inode (%llu) with nlink zero.",
+ __func__, ip->i_ino);
+ return -EFSCORRUPTED;
+ }
+
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
drop_nlink(VFS_I(ip));
@@ -3621,6 +3628,23 @@ xfs_iunlock2_io_mmap(
inode_unlock(VFS_I(ip1));
}
+/* Drop the MMAPLOCK and the IOLOCK after a remap completes. */
+void
+xfs_iunlock2_remapping(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ xfs_iflags_clear(ip1, XFS_IREMAPPING);
+
+ if (ip1 != ip2)
+ xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED);
+ xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+
+ if (ip1 != ip2)
+ inode_unlock_shared(VFS_I(ip1));
+ inode_unlock(VFS_I(ip2));
+}
+
/*
* Reload the incore inode list for this inode. Caller should ensure that
* the link count cannot change, either by taking ILOCK_SHARED or otherwise
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0c5bdb91152e..3beb470f1892 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -347,6 +347,14 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
/* Quotacheck is running but inode has not been added to quota counts. */
#define XFS_IQUOTAUNCHECKED (1 << 14)
+/*
+ * Remap in progress. Callers that wish to update file data while
+ * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake
+ * the lock in exclusive mode. Relocking the file will block until
+ * IREMAPPING is cleared.
+ */
+#define XFS_IREMAPPING (1U << 15)
+
/* All inode state flags related to inode reclaim. */
#define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \
XFS_IRECLAIM | \
@@ -561,6 +569,14 @@ extern void xfs_setup_inode(struct xfs_inode *ip);
extern void xfs_setup_iops(struct xfs_inode *ip);
extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init);
+static inline void xfs_update_stable_writes(struct xfs_inode *ip)
+{
+ if (bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev))
+ mapping_set_stable_writes(VFS_I(ip)->i_mapping);
+ else
+ mapping_clear_stable_writes(VFS_I(ip)->i_mapping);
+}
+
/*
* When setting up a newly allocated inode, we need to call
* xfs_finish_inode_setup() once the inode is fully instantiated at
@@ -595,6 +611,7 @@ void xfs_end_io(struct work_struct *work);
int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2);
static inline bool
xfs_inode_unlinked_incomplete(
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 127b2410eb20..cd7803fda8b1 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -19,6 +19,7 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_error.h"
+#include "xfs_rtbitmap.h"
#include <linux/iversion.h>
@@ -107,7 +108,7 @@ xfs_inode_item_precommit(
*/
if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
(ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
- (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) {
+ xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) {
ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
XFS_DIFLAG_EXTSZINHERIT);
ip->i_extsize = 0;
@@ -526,8 +527,8 @@ xfs_inode_to_log_dinode(
to->di_projid_hi = ip->i_projid >> 16;
memset(to->di_pad3, 0, sizeof(to->di_pad3));
- to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
- to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
+ to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode_get_atime(inode));
+ to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode_get_mtime(inode));
to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode_get_ctime(inode));
to->di_nlink = inode->i_nlink;
to->di_gen = inode->i_generation;
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 0e5dba2343ea..144198a6b270 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -286,6 +286,7 @@ xlog_recover_inode_commit_pass2(
struct xfs_log_dinode *ldip;
uint isize;
int need_free = 0;
+ xfs_failaddr_t fa;
if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
in_f = item->ri_buf[0].i_addr;
@@ -369,24 +370,26 @@ xlog_recover_inode_commit_pass2(
* superblock flag to determine whether we need to look at di_flushiter
* to skip replay when the on disk inode is newer than the log one
*/
- if (!xfs_has_v3inodes(mp) &&
- ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
- /*
- * Deal with the wrap case, DI_MAX_FLUSH is less
- * than smaller numbers
- */
- if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
- ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
- /* do nothing */
- } else {
- trace_xfs_log_recover_inode_skip(log, in_f);
- error = 0;
- goto out_release;
+ if (!xfs_has_v3inodes(mp)) {
+ if (ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+ /*
+ * Deal with the wrap case, DI_MAX_FLUSH is less
+ * than smaller numbers
+ */
+ if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
+ ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
+ /* do nothing */
+ } else {
+ trace_xfs_log_recover_inode_skip(log, in_f);
+ error = 0;
+ goto out_release;
+ }
}
+
+ /* Take the opportunity to reset the flush iteration count */
+ ldip->di_flushiter = 0;
}
- /* Take the opportunity to reset the flush iteration count */
- ldip->di_flushiter = 0;
if (unlikely(S_ISREG(ldip->di_mode))) {
if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
@@ -528,8 +531,19 @@ out_owner_change:
(dip->di_mode != 0))
error = xfs_recover_inode_owner_change(mp, dip, in_f,
buffer_list);
- /* re-generate the checksum. */
+ /* re-generate the checksum and validate the recovered inode. */
xfs_dinode_calc_crc(log->l_mp, dip);
+ fa = xfs_dinode_verify(log->l_mp, in_f->ilf_ino, dip);
+ if (fa) {
+ XFS_CORRUPTION_ERROR(
+ "Bad dinode after recovery",
+ XFS_ERRLEVEL_LOW, mp, dip, sizeof(*dip));
+ xfs_alert(mp,
+ "Metadata corruption detected at %pS, inode 0x%llx",
+ fa, in_f->ilf_ino);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
ASSERT(bp->b_mount == mp);
bp->b_flags |= _XBF_LOGRECOVERY;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 55bb01173cde..6c3919687ea6 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -38,6 +38,7 @@
#include "xfs_reflink.h"
#include "xfs_ioctl.h"
#include "xfs_xattr.h"
+#include "xfs_rtbitmap.h"
#include <linux/mount.h>
#include <linux/namei.h>
@@ -1004,7 +1005,7 @@ xfs_fill_fsxattr(
* later.
*/
if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
- ip->i_extsize % mp->m_sb.sb_rextsize > 0) {
+ xfs_extlen_to_rtxmod(mp, ip->i_extsize) > 0) {
fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE |
FS_XFLAG_EXTSZINHERIT);
fa->fsx_extsize = 0;
@@ -1120,23 +1121,25 @@ xfs_ioctl_setattr_xflags(
struct fileattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
+ bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME);
uint64_t i_flags2;
- /* Can't change realtime flag if any extents are allocated. */
- if ((ip->i_df.if_nextents || ip->i_delayed_blks) &&
- XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME))
- return -EINVAL;
+ if (rtflag != XFS_IS_REALTIME_INODE(ip)) {
+ /* Can't change realtime flag if any extents are allocated. */
+ if (ip->i_df.if_nextents || ip->i_delayed_blks)
+ return -EINVAL;
+ }
- /* If realtime flag is set then must have realtime device */
- if (fa->fsx_xflags & FS_XFLAG_REALTIME) {
+ if (rtflag) {
+ /* If realtime flag is set then must have realtime device */
if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 ||
- (ip->i_extsize % mp->m_sb.sb_rextsize))
+ xfs_extlen_to_rtxmod(mp, ip->i_extsize))
return -EINVAL;
- }
- /* Clear reflink if we are actually able to set the rt flag. */
- if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip))
- ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+ /* Clear reflink if we are actually able to set the rt flag. */
+ if (xfs_is_reflink_inode(ip))
+ ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+ }
/* diflags2 only valid for v3 inodes. */
i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
@@ -1147,6 +1150,14 @@ xfs_ioctl_setattr_xflags(
ip->i_diflags2 = i_flags2;
xfs_diflags_to_iflags(ip, false);
+
+ /*
+ * Make the stable writes flag match that of the device the inode
+ * resides on when flipping the RT flag.
+ */
+ if (rtflag != XFS_IS_REALTIME_INODE(ip) && S_ISREG(VFS_I(ip)->i_mode))
+ xfs_update_stable_writes(ip);
+
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
XFS_STATS_INC(mp, xs_ig_attrchg);
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index c14852362fce..052d0e888c27 100644
--- a/fs/xfs/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
@@ -22,7 +22,7 @@
/*
* On intel, even if sizes match, alignment and/or padding may differ.
*/
-#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
+#if defined(CONFIG_X86_64)
#define BROKEN_X86_ALIGNMENT
#define __compat_packed __attribute__((packed))
#else
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 2b3b05c28e9e..a0d77f5f512e 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -572,8 +572,8 @@ xfs_vn_getattr(
stat->uid = vfsuid_into_kuid(vfsuid);
stat->gid = vfsgid_into_kgid(vfsgid);
stat->ino = ip->i_ino;
- stat->atime = inode->i_atime;
- stat->mtime = inode->i_mtime;
+ stat->atime = inode_get_atime(inode);
+ stat->mtime = inode_get_mtime(inode);
stat->ctime = inode_get_ctime(inode);
stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks);
@@ -1067,9 +1067,9 @@ xfs_vn_update_time(
now = current_time(inode);
if (flags & S_MTIME)
- inode->i_mtime = now;
+ inode_set_mtime_to_ts(inode, now);
if (flags & S_ATIME)
- inode->i_atime = now;
+ inode_set_atime_to_ts(inode, now);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, log_flags);
@@ -1299,6 +1299,13 @@ xfs_setup_inode(
mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS)));
/*
+ * For real-time inodes update the stable write flags to that of the RT
+ * device instead of the data device.
+ */
+ if (S_ISREG(inode->i_mode) && XFS_IS_REALTIME_INODE(ip))
+ xfs_update_stable_writes(ip);
+
+ /*
* If there is no attribute fork no ACL can exist on this inode,
* and it can't have any file capabilities attached to it either.
*/
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f5377ba5967a..14462614fcc8 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -107,12 +107,12 @@ xfs_bulkstat_one_int(
buf->bs_size = ip->i_disk_size;
buf->bs_nlink = inode->i_nlink;
- buf->bs_atime = inode->i_atime.tv_sec;
- buf->bs_atime_nsec = inode->i_atime.tv_nsec;
- buf->bs_mtime = inode->i_mtime.tv_sec;
- buf->bs_mtime_nsec = inode->i_mtime.tv_nsec;
- buf->bs_ctime = inode_get_ctime(inode).tv_sec;
- buf->bs_ctime_nsec = inode_get_ctime(inode).tv_nsec;
+ buf->bs_atime = inode_get_atime_sec(inode);
+ buf->bs_atime_nsec = inode_get_atime_nsec(inode);
+ buf->bs_mtime = inode_get_mtime_sec(inode);
+ buf->bs_mtime_nsec = inode_get_mtime_nsec(inode);
+ buf->bs_ctime = inode_get_ctime_sec(inode);
+ buf->bs_ctime_nsec = inode_get_ctime_nsec(inode);
buf->bs_gen = inode->i_generation;
buf->bs_mode = inode->i_mode;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index e9d317a3dafe..d7873e0360f0 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -198,6 +198,18 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y)
return x;
}
+/* If @b is a power of 2, return log2(b). Else return -1. */
+static inline int8_t log2_if_power2(unsigned long b)
+{
+ return is_power_of_2(b) ? ilog2(b) : -1;
+}
+
+/* If @b is a power of 2, return a mask of the lower bits, else return zero. */
+static inline unsigned long long mask64_if_power2(unsigned long b)
+{
+ return is_power_of_2(b) ? b - 1 : 0;
+}
+
int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
char *data, enum req_op op);
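
The two helpers added above back the m_rtxblklog/m_rtxblkmask mount fields introduced later in this series. As a quick check of their contract, here is a standalone userspace restatement (is_power_of_2() and the log2 computation are reimplemented locally; this is an illustration, not the kernel build):

#include <stdint.h>
#include <stdio.h>

static int is_power_of_2(unsigned long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

static int8_t log2_if_power2(unsigned long b)
{
	/* bits-1 minus leading zeros gives floor(log2) for nonzero b */
	return is_power_of_2(b) ?
		(int8_t)(sizeof(unsigned long) * 8 - 1 - __builtin_clzl(b)) : -1;
}

static unsigned long long mask64_if_power2(unsigned long b)
{
	return is_power_of_2(b) ? b - 1 : 0;
}

int main(void)
{
	/* rextsize = 8 blocks: shift by 3, mask 0x7 */
	printf("%d 0x%llx\n", log2_if_power2(8), mask64_if_power2(8));
	/* rextsize = 6 blocks: not a power of two, callers fall back to division */
	printf("%d 0x%llx\n", log2_if_power2(6), mask64_if_power2(6));
	return 0;
}
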
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 51c100c86177..ee206facf0dc 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1893,9 +1893,7 @@ xlog_write_iclog(
* the buffer manually, the code needs to be kept in sync
* with the I/O completion path.
*/
- xlog_state_done_syncing(iclog);
- up(&iclog->ic_sema);
- return;
+ goto sync;
}
/*
@@ -1925,20 +1923,17 @@ xlog_write_iclog(
* avoid shutdown re-entering this path and erroring out again.
*/
if (log->l_targ != log->l_mp->m_ddev_targp &&
- blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) {
- xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
- return;
- }
+ blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev))
+ goto shutdown;
}
if (iclog->ic_flags & XLOG_ICL_NEED_FUA)
iclog->ic_bio.bi_opf |= REQ_FUA;
iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
- if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
- xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
- return;
- }
+ if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count))
+ goto shutdown;
+
if (is_vmalloc_addr(iclog->ic_data))
flush_kernel_vmap_range(iclog->ic_data, count);
@@ -1959,6 +1954,12 @@ xlog_write_iclog(
}
submit_bio(&iclog->ic_bio);
+ return;
+shutdown:
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+sync:
+ xlog_state_done_syncing(iclog);
+ up(&iclog->ic_sema);
}
/*
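
The xlog_write_iclog() hunks above fold three open-coded early returns into shared shutdown/sync labels, so every path that never submits the bio still shuts down the log where appropriate and completes the iclog state machine. The control-flow shape, reduced to a small standalone example with generic names (not XFS code):

#include <stdio.h>

static int already_shut_down;
static int map_fails;

static void shutdown_log(void) { puts("shutdown"); }
static void done_syncing(void) { puts("done_syncing"); }
static void release_sema(void) { puts("up(sema)"); }

static void write_log_buffer(void)
{
	if (already_shut_down)
		goto sync;	/* no I/O possible, just finish the iclog */
	if (map_fails)
		goto shutdown;	/* failure before the bio is submitted */

	puts("submit_bio");	/* success: completion handles the rest */
	return;

shutdown:
	shutdown_log();		/* common error handling ... */
sync:
	done_syncing();		/* ... plus the completion-side cleanup */
	release_sema();		/* that every non-submitted path must do */
}

int main(void)
{
	write_log_buffer();		/* submit_bio */
	map_fails = 1;
	write_log_buffer();		/* shutdown + sync path */
	already_shut_down = 1;
	write_log_buffer();		/* sync-only path */
	return 0;
}
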
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 13b94d2e605b..a1e18b24971a 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2511,7 +2511,7 @@ xlog_abort_defer_ops(
list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
list_del_init(&dfc->dfc_list);
- xfs_defer_ops_capture_free(mp, dfc);
+ xfs_defer_ops_capture_abort(mp, dfc);
}
}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 0a0fd19573d8..aed5be5508fe 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1021,7 +1021,7 @@ xfs_mountfs(
out_log_dealloc:
xfs_log_mount_cancel(mp);
out_inodegc_shrinker:
- unregister_shrinker(&mp->m_inodegc_shrinker);
+ shrinker_free(mp->m_inodegc_shrinker);
out_fail_wait:
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
xfs_buftarg_drain(mp->m_logdev_targp);
@@ -1104,7 +1104,7 @@ xfs_unmountfs(
#if defined(DEBUG)
xfs_errortag_clearall(mp);
#endif
- unregister_shrinker(&mp->m_inodegc_shrinker);
+ shrinker_free(mp->m_inodegc_shrinker);
xfs_free_perag(mp);
xfs_errortag_del(mp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index d19cca099bc3..503fe3c7edbf 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -101,9 +101,9 @@ typedef struct xfs_mount {
/*
* Optional cache of rt summary level per bitmap block with the
- * invariant that m_rsum_cache[bbno] <= the minimum i for which
- * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip
- * inode lock.
+ * invariant that m_rsum_cache[bbno] > the maximum i for which
+ * rsum[i][bbno] != 0, or 0 if rsum[i][bbno] == 0 for all i.
+ * Reads and writes are serialized by the rsumip inode lock.
*/
uint8_t *m_rsum_cache;
struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
@@ -119,6 +119,7 @@ typedef struct xfs_mount {
uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
uint8_t m_agno_log; /* log #ag's */
uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
+ int8_t m_rtxblklog; /* log2 of rextsize, if possible */
uint m_blockmask; /* sb_blocksize-1 */
uint m_blockwsize; /* sb_blocksize in words */
uint m_blockwmask; /* blockwsize-1 */
@@ -152,6 +153,7 @@ typedef struct xfs_mount {
uint64_t m_features; /* active filesystem features */
uint64_t m_low_space[XFS_LOWSP_MAX];
uint64_t m_low_rtexts[XFS_LOWSP_MAX];
+ uint64_t m_rtxblkmask; /* rt extent block mask */
struct xfs_ino_geometry m_ino_geo; /* inode geometry */
struct xfs_trans_resv m_resv; /* precomputed res values */
/* low free space thresholds */
@@ -219,7 +221,7 @@ typedef struct xfs_mount {
atomic_t m_agirotor; /* last ag dir inode alloced */
/* Memory shrinker to throttle and reprioritize inodegc */
- struct shrinker m_inodegc_shrinker;
+ struct shrinker *m_inodegc_shrinker;
/*
* Workqueue item so that we can coalesce multiple inode flush attempts
* into a single flush.
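
The new m_rtxblklog/m_rtxblkmask fields cache log2(sb_rextsize) and the matching low-bit mask so rt block/extent conversions can use shifts and masks when the extent size is a power of two, falling back to division otherwise. A sketch of how such cached values might be consulted; the helper names below are hypothetical stand-ins, the real conversion helpers live elsewhere in the series:

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for the mount fields added above. */
struct rt_geom {
	uint32_t	rextsize;	/* rt extent size in fs blocks */
	int8_t		rtxblklog;	/* log2(rextsize), or -1 */
	uint64_t	rtxblkmask;	/* rextsize - 1, or 0 */
};

/* Hypothetical block -> rt extent conversion using the cached log. */
static uint64_t rtb_to_rtx(const struct rt_geom *g, uint64_t rtbno)
{
	if (g->rtxblklog >= 0)
		return rtbno >> g->rtxblklog;	/* power-of-two fast path */
	return rtbno / g->rextsize;		/* generic fallback */
}

/* Hypothetical offset-within-extent helper using the cached mask. */
static uint64_t rtb_to_rtxoff(const struct rt_geom *g, uint64_t rtbno)
{
	if (g->rtxblklog >= 0)
		return rtbno & g->rtxblkmask;
	return rtbno % g->rextsize;
}

int main(void)
{
	struct rt_geom pow2 = { .rextsize = 8, .rtxblklog = 3, .rtxblkmask = 7 };
	struct rt_geom odd  = { .rextsize = 6, .rtxblklog = -1, .rtxblkmask = 0 };

	printf("%llu %llu\n",
	       (unsigned long long)rtb_to_rtx(&pow2, 29),
	       (unsigned long long)rtb_to_rtxoff(&pow2, 29));	/* 3 5 */
	printf("%llu %llu\n",
	       (unsigned long long)rtb_to_rtx(&odd, 29),
	       (unsigned long long)rtb_to_rtxoff(&odd, 29));	/* 4 5 */
	return 0;
}
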
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index c4cc99b70dd3..21a7e350b4c5 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -72,6 +72,10 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_map_t, 4);
XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_local_t, 4);
+ /* realtime structures */
+ XFS_CHECK_STRUCT_SIZE(union xfs_rtword_raw, 4);
+ XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw, 4);
+
/*
* m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
* 4 bytes anyway so it's not obviously a problem. Hence for the moment
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 086e78a6143a..94a7932ac570 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -504,8 +504,7 @@ xfs_qm_shrink_scan(
struct shrinker *shrink,
struct shrink_control *sc)
{
- struct xfs_quotainfo *qi = container_of(shrink,
- struct xfs_quotainfo, qi_shrinker);
+ struct xfs_quotainfo *qi = shrink->private_data;
struct xfs_qm_isolate isol;
unsigned long freed;
int error;
@@ -539,8 +538,7 @@ xfs_qm_shrink_count(
struct shrinker *shrink,
struct shrink_control *sc)
{
- struct xfs_quotainfo *qi = container_of(shrink,
- struct xfs_quotainfo, qi_shrinker);
+ struct xfs_quotainfo *qi = shrink->private_data;
return list_lru_shrink_count(&qi->qi_lru, sc);
}
@@ -680,15 +678,18 @@ xfs_qm_init_quotainfo(
if (XFS_IS_PQUOTA_ON(mp))
xfs_qm_set_defquota(mp, XFS_DQTYPE_PROJ, qinf);
- qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
- qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
- qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
- qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
-
- error = register_shrinker(&qinf->qi_shrinker, "xfs-qm:%s",
- mp->m_super->s_id);
- if (error)
+ qinf->qi_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-qm:%s",
+ mp->m_super->s_id);
+ if (!qinf->qi_shrinker) {
+ error = -ENOMEM;
goto out_free_inos;
+ }
+
+ qinf->qi_shrinker->count_objects = xfs_qm_shrink_count;
+ qinf->qi_shrinker->scan_objects = xfs_qm_shrink_scan;
+ qinf->qi_shrinker->private_data = qinf;
+
+ shrinker_register(qinf->qi_shrinker);
return 0;
@@ -718,7 +719,7 @@ xfs_qm_destroy_quotainfo(
qi = mp->m_quotainfo;
ASSERT(qi != NULL);
- unregister_shrinker(&qi->qi_shrinker);
+ shrinker_free(qi->qi_shrinker);
list_lru_destroy(&qi->qi_lru);
xfs_qm_destroy_quotainos(qi);
mutex_destroy(&qi->qi_tree_lock);
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 9683f0457d19..d5c9fc4ba591 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -63,7 +63,7 @@ struct xfs_quotainfo {
struct xfs_def_quota qi_usr_default;
struct xfs_def_quota qi_grp_default;
struct xfs_def_quota qi_prj_default;
- struct shrinker qi_shrinker;
+ struct shrinker *qi_shrinker;
/* Minimum and maximum quota expiration timestamp values. */
time64_t qi_expiry_min;
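
The shrinker hunks in this section (xfs_mount, xfs_qm) all follow the same conversion: the shrinker is no longer embedded in its owning structure but allocated, filled in, registered, and eventually freed, with the owner reached through ->private_data instead of container_of(). A condensed kernel-style sketch of that pattern using only the calls visible in the hunks; "foo" and its reclaim helpers are placeholders, not real code:

#include <linux/shrinker.h>

struct foo {
	struct shrinker	*shrinker;
	/* ... cached objects, LRU, locks ... */
};

/* Placeholder cache accounting, standing in for a real LRU. */
static unsigned long foo_nr_cached(struct foo *foo) { return 0; }
static unsigned long foo_reclaim(struct foo *foo, unsigned long nr) { return 0; }

static unsigned long foo_shrink_count(struct shrinker *shrink,
				      struct shrink_control *sc)
{
	struct foo *foo = shrink->private_data;	/* was container_of() */

	return foo_nr_cached(foo);
}

static unsigned long foo_shrink_scan(struct shrinker *shrink,
				     struct shrink_control *sc)
{
	struct foo *foo = shrink->private_data;

	return foo_reclaim(foo, sc->nr_to_scan);
}

static int foo_init(struct foo *foo)
{
	foo->shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "foo:%s", "demo");
	if (!foo->shrinker)
		return -ENOMEM;

	foo->shrinker->count_objects = foo_shrink_count;
	foo->shrinker->scan_objects = foo_shrink_scan;
	foo->shrinker->private_data = foo;
	shrinker_register(foo->shrinker);
	return 0;
}

static void foo_exit(struct foo *foo)
{
	shrinker_free(foo->shrinker);	/* replaces unregister_shrinker() */
}
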
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index eb9102453aff..e5b62dc28466 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -784,6 +784,7 @@ xfs_reflink_end_cow_extent(
}
}
del = got;
+ xfs_trim_extent(&del, *offset_fsb, end_fsb - *offset_fsb);
/* Grab the corresponding mapping in the data fork. */
nmaps = 1;
@@ -1540,6 +1541,10 @@ xfs_reflink_remap_prep(
if (ret)
goto out_unlock;
+ xfs_iflags_set(src, XFS_IREMAPPING);
+ if (inode_in != inode_out)
+ xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
+
return 0;
out_unlock:
xfs_iunlock2_io_mmap(src, dest);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 16534e9873f6..88c48de5c9c8 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -19,6 +19,7 @@
#include "xfs_icache.h"
#include "xfs_rtalloc.h"
#include "xfs_sb.h"
+#include "xfs_rtbitmap.h"
/*
* Read and return the summary information for a given extent size,
@@ -28,48 +29,48 @@
*/
static int
xfs_rtget_summary(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- int log, /* log2 of extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_suminfo_t *sum) /* out: summary info for this block */
+ struct xfs_rtalloc_args *args,
+ int log, /* log2 of extent size */
+ xfs_fileoff_t bbno, /* bitmap block number */
+ xfs_suminfo_t *sum) /* out: summary info for this block */
{
- return xfs_rtmodify_summary_int(mp, tp, log, bbno, 0, rbpp, rsb, sum);
+ return xfs_rtmodify_summary_int(args, log, bbno, 0, sum);
}
/*
* Return whether there are any free extents in the size range given
* by low and high, for the bitmap block bbno.
*/
-STATIC int /* error */
+STATIC int
xfs_rtany_summary(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- int low, /* low log2 extent size */
- int high, /* high log2 extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- int *stat) /* out: any good extents here? */
+ struct xfs_rtalloc_args *args,
+ int low, /* low log2 extent size */
+ int high, /* high log2 extent size */
+ xfs_fileoff_t bbno, /* bitmap block number */
+ int *maxlog) /* out: max log2 extent size free */
{
- int error; /* error value */
- int log; /* loop counter, log2 of ext. size */
- xfs_suminfo_t sum; /* summary data */
-
- /* There are no extents at levels < m_rsum_cache[bbno]. */
- if (mp->m_rsum_cache && low < mp->m_rsum_cache[bbno])
- low = mp->m_rsum_cache[bbno];
+ struct xfs_mount *mp = args->mp;
+ int error;
+ int log; /* loop counter, log2 of ext. size */
+ xfs_suminfo_t sum; /* summary data */
+
+ /* There are no extents at levels >= m_rsum_cache[bbno]. */
+ if (mp->m_rsum_cache) {
+ high = min(high, mp->m_rsum_cache[bbno] - 1);
+ if (low > high) {
+ *maxlog = -1;
+ return 0;
+ }
+ }
/*
* Loop over logs of extent sizes.
*/
- for (log = low; log <= high; log++) {
+ for (log = high; log >= low; log--) {
/*
* Get one summary datum.
*/
- error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum);
+ error = xfs_rtget_summary(args, log, bbno, &sum);
if (error) {
return error;
}
@@ -77,18 +78,18 @@ xfs_rtany_summary(
* If there are any, return success.
*/
if (sum) {
- *stat = 1;
+ *maxlog = log;
goto out;
}
}
/*
* Found nothing, return failure.
*/
- *stat = 0;
+ *maxlog = -1;
out:
- /* There were no extents at levels < log. */
- if (mp->m_rsum_cache && log > mp->m_rsum_cache[bbno])
- mp->m_rsum_cache[bbno] = log;
+ /* There were no extents at levels > log. */
+ if (mp->m_rsum_cache && log + 1 < mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno] = log + 1;
return 0;
}
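
The hunk above inverts the m_rsum_cache invariant: instead of a lower bound on the smallest summary level with free extents (scanned upwards), the cache now holds one more than the largest level with any free extents, so xfs_rtany_summary() can scan downwards and report the largest usable log2 size in *maxlog. A toy userspace illustration of how the cached upper bound prunes the scan (made-up numbers, not the on-disk format):

#include <stdio.h>

#define NLEVELS 8

/* Toy per-level summary counts for one bitmap block: free extents exist
 * only at levels 2 and 4, so the cached value for this block would be 5,
 * one more than the highest non-zero level. */
static int sum[NLEVELS] = { 0, 0, 3, 0, 1, 0, 0, 0 };
static int rsum_cache = 5;

/* Return the largest level in [low, high] with any free extents, or -1. */
static int any_summary(int low, int high, int *probes)
{
	int log;

	if (high > rsum_cache - 1)
		high = rsum_cache - 1;	/* nothing free at levels >= cache */
	for (log = high; log >= low; log--) {
		(*probes)++;
		if (sum[log])
			return log;
	}
	return -1;
}

int main(void)
{
	int probes = 0;
	int maxlog = any_summary(1, NLEVELS - 1, &probes);

	/* Without the cache this would probe levels 7..4 (four reads);
	 * with it, the scan starts at level 4 and stops immediately. */
	printf("maxlog=%d probes=%d\n", maxlog, probes);	/* maxlog=4 probes=1 */
	return 0;
}
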
@@ -97,60 +98,54 @@ out:
* Copy and transform the summary file, given the old and new
* parameters in the mount structures.
*/
-STATIC int /* error */
+STATIC int
xfs_rtcopy_summary(
- xfs_mount_t *omp, /* old file system mount point */
- xfs_mount_t *nmp, /* new file system mount point */
- xfs_trans_t *tp) /* transaction pointer */
+ struct xfs_rtalloc_args *oargs,
+ struct xfs_rtalloc_args *nargs)
{
- xfs_rtblock_t bbno; /* bitmap block number */
- struct xfs_buf *bp; /* summary buffer */
- int error; /* error return value */
- int log; /* summary level number (log length) */
- xfs_suminfo_t sum; /* summary data */
- xfs_fsblock_t sumbno; /* summary block number */
+ xfs_fileoff_t bbno; /* bitmap block number */
+ int error;
+ int log; /* summary level number (log length) */
+ xfs_suminfo_t sum; /* summary data */
- bp = NULL;
- for (log = omp->m_rsumlevels - 1; log >= 0; log--) {
- for (bbno = omp->m_sb.sb_rbmblocks - 1;
+ for (log = oargs->mp->m_rsumlevels - 1; log >= 0; log--) {
+ for (bbno = oargs->mp->m_sb.sb_rbmblocks - 1;
(xfs_srtblock_t)bbno >= 0;
bbno--) {
- error = xfs_rtget_summary(omp, tp, log, bbno, &bp,
- &sumbno, &sum);
+ error = xfs_rtget_summary(oargs, log, bbno, &sum);
if (error)
- return error;
+ goto out;
if (sum == 0)
continue;
- error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum,
- &bp, &sumbno);
+ error = xfs_rtmodify_summary(oargs, log, bbno, -sum);
if (error)
- return error;
- error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum,
- &bp, &sumbno);
+ goto out;
+ error = xfs_rtmodify_summary(nargs, log, bbno, sum);
if (error)
- return error;
+ goto out;
ASSERT(sum > 0);
}
}
+ error = 0;
+out:
+ xfs_rtbuf_cache_relse(oargs);
-	return 0;
+	return error;
}
/*
* Mark an extent specified by start and len allocated.
* Updates all the summary information as well as the bitmap.
*/
-STATIC int /* error */
+STATIC int
xfs_rtallocate_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* start block to allocate */
- xfs_extlen_t len, /* length to allocate */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb) /* in/out: summary block number */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* start rtext to allocate */
+	xfs_rtxlen_t		len)	/* length to allocate */
{
- xfs_rtblock_t end; /* end of the allocated extent */
- int error; /* error value */
- xfs_rtblock_t postblock = 0; /* first block allocated > end */
- xfs_rtblock_t preblock = 0; /* first block allocated < start */
+ struct xfs_mount *mp = args->mp;
+ xfs_rtxnum_t end; /* end of the allocated rtext */
+ int error;
+ xfs_rtxnum_t postblock = 0; /* first rtext allocated > end */
+ xfs_rtxnum_t preblock = 0; /* first rtext allocated < start */
end = start + len - 1;
/*
@@ -158,15 +153,15 @@ xfs_rtallocate_range(
* We need to find the beginning and end of the extent so we can
* properly update the summary.
*/
- error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+ error = xfs_rtfind_back(args, start, 0, &preblock);
if (error) {
return error;
}
/*
* Find the next allocated block (end of free extent).
*/
- error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
- &postblock);
+ error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1,
+ &postblock);
if (error) {
return error;
}
@@ -174,9 +169,9 @@ xfs_rtallocate_range(
* Decrement the summary information corresponding to the entire
* (old) free extent.
*/
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock + 1 - preblock),
- XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+ error = xfs_rtmodify_summary(args,
+ XFS_RTBLOCKLOG(postblock + 1 - preblock),
+ xfs_rtx_to_rbmblock(mp, preblock), -1);
if (error) {
return error;
}
@@ -185,9 +180,9 @@ xfs_rtallocate_range(
* old extent, add summary data for them to be free.
*/
if (preblock < start) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(start - preblock),
- XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+ error = xfs_rtmodify_summary(args,
+ XFS_RTBLOCKLOG(start - preblock),
+ xfs_rtx_to_rbmblock(mp, preblock), 1);
if (error) {
return error;
}
@@ -197,9 +192,9 @@ xfs_rtallocate_range(
* old extent, add summary data for them to be free.
*/
if (postblock > end) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock - end),
- XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb);
+ error = xfs_rtmodify_summary(args,
+ XFS_RTBLOCKLOG(postblock - end),
+ xfs_rtx_to_rbmblock(mp, end + 1), 1);
if (error) {
return error;
}
@@ -207,54 +202,69 @@ xfs_rtallocate_range(
/*
* Modify the bitmap to mark this extent allocated.
*/
- error = xfs_rtmodify_range(mp, tp, start, len, 0);
+ error = xfs_rtmodify_range(args, start, len, 0);
return error;
}
/*
+ * Make sure we don't run off the end of the rt volume. Be careful that
+ * adjusting maxlen downwards doesn't cause us to fail the alignment checks.
+ */
+static inline xfs_rtxlen_t
+xfs_rtallocate_clamp_len(
+ struct xfs_mount *mp,
+ xfs_rtxnum_t startrtx,
+ xfs_rtxlen_t rtxlen,
+ xfs_rtxlen_t prod)
+{
+ xfs_rtxlen_t ret;
+
+ ret = min(mp->m_sb.sb_rextents, startrtx + rtxlen) - startrtx;
+ return rounddown(ret, prod);
+}
+
+/*
* Attempt to allocate an extent minlen<=len<=maxlen starting from
* bitmap block bbno. If we don't get maxlen then use prod to trim
- * the length, if given. Returns error; returns starting block in *rtblock.
+ * the length, if given. Returns error; returns starting block in *rtx.
* The lengths are all in rtextents.
*/
-STATIC int /* error */
+STATIC int
xfs_rtallocate_extent_block(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bbno, /* bitmap block number */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- xfs_rtblock_t *nextp, /* out: next block to try */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock) /* out: start block allocated */
+ struct xfs_rtalloc_args *args,
+ xfs_fileoff_t bbno, /* bitmap block number */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ xfs_rtxnum_t *nextp, /* out: next rtext to try */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
- xfs_rtblock_t besti; /* best rtblock found so far */
- xfs_rtblock_t bestlen; /* best length found so far */
- xfs_rtblock_t end; /* last rtblock in chunk */
- int error; /* error value */
- xfs_rtblock_t i; /* current rtblock trying */
- xfs_rtblock_t next; /* next rtblock to try */
- int stat; /* status from internal calls */
+ struct xfs_mount *mp = args->mp;
+ xfs_rtxnum_t besti; /* best rtext found so far */
+ xfs_rtxnum_t bestlen;/* best length found so far */
+ xfs_rtxnum_t end; /* last rtext in chunk */
+ int error;
+ xfs_rtxnum_t i; /* current rtext trying */
+ xfs_rtxnum_t next; /* next rtext to try */
+ int stat; /* status from internal calls */
/*
* Loop over all the extents starting in this bitmap block,
* looking for one that's long enough.
*/
- for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0,
- end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1;
+ for (i = xfs_rbmblock_to_rtx(mp, bbno), besti = -1, bestlen = 0,
+ end = xfs_rbmblock_to_rtx(mp, bbno + 1) - 1;
i <= end;
i++) {
/* Make sure we don't scan off the end of the rt volume. */
- maxlen = min(mp->m_sb.sb_rextents, i + maxlen) - i;
+ maxlen = xfs_rtallocate_clamp_len(mp, i, maxlen, prod);
/*
* See if there's a free extent of maxlen starting at i.
* If it's not so then next will contain the first non-free.
*/
- error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat);
+ error = xfs_rtcheck_range(args, i, maxlen, 1, &next, &stat);
if (error) {
return error;
}
@@ -262,13 +272,12 @@ xfs_rtallocate_extent_block(
/*
* i for maxlen is all free, allocate and return that.
*/
- error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp,
- rsb);
+ error = xfs_rtallocate_range(args, i, maxlen);
if (error) {
return error;
}
*len = maxlen;
- *rtblock = i;
+ *rtx = i;
return 0;
}
/*
@@ -278,7 +287,7 @@ xfs_rtallocate_extent_block(
* so far, remember it.
*/
if (minlen < maxlen) {
- xfs_rtblock_t thislen; /* this extent size */
+ xfs_rtxnum_t thislen; /* this extent size */
thislen = next - i;
if (thislen >= minlen && thislen > bestlen) {
@@ -290,7 +299,7 @@ xfs_rtallocate_extent_block(
* If not done yet, find the start of the next free space.
*/
if (next < end) {
- error = xfs_rtfind_forw(mp, tp, next, end, &i);
+ error = xfs_rtfind_forw(args, next, end, &i);
if (error) {
return error;
}
@@ -301,7 +310,7 @@ xfs_rtallocate_extent_block(
* Searched the whole thing & didn't find a maxlen free extent.
*/
if (minlen < maxlen && besti != -1) {
- xfs_extlen_t p; /* amount to trim length by */
+ xfs_rtxlen_t p; /* amount to trim length by */
/*
* If size should be a multiple of prod, make that so.
@@ -315,51 +324,49 @@ xfs_rtallocate_extent_block(
/*
* Allocate besti for bestlen & return that.
*/
- error = xfs_rtallocate_range(mp, tp, besti, bestlen, rbpp, rsb);
+ error = xfs_rtallocate_range(args, besti, bestlen);
if (error) {
return error;
}
*len = bestlen;
- *rtblock = besti;
+ *rtx = besti;
return 0;
}
/*
* Allocation failed. Set *nextp to the next block to try.
*/
*nextp = next;
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
/*
* Allocate an extent of length minlen<=len<=maxlen, starting at block
* bno. If we don't get maxlen then use prod to trim the length, if given.
- * Returns error; returns starting block in *rtblock.
+ * Returns error; returns starting block in *rtx.
* The lengths are all in rtextents.
*/
-STATIC int /* error */
+STATIC int
xfs_rtallocate_extent_exact(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to allocate */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock) /* out: start block allocated */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext number to allocate */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
- int error; /* error value */
- xfs_extlen_t i; /* extent length trimmed due to prod */
- int isfree; /* extent is free */
- xfs_rtblock_t next; /* next block to try (dummy) */
+ int error;
+ xfs_rtxlen_t i; /* extent length trimmed due to prod */
+ int isfree; /* extent is free */
+ xfs_rtxnum_t next; /* next rtext to try (dummy) */
- ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+ ASSERT(minlen % prod == 0);
+ ASSERT(maxlen % prod == 0);
/*
* Check if the range in question (for maxlen) is free.
*/
- error = xfs_rtcheck_range(mp, tp, bno, maxlen, 1, &next, &isfree);
+ error = xfs_rtcheck_range(args, start, maxlen, 1, &next, &isfree);
if (error) {
return error;
}
@@ -367,23 +374,23 @@ xfs_rtallocate_extent_exact(
/*
* If it is, allocate it and return success.
*/
- error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
+ error = xfs_rtallocate_range(args, start, maxlen);
if (error) {
return error;
}
*len = maxlen;
- *rtblock = bno;
+ *rtx = start;
return 0;
}
/*
* If not, allocate what there is, if it's at least minlen.
*/
- maxlen = next - bno;
+ maxlen = next - start;
if (maxlen < minlen) {
/*
* Failed, return failure status.
*/
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
/*
@@ -395,81 +402,82 @@ xfs_rtallocate_extent_exact(
/*
* Now we can't do it, return failure status.
*/
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
}
/*
* Allocate what we can and return it.
*/
- error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
+ error = xfs_rtallocate_range(args, start, maxlen);
if (error) {
return error;
}
*len = maxlen;
- *rtblock = bno;
+ *rtx = start;
return 0;
}
/*
* Allocate an extent of length minlen<=len<=maxlen, starting as near
- * to bno as possible. If we don't get maxlen then use prod to trim
+ * to start as possible. If we don't get maxlen then use prod to trim
* the length, if given. The lengths are all in rtextents.
*/
-STATIC int /* error */
+STATIC int
xfs_rtallocate_extent_near(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to allocate */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock) /* out: start block allocated */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start, /* starting rtext number to allocate */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
- int any; /* any useful extents from summary */
- xfs_rtblock_t bbno; /* bitmap block number */
- int error; /* error value */
- int i; /* bitmap block offset (loop control) */
- int j; /* secondary loop control */
- int log2len; /* log2 of minlen */
- xfs_rtblock_t n; /* next block to try */
- xfs_rtblock_t r; /* result block */
-
- ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+ struct xfs_mount *mp = args->mp;
+ int maxlog; /* max useful extent from summary */
+ xfs_fileoff_t bbno; /* bitmap block number */
+ int error;
+ int i; /* bitmap block offset (loop control) */
+ int j; /* secondary loop control */
+ int log2len; /* log2 of minlen */
+ xfs_rtxnum_t n; /* next rtext to try */
+ xfs_rtxnum_t r; /* result rtext */
+
+ ASSERT(minlen % prod == 0);
+ ASSERT(maxlen % prod == 0);
+
/*
* If the block number given is off the end, silently set it to
* the last block.
*/
- if (bno >= mp->m_sb.sb_rextents)
- bno = mp->m_sb.sb_rextents - 1;
+ if (start >= mp->m_sb.sb_rextents)
+ start = mp->m_sb.sb_rextents - 1;
/* Make sure we don't run off the end of the rt volume. */
- maxlen = min(mp->m_sb.sb_rextents, bno + maxlen) - bno;
+ maxlen = xfs_rtallocate_clamp_len(mp, start, maxlen, prod);
if (maxlen < minlen) {
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
/*
* Try the exact allocation first.
*/
- error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, len,
- rbpp, rsb, prod, &r);
+ error = xfs_rtallocate_extent_exact(args, start, minlen, maxlen, len,
+ prod, &r);
if (error) {
return error;
}
/*
* If the exact allocation worked, return that.
*/
- if (r != NULLRTBLOCK) {
- *rtblock = r;
+ if (r != NULLRTEXTNO) {
+ *rtx = r;
return 0;
}
- bbno = XFS_BITTOBLOCK(mp, bno);
+ bbno = xfs_rtx_to_rbmblock(mp, start);
i = 0;
+ j = -1;
ASSERT(minlen != 0);
log2len = xfs_highbit32(minlen);
/*
@@ -480,8 +488,8 @@ xfs_rtallocate_extent_near(
* Get summary information of extents of all useful levels
* starting in this bitmap block.
*/
- error = xfs_rtany_summary(mp, tp, log2len, mp->m_rsumlevels - 1,
- bbno + i, rbpp, rsb, &any);
+ error = xfs_rtany_summary(args, log2len, mp->m_rsumlevels - 1,
+ bbno + i, &maxlog);
if (error) {
return error;
}
@@ -489,7 +497,10 @@ xfs_rtallocate_extent_near(
* If there are any useful extents starting here, try
* allocating one.
*/
- if (any) {
+ if (maxlog >= 0) {
+ xfs_extlen_t maxavail =
+ min_t(xfs_rtblock_t, maxlen,
+ (1ULL << (maxlog + 1)) - 1);
/*
* On the positive side of the starting location.
*/
@@ -498,17 +509,17 @@ xfs_rtallocate_extent_near(
* Try to allocate an extent starting in
* this block.
*/
- error = xfs_rtallocate_extent_block(mp, tp,
- bbno + i, minlen, maxlen, len, &n, rbpp,
- rsb, prod, &r);
+ error = xfs_rtallocate_extent_block(args,
+ bbno + i, minlen, maxavail, len,
+ &n, prod, &r);
if (error) {
return error;
}
/*
* If it worked, return it.
*/
- if (r != NULLRTBLOCK) {
- *rtblock = r;
+ if (r != NULLRTEXTNO) {
+ *rtx = r;
return 0;
}
}
@@ -516,68 +527,46 @@ xfs_rtallocate_extent_near(
* On the negative side of the starting location.
*/
else { /* i < 0 */
+ int maxblocks;
+
/*
- * Loop backwards through the bitmap blocks from
- * the starting point-1 up to where we are now.
- * There should be an extent which ends in this
- * bitmap block and is long enough.
+ * Loop backwards to find the end of the extent
+ * we found in the realtime summary.
+ *
+ * maxblocks is the maximum possible number of
+ * bitmap blocks from the start of the extent
+ * to the end of the extent.
*/
- for (j = -1; j > i; j--) {
- /*
- * Grab the summary information for
- * this bitmap block.
- */
- error = xfs_rtany_summary(mp, tp,
- log2len, mp->m_rsumlevels - 1,
- bbno + j, rbpp, rsb, &any);
- if (error) {
- return error;
- }
- /*
- * If there's no extent given in the
- * summary that means the extent we
- * found must carry over from an
- * earlier block. If there is an
- * extent given, we've already tried
- * that allocation, don't do it again.
- */
- if (any)
- continue;
- error = xfs_rtallocate_extent_block(mp,
- tp, bbno + j, minlen, maxlen,
- len, &n, rbpp, rsb, prod, &r);
+ if (maxlog == 0)
+ maxblocks = 0;
+ else if (maxlog < mp->m_blkbit_log)
+ maxblocks = 1;
+ else
+ maxblocks = 2 << (maxlog - mp->m_blkbit_log);
+
+ /*
+ * We need to check bbno + i + maxblocks down to
+ * bbno + i. We already checked bbno down to
+ * bbno + j + 1, so we don't need to check those
+ * again.
+ */
+ j = min(i + maxblocks, j);
+ for (; j >= i; j--) {
+ error = xfs_rtallocate_extent_block(args,
+ bbno + j, minlen,
+ maxavail, len, &n, prod,
+ &r);
if (error) {
return error;
}
/*
* If it works, return the extent.
*/
- if (r != NULLRTBLOCK) {
- *rtblock = r;
+ if (r != NULLRTEXTNO) {
+ *rtx = r;
return 0;
}
}
- /*
- * There weren't intervening bitmap blocks
- * with a long enough extent, or the
- * allocation didn't work for some reason
- * (i.e. it's a little * too short).
- * Try to allocate from the summary block
- * that we found.
- */
- error = xfs_rtallocate_extent_block(mp, tp,
- bbno + i, minlen, maxlen, len, &n, rbpp,
- rsb, prod, &r);
- if (error) {
- return error;
- }
- /*
- * If it works, return the extent.
- */
- if (r != NULLRTBLOCK) {
- *rtblock = r;
- return 0;
- }
}
}
/*
@@ -610,7 +599,7 @@ xfs_rtallocate_extent_near(
else
break;
}
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
@@ -619,26 +608,25 @@ xfs_rtallocate_extent_near(
* specified. If we don't get maxlen then use prod to trim
* the length, if given. The lengths are all in rtextents.
*/
-STATIC int /* error */
+STATIC int
xfs_rtallocate_extent_size(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- struct xfs_buf **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock) /* out: start block allocated */
+ struct xfs_rtalloc_args *args,
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
- int error; /* error value */
- int i; /* bitmap block number */
- int l; /* level number (loop control) */
- xfs_rtblock_t n; /* next block to be tried */
- xfs_rtblock_t r; /* result block number */
- xfs_suminfo_t sum; /* summary information for extents */
-
- ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+ struct xfs_mount *mp = args->mp;
+ int error;
+ xfs_fileoff_t i; /* bitmap block number */
+ int l; /* level number (loop control) */
+ xfs_rtxnum_t n; /* next rtext to be tried */
+ xfs_rtxnum_t r; /* result rtext number */
+ xfs_suminfo_t sum; /* summary information for extents */
+
+ ASSERT(minlen % prod == 0);
+ ASSERT(maxlen % prod == 0);
ASSERT(maxlen != 0);
/*
@@ -656,8 +644,7 @@ xfs_rtallocate_extent_size(
/*
* Get the summary for this level/block.
*/
- error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
- &sum);
+ error = xfs_rtget_summary(args, l, i, &sum);
if (error) {
return error;
}
@@ -669,16 +656,16 @@ xfs_rtallocate_extent_size(
/*
* Try allocating the extent.
*/
- error = xfs_rtallocate_extent_block(mp, tp, i, maxlen,
- maxlen, len, &n, rbpp, rsb, prod, &r);
+ error = xfs_rtallocate_extent_block(args, i, maxlen,
+ maxlen, len, &n, prod, &r);
if (error) {
return error;
}
/*
* If it worked, return that.
*/
- if (r != NULLRTBLOCK) {
- *rtblock = r;
+ if (r != NULLRTEXTNO) {
+ *rtx = r;
return 0;
}
/*
@@ -686,8 +673,8 @@ xfs_rtallocate_extent_size(
* allocator is beyond the next bitmap block,
* skip to that bitmap block.
*/
- if (XFS_BITTOBLOCK(mp, n) > i + 1)
- i = XFS_BITTOBLOCK(mp, n) - 1;
+ if (xfs_rtx_to_rbmblock(mp, n) > i + 1)
+ i = xfs_rtx_to_rbmblock(mp, n) - 1;
}
}
/*
@@ -695,7 +682,7 @@ xfs_rtallocate_extent_size(
* we're asking for a fixed size extent.
*/
if (minlen > --maxlen) {
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
ASSERT(minlen != 0);
@@ -715,8 +702,7 @@ xfs_rtallocate_extent_size(
/*
* Get the summary information for this level/block.
*/
- error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
- &sum);
+ error = xfs_rtget_summary(args, l, i, &sum);
if (error) {
return error;
}
@@ -730,18 +716,18 @@ xfs_rtallocate_extent_size(
* minlen/maxlen are in the possible range for
* this summary level.
*/
- error = xfs_rtallocate_extent_block(mp, tp, i,
+ error = xfs_rtallocate_extent_block(args, i,
XFS_RTMAX(minlen, 1 << l),
XFS_RTMIN(maxlen, (1 << (l + 1)) - 1),
- len, &n, rbpp, rsb, prod, &r);
+ len, &n, prod, &r);
if (error) {
return error;
}
/*
* If it worked, return that extent.
*/
- if (r != NULLRTBLOCK) {
- *rtblock = r;
+ if (r != NULLRTEXTNO) {
+ *rtx = r;
return 0;
}
/*
@@ -749,14 +735,14 @@ xfs_rtallocate_extent_size(
* allocator is beyond the next bitmap block,
* skip to that bitmap block.
*/
- if (XFS_BITTOBLOCK(mp, n) > i + 1)
- i = XFS_BITTOBLOCK(mp, n) - 1;
+ if (xfs_rtx_to_rbmblock(mp, n) > i + 1)
+ i = xfs_rtx_to_rbmblock(mp, n) - 1;
}
}
/*
* Got nothing, return failure.
*/
- *rtblock = NULLRTBLOCK;
+ *rtx = NULLRTEXTNO;
return 0;
}
@@ -886,12 +872,14 @@ xfs_alloc_rsum_cache(
xfs_extlen_t rbmblocks) /* number of rt bitmap blocks */
{
/*
- * The rsum cache is initialized to all zeroes, which is trivially a
- * lower bound on the minimum level with any free extents. We can
- * continue without the cache if it couldn't be allocated.
+ * The rsum cache is initialized to the maximum value, which is
+ * trivially an upper bound on the maximum level with any free extents.
+ * We can continue without the cache if it couldn't be allocated.
*/
- mp->m_rsum_cache = kvzalloc(rbmblocks, GFP_KERNEL);
- if (!mp->m_rsum_cache)
+ mp->m_rsum_cache = kvmalloc(rbmblocks, GFP_KERNEL);
+ if (mp->m_rsum_cache)
+ memset(mp->m_rsum_cache, -1, rbmblocks);
+ else
xfs_warn(mp, "could not allocate realtime summary cache");
}
@@ -907,13 +895,13 @@ xfs_growfs_rt(
xfs_mount_t *mp, /* mount point for filesystem */
xfs_growfs_rt_t *in) /* growfs rt input struct */
{
- xfs_rtblock_t bmbno; /* bitmap block number */
+ xfs_fileoff_t bmbno; /* bitmap block number */
struct xfs_buf *bp; /* temporary buffer */
int error; /* error return value */
xfs_mount_t *nmp; /* new (fake) mount structure */
xfs_rfsblock_t nrblocks; /* new number of realtime blocks */
xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */
- xfs_rtblock_t nrextents; /* new number of realtime extents */
+ xfs_rtxnum_t nrextents; /* new number of realtime extents */
uint8_t nrextslog; /* new log2 of sb_rextents */
xfs_extlen_t nrsumblocks; /* new number of summary blocks */
uint nrsumlevels; /* new rt summary levels */
@@ -922,7 +910,6 @@ xfs_growfs_rt(
xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */
xfs_extlen_t rsumblocks; /* current number of rt summary blks */
xfs_sb_t *sbp; /* old superblock */
- xfs_fsblock_t sumbno; /* summary block number */
uint8_t *rsum_cache; /* old summary cache */
sbp = &mp->m_sb;
@@ -954,7 +941,7 @@ xfs_growfs_rt(
return -EINVAL;
/* Unsupported realtime features. */
- if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp))
+ if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp) || xfs_has_quota(mp))
return -EOPNOTSUPP;
nrblocks = in->newblocks;
@@ -976,11 +963,10 @@ xfs_growfs_rt(
*/
nrextents = nrblocks;
do_div(nrextents, in->extsize);
- nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize);
+ nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents);
nrextslog = xfs_highbit32(nrextents);
nrsumlevels = nrextslog + 1;
- nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks;
- nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
+ nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nrbmblocks);
nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
/*
* New summary size can't be more than half the size of
@@ -1023,6 +1009,12 @@ xfs_growfs_rt(
((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
bmbno < nrbmblocks;
bmbno++) {
+ struct xfs_rtalloc_args args = {
+ .mp = mp,
+ };
+ struct xfs_rtalloc_args nargs = {
+ .mp = nmp,
+ };
struct xfs_trans *tp;
xfs_rfsblock_t nrblocks_step;
@@ -1032,19 +1024,17 @@ xfs_growfs_rt(
* Calculate new sb and mount fields for this round.
*/
nsbp->sb_rextsize = in->extsize;
+ nmp->m_rtxblklog = -1; /* don't use shift or masking */
nsbp->sb_rbmblocks = bmbno + 1;
nrblocks_step = (bmbno + 1) * NBBY * nsbp->sb_blocksize *
nsbp->sb_rextsize;
nsbp->sb_rblocks = min(nrblocks, nrblocks_step);
- nsbp->sb_rextents = nsbp->sb_rblocks;
- do_div(nsbp->sb_rextents, nsbp->sb_rextsize);
+ nsbp->sb_rextents = xfs_rtb_to_rtx(nmp, nsbp->sb_rblocks);
ASSERT(nsbp->sb_rextents != 0);
nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents);
nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
- nrsumsize =
- (uint)sizeof(xfs_suminfo_t) * nrsumlevels *
- nsbp->sb_rbmblocks;
- nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
+ nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels,
+ nsbp->sb_rbmblocks);
nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
/*
* Start a transaction, get the log reservation.
@@ -1053,6 +1043,9 @@ xfs_growfs_rt(
&tp);
if (error)
break;
+ args.tp = tp;
+ nargs.tp = tp;
+
/*
* Lock out other callers by grabbing the bitmap inode lock.
*/
@@ -1086,7 +1079,7 @@ xfs_growfs_rt(
*/
if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks ||
mp->m_rsumlevels != nmp->m_rsumlevels) {
- error = xfs_rtcopy_summary(mp, nmp, tp);
+ error = xfs_rtcopy_summary(&args, &nargs);
if (error)
goto error_cancel;
}
@@ -1111,9 +1104,9 @@ xfs_growfs_rt(
/*
* Free new extent.
*/
- bp = NULL;
- error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents,
- nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
+ error = xfs_rtfree_range(&nargs, sbp->sb_rextents,
+ nsbp->sb_rextents - sbp->sb_rextents);
+ xfs_rtbuf_cache_relse(&nargs);
if (error) {
error_cancel:
xfs_trans_cancel(tp);
@@ -1171,59 +1164,60 @@ out_free:
* parameters. The length units are all in realtime extents, as is the
* result block number.
*/
-int /* error */
+int
xfs_rtallocate_extent(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to allocate */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
- int wasdel, /* was a delayed allocation extent */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock) /* out: start block allocated */
+ struct xfs_trans *tp,
+ xfs_rtxnum_t start, /* starting rtext number to allocate */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ int wasdel, /* was a delayed allocation extent */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtblock) /* out: start rtext allocated */
{
- xfs_mount_t *mp = tp->t_mountp;
- int error; /* error value */
- xfs_rtblock_t r; /* result allocated block */
- xfs_fsblock_t sb; /* summary file block number */
- struct xfs_buf *sumbp; /* summary file block buffer */
-
- ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+ struct xfs_rtalloc_args args = {
+ .mp = tp->t_mountp,
+ .tp = tp,
+ };
+ int error; /* error value */
+ xfs_rtxnum_t r; /* result allocated rtext */
+
+ ASSERT(xfs_isilocked(args.mp->m_rbmip, XFS_ILOCK_EXCL));
ASSERT(minlen > 0 && minlen <= maxlen);
/*
* If prod is set then figure out what to do to minlen and maxlen.
*/
if (prod > 1) {
- xfs_extlen_t i;
+ xfs_rtxlen_t i;
if ((i = maxlen % prod))
maxlen -= i;
if ((i = minlen % prod))
minlen += prod - i;
if (maxlen < minlen) {
- *rtblock = NULLRTBLOCK;
+ *rtblock = NULLRTEXTNO;
return 0;
}
}
retry:
- sumbp = NULL;
- if (bno == 0) {
- error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len,
- &sumbp, &sb, prod, &r);
+ if (start == 0) {
+ error = xfs_rtallocate_extent_size(&args, minlen,
+ maxlen, len, prod, &r);
} else {
- error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen,
- len, &sumbp, &sb, prod, &r);
+ error = xfs_rtallocate_extent_near(&args, start, minlen,
+ maxlen, len, prod, &r);
}
+ xfs_rtbuf_cache_relse(&args);
if (error)
return error;
/*
* If it worked, update the superblock.
*/
- if (r != NULLRTBLOCK) {
+ if (r != NULLRTEXTNO) {
long slen = (long)*len;
ASSERT(*len >= minlen && *len <= maxlen);
@@ -1250,6 +1244,7 @@ xfs_rtmount_init(
struct xfs_buf *bp; /* buffer for last block of subvolume */
struct xfs_sb *sbp; /* filesystem superblock copy in mount */
xfs_daddr_t d; /* address of last block of subvolume */
+ unsigned int rsumblocks;
int error;
sbp = &mp->m_sb;
@@ -1261,10 +1256,9 @@ xfs_rtmount_init(
return -ENODEV;
}
mp->m_rsumlevels = sbp->sb_rextslog + 1;
- mp->m_rsumsize =
- (uint)sizeof(xfs_suminfo_t) * mp->m_rsumlevels *
- sbp->sb_rbmblocks;
- mp->m_rsumsize = roundup(mp->m_rsumsize, sbp->sb_blocksize);
+ rsumblocks = xfs_rtsummary_blockcount(mp, mp->m_rsumlevels,
+ mp->m_sb.sb_rbmblocks);
+ mp->m_rsumsize = XFS_FSB_TO_B(mp, rsumblocks);
mp->m_rbmip = mp->m_rsumip = NULL;
/*
* Check that the realtime section is an ok size.
@@ -1418,27 +1412,28 @@ xfs_rtunmount_inodes(
* of rtextents and the fraction.
* The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
*/
-int /* error */
+int /* error */
xfs_rtpick_extent(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_extlen_t len, /* allocation length (rtextents) */
- xfs_rtblock_t *pick) /* result rt extent */
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtxlen_t len, /* allocation length (rtextents) */
+ xfs_rtxnum_t *pick) /* result rt extent */
{
- xfs_rtblock_t b; /* result block */
- int log2; /* log of sequence number */
- uint64_t resid; /* residual after log removed */
- uint64_t seq; /* sequence number of file creation */
- uint64_t *seqp; /* pointer to seqno in inode */
+ xfs_rtxnum_t b; /* result rtext */
+ int log2; /* log of sequence number */
+ uint64_t resid; /* residual after log removed */
+ uint64_t seq; /* sequence number of file creation */
+ struct timespec64 ts; /* timespec in inode */
ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
- seqp = (uint64_t *)&VFS_I(mp->m_rbmip)->i_atime;
+ ts = inode_get_atime(VFS_I(mp->m_rbmip));
if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) {
mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
- *seqp = 0;
+ seq = 0;
+ } else {
+ seq = ts.tv_sec;
}
- seq = *seqp;
if ((log2 = xfs_highbit64(seq)) == -1)
b = 0;
else {
@@ -1450,7 +1445,8 @@ xfs_rtpick_extent(
if (b + len > mp->m_sb.sb_rextents)
b = mp->m_sb.sb_rextents - len;
}
- *seqp = seq + 1;
+ ts.tv_sec = seq + 1;
+ inode_set_atime_to_ts(VFS_I(mp->m_rbmip), ts);
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
*pick = b;
return 0;
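
One subtlety in the xfs_rtallocate_clamp_len() helper added near the top of this file's diff: clamping maxlen to the end of the rt volume with a bare min(), as the old code did, can leave a length that is no longer a multiple of prod and would trip the alignment asserts in the callers, so the helper rounds the clamped value back down. A standalone illustration of the arithmetic with sample numbers:

#include <stdio.h>
#include <stdint.h>

/* Mirror of the helper's arithmetic, outside the kernel. */
static uint32_t clamp_len(uint64_t rextents, uint64_t start,
			  uint32_t len, uint32_t prod)
{
	uint64_t end = start + len;
	uint32_t ret;

	if (end > rextents)
		end = rextents;		/* min(rextents, start + len) */
	ret = (uint32_t)(end - start);
	return ret - (ret % prod);	/* rounddown(ret, prod) */
}

int main(void)
{
	/*
	 * rt volume has 100 rtextents, the request starts at 95 for 10,
	 * prod (alignment) is 4: plain clamping yields 5, which is not a
	 * multiple of 4; rounding down gives 4.
	 */
	printf("%u\n", clamp_len(100, 95, 10, 4));	/* prints 4 */
	return 0;
}
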
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 62c7ad79cbb6..f7cb9ffe51ca 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -11,22 +11,6 @@
struct xfs_mount;
struct xfs_trans;
-/*
- * XXX: Most of the realtime allocation functions deal in units of realtime
- * extents, not realtime blocks. This looks funny when paired with the type
- * name and screams for a larger cleanup.
- */
-struct xfs_rtalloc_rec {
- xfs_rtblock_t ar_startext;
- xfs_rtblock_t ar_extcount;
-};
-
-typedef int (*xfs_rtalloc_query_range_fn)(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- const struct xfs_rtalloc_rec *rec,
- void *priv);
-
#ifdef CONFIG_XFS_RT
/*
* Function prototypes for exported functions.
@@ -40,23 +24,14 @@ typedef int (*xfs_rtalloc_query_range_fn)(
int /* error */
xfs_rtallocate_extent(
struct xfs_trans *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to allocate */
- xfs_extlen_t minlen, /* minimum length to allocate */
- xfs_extlen_t maxlen, /* maximum length to allocate */
- xfs_extlen_t *len, /* out: actual length allocated */
+ xfs_rtxnum_t start, /* starting rtext number to allocate */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
int wasdel, /* was a delayed allocation extent */
- xfs_extlen_t prod, /* extent product factor */
- xfs_rtblock_t *rtblock); /* out: start block allocated */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxnum_t *rtblock); /* out: start rtext allocated */
-/*
- * Free an extent in the realtime subvolume. Length is expressed in
- * realtime extents, as is the block number.
- */
-int /* error */
-xfs_rtfree_extent(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to free */
- xfs_extlen_t len); /* length of extent freed */
/*
* Initialize realtime fields in the mount structure.
@@ -87,8 +62,8 @@ int /* error */
xfs_rtpick_extent(
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
- xfs_extlen_t len, /* allocation length (rtextents) */
- xfs_rtblock_t *pick); /* result rt extent */
+ xfs_rtxlen_t len, /* allocation length (rtextents) */
+ xfs_rtxnum_t *pick); /* result rt extent */
/*
* Grow the realtime area of the filesystem.
@@ -98,55 +73,12 @@ xfs_growfs_rt(
struct xfs_mount *mp, /* file system mount structure */
xfs_growfs_rt_t *in); /* user supplied growfs struct */
-/*
- * From xfs_rtbitmap.c
- */
-int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t block, int issum, struct xfs_buf **bpp);
-int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_extlen_t len, int val,
- xfs_rtblock_t *new, int *stat);
-int xfs_rtfind_back(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_rtblock_t limit,
- xfs_rtblock_t *rtblock);
-int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_rtblock_t limit,
- xfs_rtblock_t *rtblock);
-int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_extlen_t len, int val);
-int xfs_rtmodify_summary_int(struct xfs_mount *mp, struct xfs_trans *tp,
- int log, xfs_rtblock_t bbno, int delta,
- struct xfs_buf **rbpp, xfs_fsblock_t *rsb,
- xfs_suminfo_t *sum);
-int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
- xfs_rtblock_t bbno, int delta, struct xfs_buf **rbpp,
- xfs_fsblock_t *rsb);
-int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_extlen_t len,
- struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
-int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp,
- const struct xfs_rtalloc_rec *low_rec,
- const struct xfs_rtalloc_rec *high_rec,
- xfs_rtalloc_query_range_fn fn, void *priv);
-int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtalloc_query_range_fn fn,
- void *priv);
-bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
-int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtblock_t start, xfs_extlen_t len,
- bool *is_free);
int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp);
#else
-# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS)
-# define xfs_rtfree_extent(t,b,l) (ENOSYS)
-# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
-# define xfs_growfs_rt(mp,in) (ENOSYS)
-# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS)
-# define xfs_rtalloc_query_all(m,t,f,p) (ENOSYS)
-# define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS)
-# define xfs_verify_rtbno(m, r) (false)
-# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS)
-# define xfs_rtalloc_reinit_frextents(m) (0)
+# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (-ENOSYS)
+# define xfs_rtpick_extent(m,t,l,rb) (-ENOSYS)
+# define xfs_growfs_rt(mp,in) (-ENOSYS)
+# define xfs_rtalloc_reinit_frextents(m) (0)
static inline int /* error */
xfs_rtmount_init(
xfs_mount_t *mp) /* file system mount structure */
@@ -157,7 +89,7 @@ xfs_rtmount_init(
xfs_warn(mp, "Not built with CONFIG_XFS_RT");
return -ENOSYS;
}
-# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
+# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (-ENOSYS))
# define xfs_rtunmount_inodes(m)
#endif /* CONFIG_XFS_RT */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 819a3568b28f..764304595e8b 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -42,6 +42,7 @@
#include "xfs_xattr.h"
#include "xfs_iunlink_item.h"
#include "xfs_dahash_test.h"
+#include "xfs_rtbitmap.h"
#include "scrub/stats.h"
#include <linux/magic.h>
@@ -361,14 +362,15 @@ STATIC int
xfs_blkdev_get(
xfs_mount_t *mp,
const char *name,
- struct block_device **bdevp)
+ struct bdev_handle **handlep)
{
int error = 0;
- *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
- mp->m_super, &fs_holder_ops);
- if (IS_ERR(*bdevp)) {
- error = PTR_ERR(*bdevp);
+ *handlep = bdev_open_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ mp->m_super, &fs_holder_ops);
+ if (IS_ERR(*handlep)) {
+ error = PTR_ERR(*handlep);
+ *handlep = NULL;
xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
}
@@ -433,7 +435,7 @@ xfs_open_devices(
{
struct super_block *sb = mp->m_super;
struct block_device *ddev = sb->s_bdev;
- struct block_device *logdev = NULL, *rtdev = NULL;
+ struct bdev_handle *logdev_handle = NULL, *rtdev_handle = NULL;
int error;
/*
@@ -446,17 +448,19 @@ xfs_open_devices(
* Open real time and log devices - order is important.
*/
if (mp->m_logname) {
- error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
+ error = xfs_blkdev_get(mp, mp->m_logname, &logdev_handle);
if (error)
goto out_relock;
}
if (mp->m_rtname) {
- error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev);
+ error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_handle);
if (error)
goto out_close_logdev;
- if (rtdev == ddev || rtdev == logdev) {
+ if (rtdev_handle->bdev == ddev ||
+ (logdev_handle &&
+ rtdev_handle->bdev == logdev_handle->bdev)) {
xfs_warn(mp,
"Cannot mount filesystem with identical rtdev and ddev/logdev.");
error = -EINVAL;
@@ -468,22 +472,25 @@ xfs_open_devices(
* Setup xfs_mount buffer target pointers
*/
error = -ENOMEM;
- mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
+ mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_handle);
if (!mp->m_ddev_targp)
goto out_close_rtdev;
- if (rtdev) {
- mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev);
+ if (rtdev_handle) {
+ mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_handle);
if (!mp->m_rtdev_targp)
goto out_free_ddev_targ;
}
- if (logdev && logdev != ddev) {
- mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev);
+ if (logdev_handle && logdev_handle->bdev != ddev) {
+ mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_handle);
if (!mp->m_logdev_targp)
goto out_free_rtdev_targ;
} else {
mp->m_logdev_targp = mp->m_ddev_targp;
+ /* Handle won't be used, drop it */
+ if (logdev_handle)
+ bdev_release(logdev_handle);
}
error = 0;
@@ -497,11 +504,11 @@ out_relock:
out_free_ddev_targ:
xfs_free_buftarg(mp->m_ddev_targp);
out_close_rtdev:
- if (rtdev)
- blkdev_put(rtdev, sb);
+ if (rtdev_handle)
+ bdev_release(rtdev_handle);
out_close_logdev:
- if (logdev && logdev != ddev)
- blkdev_put(logdev, sb);
+ if (logdev_handle)
+ bdev_release(logdev_handle);
goto out_relock;
}
@@ -890,7 +897,7 @@ xfs_fs_statfs(
statp->f_blocks = sbp->sb_rblocks;
freertx = percpu_counter_sum_positive(&mp->m_frextents);
- statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize;
+ statp->f_bavail = statp->f_bfree = xfs_rtx_to_rtb(mp, freertx);
}
return 0;
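
The xfs_super.c hunks above move the extra log/rt devices from bare block_device pointers to the handle-based open API (bdev_open_by_path()/bdev_release()), with the block_device reached through handle->bdev. A short kernel-style sketch of that open/close pattern; the function names here are hypothetical:

/* Hypothetical open of an extra device with the handle-based API. */
static struct bdev_handle *open_extra_dev(struct super_block *sb,
					  const char *path)
{
	struct bdev_handle *handle;

	handle = bdev_open_by_path(path, BLK_OPEN_READ | BLK_OPEN_WRITE,
				   sb, &fs_holder_ops);
	if (IS_ERR(handle))
		return handle;		/* caller checks with IS_ERR() */

	/* The block_device itself is reached through the handle. */
	pr_info("opened %pg\n", handle->bdev);
	return handle;
}

static void close_extra_dev(struct bdev_handle *handle)
{
	if (handle)
		bdev_release(handle);	/* replaces blkdev_put(bdev, holder) */
}
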
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 8c0bfc9a33b1..305c9d07bf1b 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -24,6 +24,7 @@
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_icache.h"
+#include "xfs_rtbitmap.h"
struct kmem_cache *xfs_trans_cache;
@@ -655,6 +656,10 @@ xfs_trans_unreserve_and_mod_sb(
mp->m_sb.sb_agcount += tp->t_agcount_delta;
mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta;
mp->m_sb.sb_rextsize += tp->t_rextsize_delta;
+ if (tp->t_rextsize_delta) {
+ mp->m_rtxblklog = log2_if_power2(mp->m_sb.sb_rextsize);
+ mp->m_rtxblkmask = mask64_if_power2(mp->m_sb.sb_rextsize);
+ }
mp->m_sb.sb_rbmblocks += tp->t_rbmblocks_delta;
mp->m_sb.sb_rblocks += tp->t_rblocks_delta;
mp->m_sb.sb_rextents += tp->t_rextents_delta;
@@ -1196,7 +1201,7 @@ xfs_trans_alloc_inode(
retry:
error = xfs_trans_alloc(mp, resv, dblocks,
- rblocks / mp->m_sb.sb_rextsize,
+ xfs_extlen_to_rtxlen(mp, rblocks),
force ? XFS_TRANS_RESERVE : 0, &tp);
if (error)
return error;
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index a3975f325f4e..987843f84d03 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -186,7 +186,7 @@ static const struct xattr_handler xfs_xattr_security_handler = {
.set = xfs_xattr_set,
};
-const struct xattr_handler *xfs_xattr_handlers[] = {
+const struct xattr_handler * const xfs_xattr_handlers[] = {
&xfs_xattr_user_handler,
&xfs_xattr_trusted_handler,
&xfs_xattr_security_handler,
diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h
index 2b09133b1b9b..cec766cad26c 100644
--- a/fs/xfs/xfs_xattr.h
+++ b/fs/xfs/xfs_xattr.h
@@ -8,6 +8,6 @@
int xfs_attr_change(struct xfs_da_args *args);
-extern const struct xattr_handler *xfs_xattr_handlers[];
+extern const struct xattr_handler * const xfs_xattr_handlers[];
#endif /* __XFS_XATTR_H__ */
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 9d1a9808fbbb..e6a75401677d 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -658,8 +658,8 @@ static struct inode *zonefs_get_file_inode(struct inode *dir,
inode->i_ino = ino;
inode->i_mode = z->z_mode;
- inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode,
- inode_get_ctime(dir));
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, inode_get_ctime(dir))));
inode->i_uid = z->z_uid;
inode->i_gid = z->z_gid;
inode->i_size = z->z_wpoffset;
@@ -695,8 +695,8 @@ static struct inode *zonefs_get_zgroup_inode(struct super_block *sb,
inode->i_ino = ino;
inode_init_owner(&nop_mnt_idmap, inode, root, S_IFDIR | 0555);
inode->i_size = sbi->s_zgroup[ztype].g_nr_zones;
- inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode,
- inode_get_ctime(root));
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, inode_get_ctime(root))));
inode->i_private = &sbi->s_zgroup[ztype];
set_nlink(inode, 2);
@@ -1319,7 +1319,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
inode->i_ino = bdev_nr_zones(sb->s_bdev);
inode->i_mode = S_IFDIR | 0555;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_op = &zonefs_dir_inode_operations;
inode->i_fop = &zonefs_dir_operations;
inode->i_size = 2;