aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/backref.c1
-rw-r--r--fs/btrfs/ctree.c8
-rw-r--r--fs/btrfs/ctree.h3
-rw-r--r--fs/btrfs/disk-io.c15
-rw-r--r--fs/btrfs/export.c8
-rw-r--r--fs/btrfs/export.h5
-rw-r--r--fs/btrfs/extent-tree.c40
-rw-r--r--fs/btrfs/extent_io.c53
-rw-r--r--fs/btrfs/extent_io.h6
-rw-r--r--fs/btrfs/file-item.c6
-rw-r--r--fs/btrfs/free-space-cache.c6
-rw-r--r--fs/btrfs/inode.c119
-rw-r--r--fs/btrfs/ioctl.c44
-rw-r--r--fs/btrfs/print-tree.c12
-rw-r--r--fs/btrfs/send.c67
-rw-r--r--fs/btrfs/super.c18
-rw-r--r--fs/btrfs/sysfs.c4
-rw-r--r--fs/btrfs/tree-log.c18
-rw-r--r--fs/btrfs/volumes.c21
19 files changed, 303 insertions, 151 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index e4d5e6eae409..1cf75d1032e1 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1420,6 +1420,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
if (ret < 0 && ret != -ENOENT) {
ulist_free(tmp);
ulist_free(*roots);
+ *roots = NULL;
return ret;
}
node = ulist_next(tmp, &uiter);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f5a8c0d26cf3..cf1e8ba50f6b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1367,7 +1367,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
- extent_buffer_get(eb_rewin);
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
+ eb_rewin, btrfs_header_level(eb_rewin));
btrfs_tree_read_lock(eb_rewin);
__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
WARN_ON(btrfs_header_nritems(eb_rewin) >
@@ -1438,8 +1439,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
if (!eb)
return NULL;
- extent_buffer_get(eb);
- btrfs_tree_read_lock(eb);
if (old_root) {
btrfs_set_header_bytenr(eb, eb->start);
btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
@@ -1447,6 +1446,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
btrfs_set_header_level(eb, old_root->level);
btrfs_set_header_generation(eb, old_generation);
}
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
+ btrfs_header_level(eb));
+ btrfs_tree_read_lock(eb);
if (tm)
__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
else
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5412b12491cb..19a668e9164b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1257,6 +1257,7 @@ struct btrfs_root {
int send_in_progress;
struct btrfs_subvolume_writers *subv_writers;
atomic_t will_be_snapshotted;
+ atomic_t snapshot_force_cow;
/* For qgroup metadata space reserve */
atomic64_t qgroup_meta_rsv;
@@ -3262,6 +3263,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
unsigned long new_flags);
int btrfs_sync_fs(struct super_block *sb, int wait);
+char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
+ u64 subvol_objectid);
static inline __printf(2, 3)
void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 096c015b22a4..ace58d6a270b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1200,6 +1200,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
refcount_set(&root->refs, 1);
atomic_set(&root->will_be_snapshotted, 0);
atomic64_set(&root->qgroup_meta_rsv, 0);
+ atomic_set(&root->snapshot_force_cow, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
@@ -1509,9 +1510,16 @@ int btrfs_init_fs_root(struct btrfs_root *root)
spin_lock_init(&root->ino_cache_lock);
init_waitqueue_head(&root->ino_cache_wait);
- ret = get_anon_bdev(&root->anon_dev);
- if (ret)
- goto fail;
+ /*
+ * Don't assign anonymous block device to roots that are not exposed to
+ * userspace, the id pool is limited to 1M
+ */
+ if (is_fstree(root->root_key.objectid) &&
+ btrfs_root_refs(&root->root_item) > 0) {
+ ret = get_anon_bdev(&root->anon_dev);
+ if (ret)
+ goto fail;
+ }
mutex_lock(&root->objectid_mutex);
ret = btrfs_find_highest_objectid(root,
@@ -4337,6 +4345,7 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache)
cache->io_ctl.inode = NULL;
iput(inode);
}
+ ASSERT(cache->io_ctl.pages == NULL);
btrfs_put_block_group(cache);
}
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 3aeb5770f896..b6ce765aa7f3 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -56,9 +56,9 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
return type;
}
-static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation,
- int check_generation)
+struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
+ u64 root_objectid, u32 generation,
+ int check_generation)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root;
@@ -151,7 +151,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
}
-static struct dentry *btrfs_get_parent(struct dentry *child)
+struct dentry *btrfs_get_parent(struct dentry *child)
{
struct inode *dir = d_inode(child);
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
index 91b3908e7c54..15db02462141 100644
--- a/fs/btrfs/export.h
+++ b/fs/btrfs/export.h
@@ -17,4 +17,9 @@ struct btrfs_fid {
u64 parent_root_objectid;
} __attribute__ ((packed));
+struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
+ u64 root_objectid, u32 generation,
+ int check_generation);
+struct dentry *btrfs_get_parent(struct dentry *child);
+
#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 51e26f90f0bb..00481cfe6cfc 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1170,12 +1170,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
ASSERT(eb->fs_info);
/*
- * Every shared one has parent tree
- * block, which must be aligned to
- * nodesize.
+ * Every shared one has parent tree block,
+ * which must be aligned to sector size.
*/
if (offset &&
- IS_ALIGNED(offset, eb->fs_info->nodesize))
+ IS_ALIGNED(offset, eb->fs_info->sectorsize))
return type;
}
} else if (is_data == BTRFS_REF_TYPE_DATA) {
@@ -1184,12 +1183,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
if (type == BTRFS_SHARED_DATA_REF_KEY) {
ASSERT(eb->fs_info);
/*
- * Every shared one has parent tree
- * block, which must be aligned to
- * nodesize.
+ * Every shared one has parent tree block,
+ * which must be aligned to sector size.
*/
if (offset &&
- IS_ALIGNED(offset, eb->fs_info->nodesize))
+ IS_ALIGNED(offset, eb->fs_info->sectorsize))
return type;
}
} else {
@@ -1199,8 +1197,9 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
}
btrfs_print_leaf((struct extent_buffer *)eb);
- btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
- eb->start, type);
+ btrfs_err(eb->fs_info,
+ "eb %llu iref 0x%lx invalid extent inline ref type %d",
+ eb->start, (unsigned long)iref, type);
WARN_ON(1);
return BTRFS_REF_TYPE_INVALID;
@@ -9365,8 +9364,6 @@ out:
*/
if (!for_reloc && root_dropped == false)
btrfs_add_dead_root(root);
- if (err && err != -EAGAIN)
- btrfs_handle_fs_error(fs_info, err, NULL);
return err;
}
@@ -10554,7 +10551,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
- goto out_put_group;
+ goto out;
}
/*
@@ -10591,7 +10588,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
if (ret) {
btrfs_add_delayed_iput(inode);
- goto out_put_group;
+ goto out;
}
clear_nlink(inode);
/* One for the block groups ref */
@@ -10614,13 +10611,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
if (ret < 0)
- goto out_put_group;
+ goto out;
if (ret > 0)
btrfs_release_path(path);
if (ret == 0) {
ret = btrfs_del_item(trans, tree_root, path);
if (ret)
- goto out_put_group;
+ goto out;
btrfs_release_path(path);
}
@@ -10629,6 +10626,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
&fs_info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
+ /* Once for the block groups rbtree */
+ btrfs_put_block_group(block_group);
+
if (fs_info->first_logical_byte == block_group->key.objectid)
fs_info->first_logical_byte = (u64)-1;
spin_unlock(&fs_info->block_group_cache_lock);
@@ -10778,10 +10778,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
ret = remove_block_group_free_space(trans, fs_info, block_group);
if (ret)
- goto out_put_group;
-
- /* Once for the block groups rbtree */
- btrfs_put_block_group(block_group);
+ goto out;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0)
@@ -10791,10 +10788,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
ret = btrfs_del_item(trans, root, path);
-out_put_group:
+out:
/* Once for the lookup reference */
btrfs_put_block_group(block_group);
-out:
btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a8be9478ca3e..0ba338cffa93 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1721,7 +1721,8 @@ static int __process_pages_contig(struct address_space *mapping,
if (!PageDirty(pages[i]) ||
pages[i]->mapping != mapping) {
unlock_page(pages[i]);
- put_page(pages[i]);
+ for (; i < ret; i++)
+ put_page(pages[i]);
err = -EAGAIN;
goto out;
}
@@ -4322,6 +4323,8 @@ int try_release_extent_mapping(struct extent_map_tree *map,
/* once for us */
free_extent_map(em);
+
+ cond_resched(); /* Allow large-extent preemption. */
}
}
return try_release_extent_state(map, tree, page, mask);
@@ -4862,25 +4865,28 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
int refs;
- /* the ref bit is tricky. We have to make sure it is set
- * if we have the buffer dirty. Otherwise the
- * code to free a buffer can end up dropping a dirty
- * page
+ /*
+ * The TREE_REF bit is first set when the extent_buffer is added
+ * to the radix tree. It is also reset, if unset, when a new reference
+ * is created by find_extent_buffer.
*
- * Once the ref bit is set, it won't go away while the
- * buffer is dirty or in writeback, and it also won't
- * go away while we have the reference count on the
- * eb bumped.
+ * It is only cleared in two cases: freeing the last non-tree
+ * reference to the extent_buffer when its STALE bit is set or
+ * calling releasepage when the tree reference is the only reference.
*
- * We can't just set the ref bit without bumping the
- * ref on the eb because free_extent_buffer might
- * see the ref bit and try to clear it. If this happens
- * free_extent_buffer might end up dropping our original
- * ref by mistake and freeing the page before we are able
- * to add one more ref.
+ * In both cases, care is taken to ensure that the extent_buffer's
+ * pages are not under io. However, releasepage can be concurrently
+ * called with creating new references, which is prone to race
+ * conditions between the calls to check_buffer_tree_ref in those
+ * codepaths and clearing TREE_REF in try_release_extent_buffer.
*
- * So bump the ref count first, then set the bit. If someone
- * beat us to it, drop the ref we added.
+ * The actual lifetime of the extent_buffer in the radix tree is
+ * adequately protected by the refcount, but the TREE_REF bit and
+ * its corresponding reference are not. To protect against this
+ * class of races, we call check_buffer_tree_ref from the codepaths
+ * which trigger io after they set eb->io_pages. Note that once io is
+ * initiated, TREE_REF can no longer be cleared, so that is the
+ * moment at which any such race is best fixed.
*/
refs = atomic_read(&eb->refs);
if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
@@ -5344,6 +5350,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = 0;
atomic_set(&eb->io_pages, num_reads);
+ /*
+ * It is possible for releasepage to clear the TREE_REF bit before we
+ * set io_pages. See check_buffer_tree_ref for a more detailed comment.
+ */
+ check_buffer_tree_ref(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
@@ -5437,9 +5448,9 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
}
}
-int read_extent_buffer_to_user(const struct extent_buffer *eb,
- void __user *dstv,
- unsigned long start, unsigned long len)
+int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
+ void __user *dstv,
+ unsigned long start, unsigned long len)
{
size_t cur;
size_t offset;
@@ -5460,7 +5471,7 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb,
cur = min(len, (PAGE_SIZE - offset));
kaddr = page_address(page);
- if (copy_to_user(dst, kaddr + offset, cur)) {
+ if (probe_user_write(dst, kaddr + offset, cur)) {
ret = -EFAULT;
break;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index e5535bbe6953..90c5095ae97e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -455,9 +455,9 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
void read_extent_buffer(const struct extent_buffer *eb, void *dst,
unsigned long start,
unsigned long len);
-int read_extent_buffer_to_user(const struct extent_buffer *eb,
- void __user *dst, unsigned long start,
- unsigned long len);
+int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
+ void __user *dst, unsigned long start,
+ unsigned long len);
void write_extent_buffer_fsid(struct extent_buffer *eb, const void *src);
void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
const void *src);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 717d82d51bb1..edd5f152e448 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -795,10 +795,12 @@ again:
nritems = btrfs_header_nritems(path->nodes[0]);
if (!nritems || (path->slots[0] >= nritems - 1)) {
ret = btrfs_next_leaf(root, path);
- if (ret == 1)
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
found_next = 1;
- if (ret != 0)
goto insert;
+ }
slot = path->slots[0];
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index abeb26d48d0a..9bf72a9088ac 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1169,7 +1169,6 @@ static int __btrfs_wait_cache_io(struct btrfs_root *root,
ret = update_cache_item(trans, root, inode, path, offset,
io_ctl->entries, io_ctl->bitmaps);
out:
- io_ctl_free(io_ctl);
if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
@@ -1334,6 +1333,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* them out later
*/
io_ctl_drop_pages(io_ctl);
+ io_ctl_free(io_ctl);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
i_size_read(inode) - 1, &cached_state, GFP_NOFS);
@@ -2169,7 +2169,7 @@ out:
static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, bool update_stat)
{
- struct btrfs_free_space *left_info;
+ struct btrfs_free_space *left_info = NULL;
struct btrfs_free_space *right_info;
bool merged = false;
u64 offset = info->offset;
@@ -2184,7 +2184,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
if (right_info && rb_prev(&right_info->offset_index))
left_info = rb_entry(rb_prev(&right_info->offset_index),
struct btrfs_free_space, offset_index);
- else
+ else if (!right_info)
left_info = tree_search_offset(ctl, offset - 1, 0, 0);
if (right_info && !right_info->bitmap) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2a196bb134d9..e985e820724e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -629,7 +629,21 @@ cont:
btrfs_free_reserved_data_space_noquota(inode,
start,
end - start + 1);
- goto free_pages_out;
+
+ /*
+ * Ensure we only free the compressed pages if we have
+ * them allocated, as we can still reach here with
+ * inode_need_compress() == false.
+ */
+ if (pages) {
+ for (i = 0; i < nr_pages; i++) {
+ WARN_ON(pages[i]->mapping);
+ put_page(pages[i]);
+ }
+ kfree(pages);
+ }
+
+ return;
}
}
@@ -708,13 +722,6 @@ cleanup_and_bail_uncompressed:
*num_added += 1;
return;
-
-free_pages_out:
- for (i = 0; i < nr_pages; i++) {
- WARN_ON(pages[i]->mapping);
- put_page(pages[i]);
- }
- kfree(pages);
}
static void free_async_extent_pages(struct async_extent *async_extent)
@@ -978,8 +985,8 @@ static noinline int cow_file_range(struct inode *inode,
u64 alloc_hint = 0;
u64 num_bytes;
unsigned long ram_size;
- u64 disk_num_bytes;
u64 cur_alloc_size = 0;
+ u64 min_alloc_size;
u64 blocksize = fs_info->sectorsize;
struct btrfs_key ins;
struct extent_map *em;
@@ -996,7 +1003,6 @@ static noinline int cow_file_range(struct inode *inode,
num_bytes = ALIGN(end - start + 1, blocksize);
num_bytes = max(blocksize, num_bytes);
- disk_num_bytes = num_bytes;
inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);
@@ -1023,17 +1029,32 @@ static noinline int cow_file_range(struct inode *inode,
}
}
- BUG_ON(disk_num_bytes >
- btrfs_super_total_bytes(fs_info->super_copy));
+ BUG_ON(num_bytes > btrfs_super_total_bytes(fs_info->super_copy));
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
btrfs_drop_extent_cache(BTRFS_I(inode), start,
start + num_bytes - 1, 0);
- while (disk_num_bytes > 0) {
- cur_alloc_size = disk_num_bytes;
+ /*
+ * Relocation relies on the relocated extents to have exactly the same
+ * size as the original extents. Normally writeback for relocation data
+ * extents follows a NOCOW path because relocation preallocates the
+ * extents. However, due to an operation such as scrub turning a block
+ * group to RO mode, it may fallback to COW mode, so we must make sure
+ * an extent allocated during COW has exactly the requested size and can
+ * not be split into smaller extents, otherwise relocation breaks and
+ * fails during the stage where it updates the bytenr of file extent
+ * items.
+ */
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ min_alloc_size = num_bytes;
+ else
+ min_alloc_size = fs_info->sectorsize;
+
+ while (num_bytes > 0) {
+ cur_alloc_size = num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
- fs_info->sectorsize, 0, alloc_hint,
+ min_alloc_size, 0, alloc_hint,
&ins, 1, 1);
if (ret < 0)
goto out_unlock;
@@ -1097,11 +1118,10 @@ static noinline int cow_file_range(struct inode *inode,
delalloc_end, locked_page,
EXTENT_LOCKED | EXTENT_DELALLOC,
page_ops);
- if (disk_num_bytes < cur_alloc_size)
- disk_num_bytes = 0;
+ if (num_bytes < cur_alloc_size)
+ num_bytes = 0;
else
- disk_num_bytes -= cur_alloc_size;
- num_bytes -= cur_alloc_size;
+ num_bytes -= cur_alloc_size;
alloc_hint = ins.objectid + ins.offset;
start += cur_alloc_size;
extent_reserved = false;
@@ -1139,8 +1159,8 @@ out_unlock:
*/
if (extent_reserved) {
extent_clear_unlock_delalloc(inode, start,
- start + cur_alloc_size,
- start + cur_alloc_size,
+ start + cur_alloc_size - 1,
+ start + cur_alloc_size - 1,
locked_page,
clear_bits,
page_ops);
@@ -1315,7 +1335,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
u64 disk_num_bytes;
u64 ram_bytes;
int extent_type;
- int ret, err;
+ int ret;
int type;
int nocow;
int check_prev = 1;
@@ -1440,11 +1460,8 @@ next_slot:
* if there are pending snapshots for this root,
* we fall into common COW way.
*/
- if (!nolock) {
- err = btrfs_start_write_no_snapshotting(root);
- if (!err)
- goto out_check;
- }
+ if (!nolock && atomic_read(&root->snapshot_force_cow))
+ goto out_check;
/*
* force cow if csum exists in the range.
* this ensure that csum for a given extent are
@@ -1453,9 +1470,6 @@ next_slot:
ret = csum_exist_in_range(fs_info, disk_bytenr,
num_bytes);
if (ret) {
- if (!nolock)
- btrfs_end_write_no_snapshotting(root);
-
/*
* ret could be -EIO if the above fails to read
* metadata.
@@ -1468,11 +1482,8 @@ next_slot:
WARN_ON_ONCE(nolock);
goto out_check;
}
- if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
- if (!nolock)
- btrfs_end_write_no_snapshotting(root);
+ if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
goto out_check;
- }
nocow = 1;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
extent_end = found_key.offset +
@@ -1485,8 +1496,6 @@ next_slot:
out_check:
if (extent_end <= start) {
path->slots[0]++;
- if (!nolock && nocow)
- btrfs_end_write_no_snapshotting(root);
if (nocow)
btrfs_dec_nocow_writers(fs_info, disk_bytenr);
goto next_slot;
@@ -1508,8 +1517,6 @@ out_check:
end, page_started, nr_written, 1,
NULL);
if (ret) {
- if (!nolock && nocow)
- btrfs_end_write_no_snapshotting(root);
if (nocow)
btrfs_dec_nocow_writers(fs_info,
disk_bytenr);
@@ -1529,8 +1536,6 @@ out_check:
ram_bytes, BTRFS_COMPRESS_NONE,
BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
- if (!nolock && nocow)
- btrfs_end_write_no_snapshotting(root);
if (nocow)
btrfs_dec_nocow_writers(fs_info,
disk_bytenr);
@@ -1569,8 +1574,6 @@ out_check:
EXTENT_CLEAR_DATA_RESV,
PAGE_UNLOCK | PAGE_SET_PRIVATE2);
- if (!nolock && nocow)
- btrfs_end_write_no_snapshotting(root);
cur_offset = extent_end;
/*
@@ -8707,7 +8710,6 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
/* bio split */
ASSERT(map_length <= INT_MAX);
- atomic_inc(&dip->pending_bios);
do {
clone_len = min_t(int, submit_len, map_length);
@@ -8758,7 +8760,8 @@ submit:
if (!status)
return 0;
- bio_put(bio);
+ if (bio != orig_bio)
+ bio_put(bio);
out_err:
dip->errors = 1;
/*
@@ -8798,7 +8801,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
bio->bi_private = dip;
dip->orig_bio = bio;
dip->dio_bio = dio_bio;
- atomic_set(&dip->pending_bios, 0);
+ atomic_set(&dip->pending_bios, 1);
io_bio = btrfs_io_bio(bio);
io_bio->logical = file_offset;
@@ -8947,9 +8950,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
dio_data.overwrite = 1;
inode_unlock(inode);
relock = true;
- } else if (iocb->ki_flags & IOCB_NOWAIT) {
- ret = -EAGAIN;
- goto out;
}
ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
offset, count);
@@ -9187,20 +9187,17 @@ again:
/*
* Qgroup reserved space handler
* Page here will be either
- * 1) Already written to disk
- * In this case, its reserved space is released from data rsv map
- * and will be freed by delayed_ref handler finally.
- * So even we call qgroup_free_data(), it won't decrease reserved
- * space.
- * 2) Not written to disk
- * This means the reserved space should be freed here. However,
- * if a truncate invalidates the page (by clearing PageDirty)
- * and the page is accounted for while allocating extent
- * in btrfs_check_data_free_space() we let delayed_ref to
- * free the entire extent.
+ * 1) Already written to disk or ordered extent already submitted
+ * Then its QGROUP_RESERVED bit in io_tree is already cleaned.
+ * Qgroup will be handled by its qgroup_record then.
+ * btrfs_qgroup_free_data() call will do nothing here.
+ *
+ * 2) Not written to disk yet
+ * Then btrfs_qgroup_free_data() call will clear the QGROUP_RESERVED
+ * bit of its io_tree, and free the qgroup reserved data space.
+ * Since the IO will never happen for this page.
*/
- if (PageDirty(page))
- btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
+ btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY |
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e82b4f3f490c..56123ce3b9f0 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -655,6 +655,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct btrfs_pending_snapshot *pending_snapshot;
struct btrfs_trans_handle *trans;
int ret;
+ bool snapshot_force_cow = false;
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return -EINVAL;
@@ -671,6 +672,11 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
goto free_pending;
}
+ /*
+ * Force new buffered writes to reserve space even when NOCOW is
+ * possible. This is to avoid later writeback (running dealloc) to
+ * fallback to COW mode and unexpectedly fail with ENOSPC.
+ */
atomic_inc(&root->will_be_snapshotted);
smp_mb__after_atomic();
btrfs_wait_for_no_snapshotting_writes(root);
@@ -679,6 +685,14 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (ret)
goto dec_and_free;
+ /*
+ * All previous writes have started writeback in NOCOW mode, so now
+ * we force future writes to fallback to COW mode during snapshot
+ * creation.
+ */
+ atomic_inc(&root->snapshot_force_cow);
+ snapshot_force_cow = true;
+
btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
@@ -744,6 +758,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
fail:
btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
dec_and_free:
+ if (snapshot_force_cow)
+ atomic_dec(&root->snapshot_force_cow);
if (atomic_dec_and_test(&root->will_be_snapshotted))
wake_up_atomic_t(&root->will_be_snapshotted);
free_pending:
@@ -2021,9 +2037,14 @@ static noinline int copy_to_sk(struct btrfs_path *path,
sh.len = item_len;
sh.transid = found_transid;
- /* copy search result header */
- if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
- ret = -EFAULT;
+ /*
+ * Copy search result header. If we fault then loop again so we
+ * can fault in the pages and -EFAULT there if there's a
+ * problem. Otherwise we'll fault and then copy the buffer in
+ * properly this next time through
+ */
+ if (probe_user_write(ubuf + *sk_offset, &sh, sizeof(sh))) {
+ ret = 0;
goto out;
}
@@ -2031,10 +2052,14 @@ static noinline int copy_to_sk(struct btrfs_path *path,
if (item_len) {
char __user *up = ubuf + *sk_offset;
- /* copy the item */
- if (read_extent_buffer_to_user(leaf, up,
- item_off, item_len)) {
- ret = -EFAULT;
+ /*
+ * Copy the item, same behavior as above, but reset the
+ * * sk_offset so we copy the full thing again.
+ */
+ if (read_extent_buffer_to_user_nofault(leaf, up,
+ item_off, item_len)) {
+ ret = 0;
+ *sk_offset -= sizeof(sh);
goto out;
}
@@ -2122,6 +2147,11 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = sk->min_offset;
while (1) {
+ ret = fault_in_pages_writeable(ubuf + sk_offset,
+ *buf_size - sk_offset);
+ if (ret)
+ break;
+
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
if (ret != 0) {
if (ret > 0)
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 47336d4b19d8..f96eb176166d 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -116,9 +116,10 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
* offset is supposed to be a tree block which
* must be aligned to nodesize.
*/
- if (!IS_ALIGNED(offset, eb->fs_info->nodesize))
- pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n",
- offset, (unsigned long long)eb->fs_info->nodesize);
+ if (!IS_ALIGNED(offset, eb->fs_info->sectorsize))
+ pr_info(
+ "\t\t\t(parent %llu not aligned to sectorsize %u)\n",
+ offset, eb->fs_info->sectorsize);
break;
case BTRFS_EXTENT_DATA_REF_KEY:
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
@@ -133,8 +134,9 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
* must be aligned to nodesize.
*/
if (!IS_ALIGNED(offset, eb->fs_info->nodesize))
- pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n",
- offset, (unsigned long long)eb->fs_info->nodesize);
+ pr_info(
+ "\t\t\t(parent %llu not aligned to sectorsize %u)\n",
+ offset, eb->fs_info->sectorsize);
break;
default:
pr_cont("(extent %llu has INVALID ref type %d)\n",
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ca15d65a2070..654ab6e57ec3 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -35,6 +35,7 @@
#include "btrfs_inode.h"
#include "transaction.h"
#include "compression.h"
+#include "xattr.h"
/*
* Maximum number of references an extent can have in order for us to attempt to
@@ -4554,6 +4555,10 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
struct fs_path *p;
struct posix_acl_xattr_header dummy_acl;
+ /* Capabilities are emitted by finish_inode_if_needed */
+ if (!strncmp(name, XATTR_NAME_CAPS, name_len))
+ return 0;
+
p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -5096,6 +5101,64 @@ static int send_extent_data(struct send_ctx *sctx,
return 0;
}
+/*
+ * Search for a capability xattr related to sctx->cur_ino. If the capability is
+ * found, call send_set_xattr function to emit it.
+ *
+ * Return 0 if there isn't a capability, or when the capability was emitted
+ * successfully, or < 0 if an error occurred.
+ */
+static int send_capabilities(struct send_ctx *sctx)
+{
+ struct fs_path *fspath = NULL;
+ struct btrfs_path *path;
+ struct btrfs_dir_item *di;
+ struct extent_buffer *leaf;
+ unsigned long data_ptr;
+ char *buf = NULL;
+ int buf_len;
+ int ret = 0;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino,
+ XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0);
+ if (!di) {
+ /* There is no xattr for this inode */
+ goto out;
+ } else if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ buf_len = btrfs_dir_data_len(leaf, di);
+
+ fspath = fs_path_alloc();
+ buf = kmalloc(buf_len, GFP_KERNEL);
+ if (!fspath || !buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
+ if (ret < 0)
+ goto out;
+
+ data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di);
+ read_extent_buffer(leaf, buf, data_ptr, buf_len);
+
+ ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS,
+ strlen(XATTR_NAME_CAPS), buf, buf_len);
+out:
+ kfree(buf);
+ fs_path_free(fspath);
+ btrfs_free_path(path);
+ return ret;
+}
+
static int clone_range(struct send_ctx *sctx,
struct clone_root *clone_root,
const u64 disk_byte,
@@ -5907,6 +5970,10 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
goto out;
}
+ ret = send_capabilities(sctx);
+ if (ret < 0)
+ goto out;
+
/*
* If other directory inodes depended on our current directory
* inode's move/rename, now do their move/rename operations.
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 17a8463ef35c..eb64d4b159e0 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -939,8 +939,8 @@ out:
return error;
}
-static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
- u64 subvol_objectid)
+char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
+ u64 subvol_objectid)
{
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_root *fs_root;
@@ -1221,6 +1221,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
char *compress_type;
+ const char *subvol_name;
if (btrfs_test_opt(info, DEGRADED))
seq_puts(seq, ",degraded");
@@ -1307,8 +1308,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
#endif
seq_printf(seq, ",subvolid=%llu",
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
- seq_puts(seq, ",subvol=");
- seq_dentry(seq, dentry, " \t\n\\");
+ subvol_name = btrfs_get_subvol_name_from_objectid(info,
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ if (!IS_ERR(subvol_name)) {
+ seq_puts(seq, ",subvol=");
+ seq_escape(seq, subvol_name, " \t\n\\");
+ kfree(subvol_name);
+ }
return 0;
}
@@ -1427,8 +1433,8 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
goto out;
}
}
- subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb),
- subvol_objectid);
+ subvol_name = btrfs_get_subvol_name_from_objectid(
+ btrfs_sb(mnt->mnt_sb), subvol_objectid);
if (IS_ERR(subvol_name)) {
root = ERR_CAST(subvol_name);
subvol_name = NULL;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index f05341bda1d1..383546ff62f0 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -25,6 +25,7 @@
#include <linux/bug.h>
#include <linux/genhd.h>
#include <linux/debugfs.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "disk-io.h"
@@ -749,7 +750,9 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
{
int error = 0;
struct btrfs_device *dev;
+ unsigned int nofs_flag;
+ nofs_flag = memalloc_nofs_save();
list_for_each_entry(dev, &fs_devices->devices, dev_list) {
struct hd_struct *disk;
struct kobject *disk_kobj;
@@ -768,6 +771,7 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
if (error)
break;
}
+ memalloc_nofs_restore(nofs_flag);
return error;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index bcfb7a772c8e..ec8706a6e9c6 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3311,11 +3311,13 @@ fail:
btrfs_free_path(path);
out_unlock:
mutex_unlock(&dir->log_mutex);
- if (ret == -ENOSPC) {
+ if (err == -ENOSPC) {
btrfs_set_log_full_commit(root->fs_info, trans);
- ret = 0;
- } else if (ret < 0)
- btrfs_abort_transaction(trans, ret);
+ err = 0;
+ } else if (err < 0 && err != -ENOENT) {
+ /* ENOENT can be returned if the entry hasn't been fsynced yet */
+ btrfs_abort_transaction(trans, err);
+ }
btrfs_end_log_trans(root);
@@ -3854,11 +3856,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
fs_info->csum_root,
ds + cs, ds + cs + cl - 1,
&ordered_sums, 0);
- if (ret) {
- btrfs_release_path(dst_path);
- kfree(ins_data);
- return ret;
- }
+ if (ret)
+ break;
}
}
}
@@ -3871,7 +3870,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
* we have to do this after the loop above to avoid changing the
* log tree while trying to change the log tree.
*/
- ret = 0;
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6d34842912e8..f3cb042a28d5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -16,6 +16,7 @@
* Boston, MA 021110-1307, USA.
*/
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
@@ -4174,6 +4175,7 @@ static int btrfs_uuid_scan_kthread(void *data)
goto skip;
}
update_tree:
+ btrfs_release_path(path);
if (!btrfs_is_empty_uuid(root_item.uuid)) {
ret = btrfs_uuid_tree_add(trans, fs_info,
root_item.uuid,
@@ -4199,6 +4201,7 @@ update_tree:
}
skip:
+ btrfs_release_path(path);
if (trans) {
ret = btrfs_end_transaction(trans);
trans = NULL;
@@ -4206,7 +4209,6 @@ skip:
break;
}
- btrfs_release_path(path);
if (key.offset < (u64)-1) {
key.offset++;
} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
@@ -6277,8 +6279,17 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
u64 devid, u8 *dev_uuid)
{
struct btrfs_device *device;
+ unsigned int nofs_flag;
+ /*
+ * We call this under the chunk_mutex, so we want to use NOFS for this
+ * allocation, however we don't want to change btrfs_alloc_device() to
+ * always do NOFS because we use it in a lot of other GFP_KERNEL safe
+ * places.
+ */
+ nofs_flag = memalloc_nofs_save();
device = btrfs_alloc_device(NULL, &devid, dev_uuid);
+ memalloc_nofs_restore(nofs_flag);
if (IS_ERR(device))
return NULL;
@@ -6902,6 +6913,14 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
mutex_lock(&fs_info->chunk_mutex);
/*
+ * It is possible for mount and umount to race in such a way that
+ * we execute this code path, but open_fs_devices failed to clear
+ * total_rw_bytes. We certainly want it cleared before reading the
+ * device items, so clear it here.
+ */
+ fs_info->fs_devices->total_rw_bytes = 0;
+
+ /*
* Read all device items, and then all the chunk items. All
* device items are found before any chunk item (their object id
* is smaller than the lowest possible object id for a chunk