diff options
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- | fs/btrfs/backref.c | 1 | ||||
-rw-r--r-- | fs/btrfs/ctree.c | 8 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 3 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 15 | ||||
-rw-r--r-- | fs/btrfs/export.c | 8 | ||||
-rw-r--r-- | fs/btrfs/export.h | 5 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 40 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 53 | ||||
-rw-r--r-- | fs/btrfs/extent_io.h | 6 | ||||
-rw-r--r-- | fs/btrfs/file-item.c | 6 | ||||
-rw-r--r-- | fs/btrfs/free-space-cache.c | 6 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 119 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 44 | ||||
-rw-r--r-- | fs/btrfs/print-tree.c | 12 | ||||
-rw-r--r-- | fs/btrfs/send.c | 67 | ||||
-rw-r--r-- | fs/btrfs/super.c | 18 | ||||
-rw-r--r-- | fs/btrfs/sysfs.c | 4 | ||||
-rw-r--r-- | fs/btrfs/tree-log.c | 18 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 21 |
19 files changed, 303 insertions, 151 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index e4d5e6eae409..1cf75d1032e1 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1420,6 +1420,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); ulist_free(*roots); + *roots = NULL; return ret; } node = ulist_next(tmp, &uiter); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f5a8c0d26cf3..cf1e8ba50f6b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1367,7 +1367,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); - extent_buffer_get(eb_rewin); + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin), + eb_rewin, btrfs_header_level(eb_rewin)); btrfs_tree_read_lock(eb_rewin); __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); WARN_ON(btrfs_header_nritems(eb_rewin) > @@ -1438,8 +1439,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (!eb) return NULL; - extent_buffer_get(eb); - btrfs_tree_read_lock(eb); if (old_root) { btrfs_set_header_bytenr(eb, eb->start); btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); @@ -1447,6 +1446,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq) btrfs_set_header_level(eb, old_root->level); btrfs_set_header_generation(eb, old_generation); } + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, + btrfs_header_level(eb)); + btrfs_tree_read_lock(eb); if (tm) __tree_mod_log_rewind(fs_info, eb, time_seq, tm); else diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5412b12491cb..19a668e9164b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1257,6 +1257,7 @@ struct btrfs_root { int send_in_progress; struct btrfs_subvolume_writers *subv_writers; atomic_t will_be_snapshotted; + atomic_t snapshot_force_cow; /* For qgroup metadata space reserve */ atomic64_t qgroup_meta_rsv; @@ -3262,6 +3263,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); int btrfs_sync_fs(struct super_block *sb, int wait); +char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid); static inline __printf(2, 3) void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 096c015b22a4..ace58d6a270b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1200,6 +1200,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, refcount_set(&root->refs, 1); atomic_set(&root->will_be_snapshotted, 0); atomic64_set(&root->qgroup_meta_rsv, 0); + atomic_set(&root->snapshot_force_cow, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; @@ -1509,9 +1510,16 @@ int btrfs_init_fs_root(struct btrfs_root *root) spin_lock_init(&root->ino_cache_lock); init_waitqueue_head(&root->ino_cache_wait); - ret = get_anon_bdev(&root->anon_dev); - if (ret) - goto fail; + /* + * Don't assign anonymous block device to roots that are not exposed to + * userspace, the id pool is limited to 1M + */ + if (is_fstree(root->root_key.objectid) && + btrfs_root_refs(&root->root_item) > 0) { + ret = get_anon_bdev(&root->anon_dev); + if (ret) + goto fail; + } mutex_lock(&root->objectid_mutex); ret = btrfs_find_highest_objectid(root, @@ -4337,6 +4345,7 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache) cache->io_ctl.inode = NULL; iput(inode); } + ASSERT(cache->io_ctl.pages == NULL); btrfs_put_block_group(cache); } diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 3aeb5770f896..b6ce765aa7f3 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -56,9 +56,9 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, return type; } -static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u32 generation, - int check_generation) +struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation, + int check_generation) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; @@ -151,7 +151,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); } -static struct dentry *btrfs_get_parent(struct dentry *child) +struct dentry *btrfs_get_parent(struct dentry *child) { struct inode *dir = d_inode(child); struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index 91b3908e7c54..15db02462141 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -17,4 +17,9 @@ struct btrfs_fid { u64 parent_root_objectid; } __attribute__ ((packed)); +struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation, + int check_generation); +struct dentry *btrfs_get_parent(struct dentry *child); + #endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 51e26f90f0bb..00481cfe6cfc 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1170,12 +1170,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, if (type == BTRFS_SHARED_BLOCK_REF_KEY) { ASSERT(eb->fs_info); /* - * Every shared one has parent tree - * block, which must be aligned to - * nodesize. + * Every shared one has parent tree block, + * which must be aligned to sector size. */ if (offset && - IS_ALIGNED(offset, eb->fs_info->nodesize)) + IS_ALIGNED(offset, eb->fs_info->sectorsize)) return type; } } else if (is_data == BTRFS_REF_TYPE_DATA) { @@ -1184,12 +1183,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, if (type == BTRFS_SHARED_DATA_REF_KEY) { ASSERT(eb->fs_info); /* - * Every shared one has parent tree - * block, which must be aligned to - * nodesize. + * Every shared one has parent tree block, + * which must be aligned to sector size. */ if (offset && - IS_ALIGNED(offset, eb->fs_info->nodesize)) + IS_ALIGNED(offset, eb->fs_info->sectorsize)) return type; } } else { @@ -1199,8 +1197,9 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, } btrfs_print_leaf((struct extent_buffer *)eb); - btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d", - eb->start, type); + btrfs_err(eb->fs_info, + "eb %llu iref 0x%lx invalid extent inline ref type %d", + eb->start, (unsigned long)iref, type); WARN_ON(1); return BTRFS_REF_TYPE_INVALID; @@ -9365,8 +9364,6 @@ out: */ if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); - if (err && err != -EAGAIN) - btrfs_handle_fs_error(fs_info, err, NULL); return err; } @@ -10554,7 +10551,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; - goto out_put_group; + goto out; } /* @@ -10591,7 +10588,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { btrfs_add_delayed_iput(inode); - goto out_put_group; + goto out; } clear_nlink(inode); /* One for the block groups ref */ @@ -10614,13 +10611,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); if (ret < 0) - goto out_put_group; + goto out; if (ret > 0) btrfs_release_path(path); if (ret == 0) { ret = btrfs_del_item(trans, tree_root, path); if (ret) - goto out_put_group; + goto out; btrfs_release_path(path); } @@ -10629,6 +10626,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, &fs_info->block_group_cache_tree); RB_CLEAR_NODE(&block_group->cache_node); + /* Once for the block groups rbtree */ + btrfs_put_block_group(block_group); + if (fs_info->first_logical_byte == block_group->key.objectid) fs_info->first_logical_byte = (u64)-1; spin_unlock(&fs_info->block_group_cache_lock); @@ -10778,10 +10778,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ret = remove_block_group_free_space(trans, fs_info, block_group); if (ret) - goto out_put_group; - - /* Once for the block groups rbtree */ - btrfs_put_block_group(block_group); + goto out; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) @@ -10791,10 +10788,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); -out_put_group: +out: /* Once for the lookup reference */ btrfs_put_block_group(block_group); -out: btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a8be9478ca3e..0ba338cffa93 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1721,7 +1721,8 @@ static int __process_pages_contig(struct address_space *mapping, if (!PageDirty(pages[i]) || pages[i]->mapping != mapping) { unlock_page(pages[i]); - put_page(pages[i]); + for (; i < ret; i++) + put_page(pages[i]); err = -EAGAIN; goto out; } @@ -4322,6 +4323,8 @@ int try_release_extent_mapping(struct extent_map_tree *map, /* once for us */ free_extent_map(em); + + cond_resched(); /* Allow large-extent preemption. */ } } return try_release_extent_state(map, tree, page, mask); @@ -4862,25 +4865,28 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, static void check_buffer_tree_ref(struct extent_buffer *eb) { int refs; - /* the ref bit is tricky. We have to make sure it is set - * if we have the buffer dirty. Otherwise the - * code to free a buffer can end up dropping a dirty - * page + /* + * The TREE_REF bit is first set when the extent_buffer is added + * to the radix tree. It is also reset, if unset, when a new reference + * is created by find_extent_buffer. * - * Once the ref bit is set, it won't go away while the - * buffer is dirty or in writeback, and it also won't - * go away while we have the reference count on the - * eb bumped. + * It is only cleared in two cases: freeing the last non-tree + * reference to the extent_buffer when its STALE bit is set or + * calling releasepage when the tree reference is the only reference. * - * We can't just set the ref bit without bumping the - * ref on the eb because free_extent_buffer might - * see the ref bit and try to clear it. If this happens - * free_extent_buffer might end up dropping our original - * ref by mistake and freeing the page before we are able - * to add one more ref. + * In both cases, care is taken to ensure that the extent_buffer's + * pages are not under io. However, releasepage can be concurrently + * called with creating new references, which is prone to race + * conditions between the calls to check_buffer_tree_ref in those + * codepaths and clearing TREE_REF in try_release_extent_buffer. * - * So bump the ref count first, then set the bit. If someone - * beat us to it, drop the ref we added. + * The actual lifetime of the extent_buffer in the radix tree is + * adequately protected by the refcount, but the TREE_REF bit and + * its corresponding reference are not. To protect against this + * class of races, we call check_buffer_tree_ref from the codepaths + * which trigger io after they set eb->io_pages. Note that once io is + * initiated, TREE_REF can no longer be cleared, so that is the + * moment at which any such race is best fixed. */ refs = atomic_read(&eb->refs); if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) @@ -5344,6 +5350,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; atomic_set(&eb->io_pages, num_reads); + /* + * It is possible for releasepage to clear the TREE_REF bit before we + * set io_pages. See check_buffer_tree_ref for a more detailed comment. + */ + check_buffer_tree_ref(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; @@ -5437,9 +5448,9 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, } } -int read_extent_buffer_to_user(const struct extent_buffer *eb, - void __user *dstv, - unsigned long start, unsigned long len) +int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, + void __user *dstv, + unsigned long start, unsigned long len) { size_t cur; size_t offset; @@ -5460,7 +5471,7 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb, cur = min(len, (PAGE_SIZE - offset)); kaddr = page_address(page); - if (copy_to_user(dst, kaddr + offset, cur)) { + if (probe_user_write(dst, kaddr + offset, cur)) { ret = -EFAULT; break; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index e5535bbe6953..90c5095ae97e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -455,9 +455,9 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, void read_extent_buffer(const struct extent_buffer *eb, void *dst, unsigned long start, unsigned long len); -int read_extent_buffer_to_user(const struct extent_buffer *eb, - void __user *dst, unsigned long start, - unsigned long len); +int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, + void __user *dst, unsigned long start, + unsigned long len); void write_extent_buffer_fsid(struct extent_buffer *eb, const void *src); void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb, const void *src); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 717d82d51bb1..edd5f152e448 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -795,10 +795,12 @@ again: nritems = btrfs_header_nritems(path->nodes[0]); if (!nritems || (path->slots[0] >= nritems - 1)) { ret = btrfs_next_leaf(root, path); - if (ret == 1) + if (ret < 0) { + goto out; + } else if (ret > 0) { found_next = 1; - if (ret != 0) goto insert; + } slot = path->slots[0]; } btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index abeb26d48d0a..9bf72a9088ac 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1169,7 +1169,6 @@ static int __btrfs_wait_cache_io(struct btrfs_root *root, ret = update_cache_item(trans, root, inode, path, offset, io_ctl->entries, io_ctl->bitmaps); out: - io_ctl_free(io_ctl); if (ret) { invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; @@ -1334,6 +1333,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * them out later */ io_ctl_drop_pages(io_ctl); + io_ctl_free(io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state, GFP_NOFS); @@ -2169,7 +2169,7 @@ out: static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat) { - struct btrfs_free_space *left_info; + struct btrfs_free_space *left_info = NULL; struct btrfs_free_space *right_info; bool merged = false; u64 offset = info->offset; @@ -2184,7 +2184,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, if (right_info && rb_prev(&right_info->offset_index)) left_info = rb_entry(rb_prev(&right_info->offset_index), struct btrfs_free_space, offset_index); - else + else if (!right_info) left_info = tree_search_offset(ctl, offset - 1, 0, 0); if (right_info && !right_info->bitmap) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2a196bb134d9..e985e820724e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -629,7 +629,21 @@ cont: btrfs_free_reserved_data_space_noquota(inode, start, end - start + 1); - goto free_pages_out; + + /* + * Ensure we only free the compressed pages if we have + * them allocated, as we can still reach here with + * inode_need_compress() == false. + */ + if (pages) { + for (i = 0; i < nr_pages; i++) { + WARN_ON(pages[i]->mapping); + put_page(pages[i]); + } + kfree(pages); + } + + return; } } @@ -708,13 +722,6 @@ cleanup_and_bail_uncompressed: *num_added += 1; return; - -free_pages_out: - for (i = 0; i < nr_pages; i++) { - WARN_ON(pages[i]->mapping); - put_page(pages[i]); - } - kfree(pages); } static void free_async_extent_pages(struct async_extent *async_extent) @@ -978,8 +985,8 @@ static noinline int cow_file_range(struct inode *inode, u64 alloc_hint = 0; u64 num_bytes; unsigned long ram_size; - u64 disk_num_bytes; u64 cur_alloc_size = 0; + u64 min_alloc_size; u64 blocksize = fs_info->sectorsize; struct btrfs_key ins; struct extent_map *em; @@ -996,7 +1003,6 @@ static noinline int cow_file_range(struct inode *inode, num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); - disk_num_bytes = num_bytes; inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K); @@ -1023,17 +1029,32 @@ static noinline int cow_file_range(struct inode *inode, } } - BUG_ON(disk_num_bytes > - btrfs_super_total_bytes(fs_info->super_copy)); + BUG_ON(num_bytes > btrfs_super_total_bytes(fs_info->super_copy)); alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(BTRFS_I(inode), start, start + num_bytes - 1, 0); - while (disk_num_bytes > 0) { - cur_alloc_size = disk_num_bytes; + /* + * Relocation relies on the relocated extents to have exactly the same + * size as the original extents. Normally writeback for relocation data + * extents follows a NOCOW path because relocation preallocates the + * extents. However, due to an operation such as scrub turning a block + * group to RO mode, it may fallback to COW mode, so we must make sure + * an extent allocated during COW has exactly the requested size and can + * not be split into smaller extents, otherwise relocation breaks and + * fails during the stage where it updates the bytenr of file extent + * items. + */ + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + min_alloc_size = num_bytes; + else + min_alloc_size = fs_info->sectorsize; + + while (num_bytes > 0) { + cur_alloc_size = num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, - fs_info->sectorsize, 0, alloc_hint, + min_alloc_size, 0, alloc_hint, &ins, 1, 1); if (ret < 0) goto out_unlock; @@ -1097,11 +1118,10 @@ static noinline int cow_file_range(struct inode *inode, delalloc_end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); - if (disk_num_bytes < cur_alloc_size) - disk_num_bytes = 0; + if (num_bytes < cur_alloc_size) + num_bytes = 0; else - disk_num_bytes -= cur_alloc_size; - num_bytes -= cur_alloc_size; + num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; extent_reserved = false; @@ -1139,8 +1159,8 @@ out_unlock: */ if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, - start + cur_alloc_size, - start + cur_alloc_size, + start + cur_alloc_size - 1, + start + cur_alloc_size - 1, locked_page, clear_bits, page_ops); @@ -1315,7 +1335,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 disk_num_bytes; u64 ram_bytes; int extent_type; - int ret, err; + int ret; int type; int nocow; int check_prev = 1; @@ -1440,11 +1460,8 @@ next_slot: * if there are pending snapshots for this root, * we fall into common COW way. */ - if (!nolock) { - err = btrfs_start_write_no_snapshotting(root); - if (!err) - goto out_check; - } + if (!nolock && atomic_read(&root->snapshot_force_cow)) + goto out_check; /* * force cow if csum exists in the range. * this ensure that csum for a given extent are @@ -1453,9 +1470,6 @@ next_slot: ret = csum_exist_in_range(fs_info, disk_bytenr, num_bytes); if (ret) { - if (!nolock) - btrfs_end_write_no_snapshotting(root); - /* * ret could be -EIO if the above fails to read * metadata. @@ -1468,11 +1482,8 @@ next_slot: WARN_ON_ONCE(nolock); goto out_check; } - if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) { - if (!nolock) - btrfs_end_write_no_snapshotting(root); + if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) goto out_check; - } nocow = 1; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + @@ -1485,8 +1496,6 @@ next_slot: out_check: if (extent_end <= start) { path->slots[0]++; - if (!nolock && nocow) - btrfs_end_write_no_snapshotting(root); if (nocow) btrfs_dec_nocow_writers(fs_info, disk_bytenr); goto next_slot; @@ -1508,8 +1517,6 @@ out_check: end, page_started, nr_written, 1, NULL); if (ret) { - if (!nolock && nocow) - btrfs_end_write_no_snapshotting(root); if (nocow) btrfs_dec_nocow_writers(fs_info, disk_bytenr); @@ -1529,8 +1536,6 @@ out_check: ram_bytes, BTRFS_COMPRESS_NONE, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { - if (!nolock && nocow) - btrfs_end_write_no_snapshotting(root); if (nocow) btrfs_dec_nocow_writers(fs_info, disk_bytenr); @@ -1569,8 +1574,6 @@ out_check: EXTENT_CLEAR_DATA_RESV, PAGE_UNLOCK | PAGE_SET_PRIVATE2); - if (!nolock && nocow) - btrfs_end_write_no_snapshotting(root); cur_offset = extent_end; /* @@ -8707,7 +8710,6 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) /* bio split */ ASSERT(map_length <= INT_MAX); - atomic_inc(&dip->pending_bios); do { clone_len = min_t(int, submit_len, map_length); @@ -8758,7 +8760,8 @@ submit: if (!status) return 0; - bio_put(bio); + if (bio != orig_bio) + bio_put(bio); out_err: dip->errors = 1; /* @@ -8798,7 +8801,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, bio->bi_private = dip; dip->orig_bio = bio; dip->dio_bio = dio_bio; - atomic_set(&dip->pending_bios, 0); + atomic_set(&dip->pending_bios, 1); io_bio = btrfs_io_bio(bio); io_bio->logical = file_offset; @@ -8947,9 +8950,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) dio_data.overwrite = 1; inode_unlock(inode); relock = true; - } else if (iocb->ki_flags & IOCB_NOWAIT) { - ret = -EAGAIN; - goto out; } ret = btrfs_delalloc_reserve_space(inode, &data_reserved, offset, count); @@ -9187,20 +9187,17 @@ again: /* * Qgroup reserved space handler * Page here will be either - * 1) Already written to disk - * In this case, its reserved space is released from data rsv map - * and will be freed by delayed_ref handler finally. - * So even we call qgroup_free_data(), it won't decrease reserved - * space. - * 2) Not written to disk - * This means the reserved space should be freed here. However, - * if a truncate invalidates the page (by clearing PageDirty) - * and the page is accounted for while allocating extent - * in btrfs_check_data_free_space() we let delayed_ref to - * free the entire extent. + * 1) Already written to disk or ordered extent already submitted + * Then its QGROUP_RESERVED bit in io_tree is already cleaned. + * Qgroup will be handled by its qgroup_record then. + * btrfs_qgroup_free_data() call will do nothing here. + * + * 2) Not written to disk yet + * Then btrfs_qgroup_free_data() call will clear the QGROUP_RESERVED + * bit of its io_tree, and free the qgroup reserved data space. + * Since the IO will never happen for this page. */ - if (PageDirty(page)) - btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e82b4f3f490c..56123ce3b9f0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -655,6 +655,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct btrfs_pending_snapshot *pending_snapshot; struct btrfs_trans_handle *trans; int ret; + bool snapshot_force_cow = false; if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return -EINVAL; @@ -671,6 +672,11 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, goto free_pending; } + /* + * Force new buffered writes to reserve space even when NOCOW is + * possible. This is to avoid later writeback (running dealloc) to + * fallback to COW mode and unexpectedly fail with ENOSPC. + */ atomic_inc(&root->will_be_snapshotted); smp_mb__after_atomic(); btrfs_wait_for_no_snapshotting_writes(root); @@ -679,6 +685,14 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto dec_and_free; + /* + * All previous writes have started writeback in NOCOW mode, so now + * we force future writes to fallback to COW mode during snapshot + * creation. + */ + atomic_inc(&root->snapshot_force_cow); + snapshot_force_cow = true; + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); btrfs_init_block_rsv(&pending_snapshot->block_rsv, @@ -744,6 +758,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, fail: btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); dec_and_free: + if (snapshot_force_cow) + atomic_dec(&root->snapshot_force_cow); if (atomic_dec_and_test(&root->will_be_snapshotted)) wake_up_atomic_t(&root->will_be_snapshotted); free_pending: @@ -2021,9 +2037,14 @@ static noinline int copy_to_sk(struct btrfs_path *path, sh.len = item_len; sh.transid = found_transid; - /* copy search result header */ - if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) { - ret = -EFAULT; + /* + * Copy search result header. If we fault then loop again so we + * can fault in the pages and -EFAULT there if there's a + * problem. Otherwise we'll fault and then copy the buffer in + * properly this next time through + */ + if (probe_user_write(ubuf + *sk_offset, &sh, sizeof(sh))) { + ret = 0; goto out; } @@ -2031,10 +2052,14 @@ static noinline int copy_to_sk(struct btrfs_path *path, if (item_len) { char __user *up = ubuf + *sk_offset; - /* copy the item */ - if (read_extent_buffer_to_user(leaf, up, - item_off, item_len)) { - ret = -EFAULT; + /* + * Copy the item, same behavior as above, but reset the + * * sk_offset so we copy the full thing again. + */ + if (read_extent_buffer_to_user_nofault(leaf, up, + item_off, item_len)) { + ret = 0; + *sk_offset -= sizeof(sh); goto out; } @@ -2122,6 +2147,11 @@ static noinline int search_ioctl(struct inode *inode, key.offset = sk->min_offset; while (1) { + ret = fault_in_pages_writeable(ubuf + sk_offset, + *buf_size - sk_offset); + if (ret) + break; + ret = btrfs_search_forward(root, &key, path, sk->min_transid); if (ret != 0) { if (ret > 0) diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 47336d4b19d8..f96eb176166d 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -116,9 +116,10 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) * offset is supposed to be a tree block which * must be aligned to nodesize. */ - if (!IS_ALIGNED(offset, eb->fs_info->nodesize)) - pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n", - offset, (unsigned long long)eb->fs_info->nodesize); + if (!IS_ALIGNED(offset, eb->fs_info->sectorsize)) + pr_info( + "\t\t\t(parent %llu not aligned to sectorsize %u)\n", + offset, eb->fs_info->sectorsize); break; case BTRFS_EXTENT_DATA_REF_KEY: dref = (struct btrfs_extent_data_ref *)(&iref->offset); @@ -133,8 +134,9 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) * must be aligned to nodesize. */ if (!IS_ALIGNED(offset, eb->fs_info->nodesize)) - pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n", - offset, (unsigned long long)eb->fs_info->nodesize); + pr_info( + "\t\t\t(parent %llu not aligned to sectorsize %u)\n", + offset, eb->fs_info->sectorsize); break; default: pr_cont("(extent %llu has INVALID ref type %d)\n", diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index ca15d65a2070..654ab6e57ec3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -35,6 +35,7 @@ #include "btrfs_inode.h" #include "transaction.h" #include "compression.h" +#include "xattr.h" /* * Maximum number of references an extent can have in order for us to attempt to @@ -4554,6 +4555,10 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, struct fs_path *p; struct posix_acl_xattr_header dummy_acl; + /* Capabilities are emitted by finish_inode_if_needed */ + if (!strncmp(name, XATTR_NAME_CAPS, name_len)) + return 0; + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -5096,6 +5101,64 @@ static int send_extent_data(struct send_ctx *sctx, return 0; } +/* + * Search for a capability xattr related to sctx->cur_ino. If the capability is + * found, call send_set_xattr function to emit it. + * + * Return 0 if there isn't a capability, or when the capability was emitted + * successfully, or < 0 if an error occurred. + */ +static int send_capabilities(struct send_ctx *sctx) +{ + struct fs_path *fspath = NULL; + struct btrfs_path *path; + struct btrfs_dir_item *di; + struct extent_buffer *leaf; + unsigned long data_ptr; + char *buf = NULL; + int buf_len; + int ret = 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino, + XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0); + if (!di) { + /* There is no xattr for this inode */ + goto out; + } else if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + leaf = path->nodes[0]; + buf_len = btrfs_dir_data_len(leaf, di); + + fspath = fs_path_alloc(); + buf = kmalloc(buf_len, GFP_KERNEL); + if (!fspath || !buf) { + ret = -ENOMEM; + goto out; + } + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); + if (ret < 0) + goto out; + + data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di); + read_extent_buffer(leaf, buf, data_ptr, buf_len); + + ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS, + strlen(XATTR_NAME_CAPS), buf, buf_len); +out: + kfree(buf); + fs_path_free(fspath); + btrfs_free_path(path); + return ret; +} + static int clone_range(struct send_ctx *sctx, struct clone_root *clone_root, const u64 disk_byte, @@ -5907,6 +5970,10 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) goto out; } + ret = send_capabilities(sctx); + if (ret < 0) + goto out; + /* * If other directory inodes depended on our current directory * inode's move/rename, now do their move/rename operations. diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 17a8463ef35c..eb64d4b159e0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -939,8 +939,8 @@ out: return error; } -static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, - u64 subvol_objectid) +char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid) { struct btrfs_root *root = fs_info->tree_root; struct btrfs_root *fs_root; @@ -1221,6 +1221,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) { struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); char *compress_type; + const char *subvol_name; if (btrfs_test_opt(info, DEGRADED)) seq_puts(seq, ",degraded"); @@ -1307,8 +1308,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) #endif seq_printf(seq, ",subvolid=%llu", BTRFS_I(d_inode(dentry))->root->root_key.objectid); - seq_puts(seq, ",subvol="); - seq_dentry(seq, dentry, " \t\n\\"); + subvol_name = btrfs_get_subvol_name_from_objectid(info, + BTRFS_I(d_inode(dentry))->root->root_key.objectid); + if (!IS_ERR(subvol_name)) { + seq_puts(seq, ",subvol="); + seq_escape(seq, subvol_name, " \t\n\\"); + kfree(subvol_name); + } return 0; } @@ -1427,8 +1433,8 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, goto out; } } - subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb), - subvol_objectid); + subvol_name = btrfs_get_subvol_name_from_objectid( + btrfs_sb(mnt->mnt_sb), subvol_objectid); if (IS_ERR(subvol_name)) { root = ERR_CAST(subvol_name); subvol_name = NULL; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index f05341bda1d1..383546ff62f0 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -25,6 +25,7 @@ #include <linux/bug.h> #include <linux/genhd.h> #include <linux/debugfs.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "disk-io.h" @@ -749,7 +750,9 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, { int error = 0; struct btrfs_device *dev; + unsigned int nofs_flag; + nofs_flag = memalloc_nofs_save(); list_for_each_entry(dev, &fs_devices->devices, dev_list) { struct hd_struct *disk; struct kobject *disk_kobj; @@ -768,6 +771,7 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, if (error) break; } + memalloc_nofs_restore(nofs_flag); return error; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index bcfb7a772c8e..ec8706a6e9c6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3311,11 +3311,13 @@ fail: btrfs_free_path(path); out_unlock: mutex_unlock(&dir->log_mutex); - if (ret == -ENOSPC) { + if (err == -ENOSPC) { btrfs_set_log_full_commit(root->fs_info, trans); - ret = 0; - } else if (ret < 0) - btrfs_abort_transaction(trans, ret); + err = 0; + } else if (err < 0 && err != -ENOENT) { + /* ENOENT can be returned if the entry hasn't been fsynced yet */ + btrfs_abort_transaction(trans, err); + } btrfs_end_log_trans(root); @@ -3854,11 +3856,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, fs_info->csum_root, ds + cs, ds + cs + cl - 1, &ordered_sums, 0); - if (ret) { - btrfs_release_path(dst_path); - kfree(ins_data); - return ret; - } + if (ret) + break; } } } @@ -3871,7 +3870,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * we have to do this after the loop above to avoid changing the * log tree while trying to change the log tree. */ - ret = 0; while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, struct btrfs_ordered_sum, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6d34842912e8..f3cb042a28d5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/bio.h> #include <linux/slab.h> #include <linux/buffer_head.h> @@ -4174,6 +4175,7 @@ static int btrfs_uuid_scan_kthread(void *data) goto skip; } update_tree: + btrfs_release_path(path); if (!btrfs_is_empty_uuid(root_item.uuid)) { ret = btrfs_uuid_tree_add(trans, fs_info, root_item.uuid, @@ -4199,6 +4201,7 @@ update_tree: } skip: + btrfs_release_path(path); if (trans) { ret = btrfs_end_transaction(trans); trans = NULL; @@ -4206,7 +4209,6 @@ skip: break; } - btrfs_release_path(path); if (key.offset < (u64)-1) { key.offset++; } else if (key.type < BTRFS_ROOT_ITEM_KEY) { @@ -6277,8 +6279,17 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, u64 devid, u8 *dev_uuid) { struct btrfs_device *device; + unsigned int nofs_flag; + /* + * We call this under the chunk_mutex, so we want to use NOFS for this + * allocation, however we don't want to change btrfs_alloc_device() to + * always do NOFS because we use it in a lot of other GFP_KERNEL safe + * places. + */ + nofs_flag = memalloc_nofs_save(); device = btrfs_alloc_device(NULL, &devid, dev_uuid); + memalloc_nofs_restore(nofs_flag); if (IS_ERR(device)) return NULL; @@ -6902,6 +6913,14 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) mutex_lock(&fs_info->chunk_mutex); /* + * It is possible for mount and umount to race in such a way that + * we execute this code path, but open_fs_devices failed to clear + * total_rw_bytes. We certainly want it cleared before reading the + * device items, so clear it here. + */ + fs_info->fs_devices->total_rw_bytes = 0; + + /* * Read all device items, and then all the chunk items. All * device items are found before any chunk item (their object id * is smaller than the lowest possible object id for a chunk |