Diffstat (limited to 'fs/btrfs')
40 files changed, 913 insertions, 344 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index c701a19fac53..16e5e07cb5d1 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -136,6 +136,7 @@ struct share_check { u64 root_objectid; u64 inum; int share_count; + bool have_delayed_delete_refs; }; static inline int extent_is_shared(struct share_check *sc) @@ -286,8 +287,10 @@ static void prelim_release(struct preftree *preftree) struct prelim_ref *ref, *next_ref; rbtree_postorder_for_each_entry_safe(ref, next_ref, - &preftree->root.rb_root, rbnode) + &preftree->root.rb_root, rbnode) { + free_inode_elem_list(ref->inode_list); free_pref(ref); + } preftree->root = RB_ROOT_CACHED; preftree->count = 0; @@ -428,6 +431,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, u64 wanted_disk_byte = ref->wanted_disk_byte; u64 count = 0; u64 data_offset; + u8 type; if (level != 0) { eb = path->nodes[level]; @@ -482,6 +486,9 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, continue; } fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + type = btrfs_file_extent_type(eb, fi); + if (type == BTRFS_FILE_EXTENT_INLINE) + goto next; disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); data_offset = btrfs_file_extent_offset(eb, fi); @@ -641,6 +648,18 @@ unode_aux_to_inode_list(struct ulist_node *node) return (struct extent_inode_elem *)(uintptr_t)node->aux; } +static void free_leaf_list(struct ulist *ulist) +{ + struct ulist_node *node; + struct ulist_iterator uiter; + + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(ulist, &uiter))) + free_inode_elem_list(unode_aux_to_inode_list(node)); + + ulist_free(ulist); +} + /* * We maintain three separate rbtrees: one for direct refs, one for * indirect refs which have a key, and one for indirect refs which do not @@ -755,7 +774,11 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, cond_resched(); } out: - ulist_free(parents); + /* + * We may have inode lists attached to refs in the parents ulist, so we + * must free them before freeing the ulist and its refs. + */ + free_leaf_list(parents); return ret; } @@ -812,16 +835,11 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, struct preftrees *preftrees, struct share_check *sc) { struct btrfs_delayed_ref_node *node; - struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct btrfs_key key; - struct btrfs_key tmp_op_key; struct rb_node *n; int count; int ret = 0; - if (extent_op && extent_op->update_key) - btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key); - spin_lock(&head->lock); for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) { node = rb_entry(n, struct btrfs_delayed_ref_node, @@ -847,10 +865,16 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, case BTRFS_TREE_BLOCK_REF_KEY: { /* NORMAL INDIRECT METADATA backref */ struct btrfs_delayed_tree_ref *ref; + struct btrfs_key *key_ptr = NULL; + + if (head->extent_op && head->extent_op->update_key) { + btrfs_disk_key_to_cpu(&key, &head->extent_op->key); + key_ptr = &key; + } ref = btrfs_delayed_node_to_tree_ref(node); ret = add_indirect_ref(fs_info, preftrees, ref->root, - &tmp_op_key, ref->level + 1, + key_ptr, ref->level + 1, node->bytenr, count, sc, GFP_ATOMIC); break; @@ -876,13 +900,22 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, key.offset = ref->offset; /* - * Found a inum that doesn't match our known inum, we - * know it's shared. 
+ * If we have a share check context and a reference for + * another inode, we can't exit immediately. This is + * because even if this is a BTRFS_ADD_DELAYED_REF + * reference we may find next a BTRFS_DROP_DELAYED_REF + * which cancels out this ADD reference. + * + * If this is a DROP reference and there was no previous + * ADD reference, then we need to signal that when we + * process references from the extent tree (through + * add_inline_refs() and add_keyed_refs()), we should + * not exit early if we find a reference for another + * inode, because one of the delayed DROP references + * may cancel that reference in the extent tree. */ - if (sc && sc->inum && ref->objectid != sc->inum) { - ret = BACKREF_FOUND_SHARED; - goto out; - } + if (sc && count < 0) + sc->have_delayed_delete_refs = true; ret = add_indirect_ref(fs_info, preftrees, ref->root, &key, 0, node->bytenr, count, sc, @@ -912,7 +945,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, } if (!ret) ret = extent_is_shared(sc); -out: + spin_unlock(&head->lock); return ret; } @@ -1015,7 +1048,8 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum) { + if (sc && sc->inum && key.objectid != sc->inum && + !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; } @@ -1025,6 +1059,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, ret = add_indirect_ref(fs_info, preftrees, root, &key, 0, bytenr, count, sc, GFP_NOFS); + break; } default: @@ -1114,7 +1149,8 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum) { + if (sc && sc->inum && key.objectid != sc->inum && + !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; } @@ -1353,6 +1389,12 @@ again: if (ret < 0) goto out; ref->inode_list = eie; + /* + * We transferred the list ownership to the ref, + * so set to NULL to avoid a double free in case + * an error happens after this. + */ + eie = NULL; } ret = ulist_add_merge_ptr(refs, ref->parent, ref->inode_list, @@ -1378,6 +1420,14 @@ again: eie->next = ref->inode_list; } eie = NULL; + /* + * We have transferred the inode list ownership from + * this ref to the ref we added to the 'refs' ulist. + * So set this ref's inode list to NULL to avoid + * use-after-free when our caller uses it or double + * frees in case an error happens before we return. + */ + ref->inode_list = NULL; } cond_resched(); } @@ -1394,24 +1444,6 @@ out: return ret; } -static void free_leaf_list(struct ulist *blocks) -{ - struct ulist_node *node = NULL; - struct extent_inode_elem *eie; - struct ulist_iterator uiter; - - ULIST_ITER_INIT(&uiter); - while ((node = ulist_next(blocks, &uiter))) { - if (!node->aux) - continue; - eie = unode_aux_to_inode_list(node); - free_inode_elem_list(eie); - node->aux = 0; - } - - ulist_free(blocks); -} - /* * Finds all leafs with a reference to the specified combination of bytenr and * offset. 
key_list_head will point to a list of corresponding keys (caller must @@ -1537,6 +1569,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, .root_objectid = root->root_key.objectid, .inum = inum, .share_count = 0, + .have_delayed_delete_refs = false, }; ulist_init(roots); @@ -1571,6 +1604,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, break; bytenr = node->val; shared.share_count = 0; + shared.have_delayed_delete_refs = false; cond_resched(); } @@ -2257,20 +2291,14 @@ struct btrfs_data_container *init_data_container(u32 total_bytes) size_t alloc_bytes; alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); - data = kvmalloc(alloc_bytes, GFP_KERNEL); + data = kvzalloc(alloc_bytes, GFP_KERNEL); if (!data) return ERR_PTR(-ENOMEM); - if (total_bytes >= sizeof(*data)) { + if (total_bytes >= sizeof(*data)) data->bytes_left = total_bytes - sizeof(*data); - data->bytes_missing = 0; - } else { + else data->bytes_missing = sizeof(*data) - total_bytes; - data->bytes_left = 0; - } - - data->elem_cnt = 0; - data->elem_missed = 0; return data; } diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index bcf19dfb0af3..278933cd3a09 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2938,6 +2938,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, * attempt. */ wait_for_alloc = true; + force = CHUNK_ALLOC_NO_FORCE; spin_unlock(&space_info->lock); mutex_lock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->chunk_mutex); diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 343400d49bd1..63205d2f4d84 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -29,7 +29,8 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, } else { num_bytes = 0; } - if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { + if (qgroup_to_release_ret && + block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { qgroup_to_release = block_rsv->qgroup_rsv_reserved - block_rsv->qgroup_rsv_size; block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size; @@ -391,7 +392,7 @@ struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); - if (unlikely(block_rsv->size == 0)) + if (unlikely(btrfs_block_rsv_size(block_rsv) == 0)) goto try_reserve; again: ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize); diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index d1428bb73fc5..69770360917c 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -98,4 +98,20 @@ static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, btrfs_block_rsv_release(fs_info, block_rsv, 0); } +/* + * Get the size of a block reserve in a context where getting a stale value is + * acceptable, instead of accessing it directly and trigger data race warning + * from KCSAN. 
+ */ +static inline u64 btrfs_block_rsv_size(struct btrfs_block_rsv *rsv) +{ + u64 ret; + + spin_lock(&rsv->lock); + ret = rsv->size; + spin_unlock(&rsv->lock); + + return ret; +} + #endif /* BTRFS_BLOCK_RSV_H */ diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 822c615840e8..608e41b61689 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3589,6 +3589,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); if (ret) { + btrfs_tree_unlock(split); + free_extent_buffer(split); btrfs_abort_transaction(trans, ret); return ret; } @@ -5138,10 +5140,12 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) { struct btrfs_key key; + struct btrfs_key orig_key; struct btrfs_disk_key found_key; int ret; btrfs_item_key_to_cpu(path->nodes[0], &key, 0); + orig_key = key; if (key.offset > 0) { key.offset--; @@ -5158,8 +5162,36 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) btrfs_release_path(path); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) + if (ret <= 0) return ret; + + /* + * Previous key not found. Even if we were at slot 0 of the leaf we had + * before releasing the path and calling btrfs_search_slot(), we now may + * be in a slot pointing to the same original key - this can happen if + * after we released the path, one or more items were moved from a + * sibling leaf into the front of the leaf we had due to an insertion + * (see push_leaf_right()). + * If we hit this case and our slot is > 0, just decrement the slot + * so that the caller does not process the same key again, which may or + * may not break the caller, depending on its logic. + */ + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); + ret = comp_keys(&found_key, &orig_key); + if (ret == 0) { + if (path->slots[0] > 0) { + path->slots[0]--; + return 0; + } + /* + * At slot 0, same key as before, it means orig_key is + * the lowest, leftmost key in the tree. We're done. 
+ */ + return 1; + } + } + btrfs_item_key(path->nodes[0], &found_key, 0); ret = comp_keys(&found_key, &key); /* diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index cd77c0621a55..b141a7ba4507 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -482,8 +482,6 @@ struct btrfs_swapfile_pin { bool is_block_group; }; -bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); - enum { BTRFS_FS_BARRIER, BTRFS_FS_CLOSING_START, @@ -2727,7 +2725,7 @@ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 objectid, const char *name, int name_len, + u64 index, const char *name, int name_len, int mod); struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index f4f531c4aa96..d3f16dc33d0f 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -324,9 +324,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) } else { if (current->journal_info) flush = BTRFS_RESERVE_FLUSH_LIMIT; - - if (btrfs_transaction_in_commit(fs_info)) - schedule_timeout(1); } if (delalloc_lock) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index e96890475bac..95afe5ef7500 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1137,6 +1137,9 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, if (ret) return ret; + ret = btrfs_record_root_in_trans(trans, node->root); + if (ret) + return ret; ret = btrfs_update_delayed_inode(trans, node->root, path, node); return ret; } @@ -1175,20 +1178,33 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) ret = __btrfs_commit_inode_delayed_items(trans, path, curr_node); if (ret) { - btrfs_release_delayed_node(curr_node); - curr_node = NULL; btrfs_abort_transaction(trans, ret); break; } prev_node = curr_node; curr_node = btrfs_next_delayed_node(curr_node); + /* + * See the comment below about releasing path before releasing + * node. If the commit of delayed items was successful the path + * should always be released, but in case of an error, it may + * point to locked extent buffers (a leaf at the very least). + */ + ASSERT(path->nodes[0] == NULL); btrfs_release_delayed_node(prev_node); } + /* + * Release the path to avoid a potential deadlock and lockdep splat when + * releasing the delayed node, as that requires taking the delayed node's + * mutex. If another task starts running delayed items before we take + * the mutex, it will first lock the mutex and then it may try to lock + * the same btree path (leaf). 
+ */ + btrfs_free_path(path); + if (curr_node) btrfs_release_delayed_node(curr_node); - btrfs_free_path(path); trans->block_rsv = block_rsv; return ret; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 1cb7f5d79765..4abc0db6527e 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -125,7 +125,7 @@ no_valid_dev_replace_entry_found: if (btrfs_find_device(fs_info->fs_devices, BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) { btrfs_err(fs_info, - "replace devid present without an active replace item"); +"replace without active item, run 'device scan --forget' on the target device"); ret = -EUCLEAN; } else { dev_replace->srcdev = NULL; @@ -535,6 +535,23 @@ leave: return ret; } +static int btrfs_check_replace_dev_names(struct btrfs_ioctl_dev_replace_args *args) +{ + if (args->start.srcdevid == 0) { + if (memchr(args->start.srcdev_name, 0, + sizeof(args->start.srcdev_name)) == NULL) + return -ENAMETOOLONG; + } else { + args->start.srcdev_name[0] = 0; + } + + if (memchr(args->start.tgtdev_name, 0, + sizeof(args->start.tgtdev_name)) == NULL) + return -ENAMETOOLONG; + + return 0; +} + int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args) { @@ -547,10 +564,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, default: return -EINVAL; } - - if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || - args->start.tgtdev_name[0] == '\0') - return -EINVAL; + ret = btrfs_check_replace_dev_names(args); + if (ret < 0) + return ret; ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name, args->start.srcdevid, @@ -918,8 +934,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) up_write(&dev_replace->rwsem); /* Scrub for replace must not be running in suspended state */ - ret = btrfs_scrub_cancel(fs_info); - ASSERT(ret != -ENOTCONN); + btrfs_scrub_cancel(fs_info); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 863367c2c620..98c6faa8ce15 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -171,10 +171,40 @@ out_free: return 0; } +static struct btrfs_dir_item *btrfs_lookup_match_dir( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, const char *name, + int name_len, int mod) +{ + const int ins_len = (mod < 0 ? -1 : 0); + const int cow = (mod != 0); + int ret; + + ret = btrfs_search_slot(trans, root, key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return ERR_PTR(-ENOENT); + + return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); +} + /* - * lookup a directory item based on name. 'dir' is the objectid - * we're searching in, and 'mod' tells us if you plan on deleting the - * item (use mod < 0) or changing the options (use mod > 0) + * Lookup for a directory item by name. + * + * @trans: The transaction handle to use. Can be NULL if @mod is 0. + * @root: The root of the target tree. + * @path: Path to use for the search. + * @dir: The inode number (objectid) of the directory. + * @name: The name associated to the directory entry we are looking for. + * @name_len: The length of the name. + * @mod: Used to indicate if the tree search is meant for a read only + * lookup, for a modification lookup or for a deletion lookup, so + * its value should be 0, 1 or -1, respectively. 
+ * + * Returns: NULL if the dir item does not exist, an error pointer if an error + * happened, or a pointer to a dir item if a dir item exists for the given name. */ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -182,23 +212,18 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, const char *name, int name_len, int mod) { - int ret; struct btrfs_key key; - int ins_len = mod < 0 ? -1 : 0; - int cow = mod != 0; + struct btrfs_dir_item *di; key.objectid = dir; key.type = BTRFS_DIR_ITEM_KEY; - key.offset = btrfs_name_hash(name, name_len); - ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) + di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + if (IS_ERR(di) && PTR_ERR(di) == -ENOENT) return NULL; - return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); + return di; } int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, @@ -212,7 +237,6 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, int slot; struct btrfs_path *path; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -221,20 +245,20 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - - /* return back any errors */ - if (ret < 0) - goto out; + di = btrfs_lookup_match_dir(NULL, root, path, &key, name, name_len, 0); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + /* Nothing found, we're safe */ + if (ret == -ENOENT) { + ret = 0; + goto out; + } - /* nothing found, we're safe */ - if (ret > 0) { - ret = 0; - goto out; + if (ret < 0) + goto out; } /* we found an item, look for our name in the item */ - di = btrfs_match_dir_item_name(root->fs_info, path, name, name_len); if (di) { /* our exact name was found */ ret = -EEXIST; @@ -261,35 +285,42 @@ out: } /* - * lookup a directory item based on index. 'dir' is the objectid - * we're searching in, and 'mod' tells us if you plan on deleting the - * item (use mod < 0) or changing the options (use mod > 0) + * Lookup for a directory index item by name and index number. + * + * @trans: The transaction handle to use. Can be NULL if @mod is 0. + * @root: The root of the target tree. + * @path: Path to use for the search. + * @dir: The inode number (objectid) of the directory. + * @index: The index number. + * @name: The name associated to the directory entry we are looking for. + * @name_len: The length of the name. + * @mod: Used to indicate if the tree search is meant for a read only + * lookup, for a modification lookup or for a deletion lookup, so + * its value should be 0, 1 or -1, respectively. * - * The name is used to make sure the index really points to the name you were - * looking for. + * Returns: NULL if the dir index item does not exist, an error pointer if an + * error happened, or a pointer to a dir item if the dir index item exists and + * matches the criteria (name and index number). */ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 objectid, const char *name, int name_len, + u64 index, const char *name, int name_len, int mod) { - int ret; + struct btrfs_dir_item *di; struct btrfs_key key; - int ins_len = mod < 0 ? 
-1 : 0; - int cow = mod != 0; key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; - key.offset = objectid; + key.offset = index; - ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) - return ERR_PTR(-ENOENT); - return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); + di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + if (di == ERR_PTR(-ENOENT)) + return NULL; + + return di; } struct btrfs_dir_item * @@ -346,21 +377,18 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, const char *name, u16 name_len, int mod) { - int ret; struct btrfs_key key; - int ins_len = mod < 0 ? -1 : 0; - int cow = mod != 0; + struct btrfs_dir_item *di; key.objectid = dir; key.type = BTRFS_XATTR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); - ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) + + di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + if (IS_ERR(di) && PTR_ERR(di) == -ENOENT) return NULL; - return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); + return di; } /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f18c6d97932e..b4ed11b5f148 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2239,6 +2239,23 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) fs_info->csum_shash = csum_shash; + /* + * Check if the checksum implementation is a fast accelerated one. + * As-is this is a bit of a hack and should be replaced once the csum + * implementations provide that information themselves. + */ + switch (csum_type) { + case BTRFS_CSUM_TYPE_CRC32: + if (!strstr(crypto_shash_driver_name(csum_shash), "generic")) + set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); + break; + default: + break; + } + + btrfs_info(fs_info, "using %s (%s) checksum algorithm", + btrfs_super_csum_name(csum_type), + crypto_shash_driver_name(csum_shash)); return 0; } @@ -2463,21 +2480,18 @@ static int validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } - if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid, - BTRFS_FSID_SIZE)) { + if (memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "superblock fsid doesn't match fsid of fs_devices: %pU != %pU", - fs_info->super_copy->fsid, fs_info->fs_devices->fsid); + sb->fsid, fs_info->fs_devices->fsid); ret = -EINVAL; } - if (btrfs_fs_incompat(fs_info, METADATA_UUID) && - memcmp(fs_info->fs_devices->metadata_uuid, - fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) { + if (memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb), + BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU", - fs_info->super_copy->metadata_uuid, - fs_info->fs_devices->metadata_uuid); + btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid); ret = -EINVAL; } @@ -2927,7 +2941,7 @@ int open_ctree(struct super_block *sb, ~BTRFS_FEATURE_INCOMPAT_SUPP; if (features) { btrfs_err(fs_info, - "cannot mount because of unsupported optional features (%llx)", + "cannot mount because of unsupported optional features (0x%llx)", features); err = -EINVAL; goto fail_csum; @@ -2965,11 +2979,25 @@ int open_ctree(struct super_block *sb, ~BTRFS_FEATURE_COMPAT_RO_SUPP; if (!sb_rdonly(sb) && features) { btrfs_err(fs_info, - "cannot mount read-write because of unsupported optional features (%llx)", + 
"cannot mount read-write because of unsupported optional features (0x%llx)", features); err = -EINVAL; goto fail_csum; } + /* + * We have unsupported RO compat features, although RO mounted, we + * should not cause any metadata write, including log replay. + * Or we could screw up whatever the new feature requires. + */ + if (unlikely(features && btrfs_super_log_root(disk_super) && + !btrfs_test_opt(fs_info, NOLOGREPLAY))) { + btrfs_err(fs_info, +"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay", + features); + err = -EINVAL; + goto fail_alloc; + } + ret = btrfs_init_workqueues(fs_info, fs_devices); if (ret) { @@ -4075,6 +4103,11 @@ void close_ctree(struct btrfs_fs_info *fs_info) ASSERT(list_empty(&fs_info->delayed_iputs)); set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags); + if (btrfs_check_quota_leak(fs_info)) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_err(fs_info, "qgroup reserved space leaked"); + } + btrfs_free_qgroup_config(fs_info); ASSERT(list_empty(&fs_info->delalloc_roots)); @@ -4377,7 +4410,11 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) */ inode = igrab(&btrfs_inode->vfs_inode); if (inode) { + unsigned int nofs_flag; + + nofs_flag = memalloc_nofs_save(); invalidate_inode_pages2(inode->i_mapping); + memalloc_nofs_restore(nofs_flag); iput(inode); } spin_lock(&root->delalloc_lock); @@ -4495,7 +4532,12 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache) inode = cache->io_ctl.inode; if (inode) { + unsigned int nofs_flag; + + nofs_flag = memalloc_nofs_save(); invalidate_inode_pages2(inode->i_mapping); + memalloc_nofs_restore(nofs_flag); + BTRFS_I(inode)->generation = 0; cache->io_ctl.inode = NULL; iput(inode); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 93cceeba484c..08d1d456e2f0 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -58,7 +58,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, } struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u32 generation, + u64 root_objectid, u64 generation, int check_generation) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -182,8 +182,15 @@ struct dentry *btrfs_get_parent(struct dentry *child) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto fail; + if (ret == 0) { + /* + * Key with offset of -1 found, there would have to exist an + * inode with such number or a root with such id. 
+ */ + ret = -EUCLEAN; + goto fail; + } - BUG_ON(ret == 0); /* Key with offset of -1 found */ if (path->slots[0] == 0) { ret = -ENOENT; goto fail; diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index f32f4113c976..5afb7ca42828 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -19,7 +19,7 @@ struct btrfs_fid { } __attribute__ ((packed)); struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u32 generation, + u64 root_objectid, u64 generation, int check_generation); struct dentry *btrfs_get_parent(struct dentry *child); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 19d2104c0462..a28b0eafb65a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -895,6 +895,11 @@ again: err = -ENOENT; goto out; } else if (WARN_ON(ret)) { + btrfs_print_leaf(path->nodes[0]); + btrfs_err(fs_info, +"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu", + bytenr, num_bytes, parent, root_objectid, owner, + offset); err = -EIO; goto out; } @@ -1238,7 +1243,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, u64 bytes_left, end; u64 aligned_start = ALIGN(start, 1 << 9); - if (WARN_ON(start != aligned_start)) { + /* Adjust the range to be aligned to 512B sectors if necessary. */ + if (start != aligned_start) { len -= aligned_start - start; len = round_down(len, 1 << 9); start = aligned_start; @@ -1676,12 +1682,12 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, parent = ref->parent; ref_root = ref->root; - if (node->ref_mod != 1) { + if (unlikely(node->ref_mod != 1)) { btrfs_err(trans->fs_info, - "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu", + "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu", node->bytenr, node->ref_mod, node->action, ref_root, parent); - return -EIO; + return -EUCLEAN; } if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { BUG_ON(!extent_op || !extent_op->update_flags); @@ -3989,8 +3995,11 @@ have_block_group: ret = 0; } - if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) + if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) { + if (!cache_block_group_error) + cache_block_group_error = -EIO; goto loop; + } /* * Ok we want to try and use the cluster allocator, so diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 95ddeb477797..04788940afaf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4024,11 +4024,12 @@ retry: free_extent_buffer(eb); /* - * the filesystem may choose to bump up nr_to_write. + * The filesystem may choose to bump up nr_to_write. * We have to make sure to honor the new nr_to_write - * at any time + * at any time. 
*/ - nr_to_write_done = wbc->nr_to_write <= 0; + nr_to_write_done = (wbc->sync_mode == WB_SYNC_NONE && + wbc->nr_to_write <= 0); } pagevec_release(&pvec); cond_resched(); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 61b82c69eed5..1a7183cdfe95 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -499,7 +499,9 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bytes_left), GFP_KERNEL); memalloc_nofs_restore(nofs_flag); - BUG_ON(!sums); /* -ENOMEM */ + if (!sums) + return BLK_STS_RESOURCE; + sums->len = bytes_left; ordered = btrfs_lookup_ordered_extent(inode, offset); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d2d32fed8f2e..0cb93f73acb2 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -784,15 +784,16 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, } spin_lock(&ctl->tree_lock); ret = link_free_space(ctl, e); - ctl->total_bitmaps++; - ctl->op->recalc_thresholds(ctl); - spin_unlock(&ctl->tree_lock); if (ret) { + spin_unlock(&ctl->tree_lock); btrfs_err(fs_info, "Duplicate entries in free space cache, dumping"); kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } + ctl->total_bitmaps++; + ctl->op->recalc_thresholds(ctl); + spin_unlock(&ctl->tree_lock); list_add_tail(&e->list, &bitmaps); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7755a0362a3a..c89e85a7da7d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6830,7 +6830,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_ino(root, &objectid); + err = btrfs_find_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -6894,7 +6894,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_ino(root, &objectid); + err = btrfs_find_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -7039,7 +7039,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_ino(root, &objectid); + err = btrfs_find_free_objectid(root, &objectid); if (err) goto out_fail; @@ -9751,8 +9751,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - btrfs_pin_log_trans(root); - root_log_pinned = true; ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, new_dentry->d_name.len, @@ -9768,8 +9766,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - btrfs_pin_log_trans(dest); - dest_log_pinned = true; ret = btrfs_insert_inode_ref(trans, root, old_dentry->d_name.name, old_dentry->d_name.len, @@ -9797,6 +9793,29 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode), 1); } + /* + * Now pin the logs of the roots. We do it to ensure that no other task + * can sync the logs while we are in progress with the rename, because + * that could result in an inconsistency in case any of the inodes that + * are part of this rename operation were logged before. + * + * We pin the logs even if at this precise moment none of the inodes was + * logged before. 
This is because right after we checked for that, some + * other task fsyncing some other inode not involved with this rename + * operation could log that one of our inodes exists. + * + * We don't need to pin the logs before the above calls to + * btrfs_insert_inode_ref(), since those don't ever need to change a log. + */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + btrfs_pin_log_trans(root); + root_log_pinned = true; + } + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) { + btrfs_pin_log_trans(dest); + dest_log_pinned = true; + } + /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); @@ -9911,7 +9930,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, u64 objectid; u64 index; - ret = btrfs_find_free_ino(root, &objectid); + ret = btrfs_find_free_objectid(root, &objectid); if (ret) return ret; @@ -10046,8 +10065,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - btrfs_pin_log_trans(root); - log_pinned = true; ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, new_dentry->d_name.len, @@ -10071,6 +10088,25 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { + /* + * Now pin the log. We do it to ensure that no other task can + * sync the log while we are in progress with the rename, as + * that could result in an inconsistency in case any of the + * inodes that are part of this rename operation were logged + * before. + * + * We pin the log even if at this precise moment none of the + * inodes was logged before. This is because right after we + * checked for that, some other task fsyncing some other inode + * not involved with this rename operation could log that one of + * our inodes exists. + * + * We don't need to pin the logs before the above call to + * btrfs_insert_inode_ref(), since that does not need to change + * a log. + */ + btrfs_pin_log_trans(root); + log_pinned = true; ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), old_dentry->d_name.name, @@ -10380,7 +10416,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_ino(root, &objectid); + err = btrfs_find_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -10663,7 +10699,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_find_free_ino(root, &objectid); + ret = btrfs_find_free_objectid(root, &objectid); if (ret) goto out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 675112aa998f..674d774eb662 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1847,6 +1847,15 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, * are limited to own subvolumes only */ ret = -EPERM; + } else if (btrfs_ino(BTRFS_I(src_inode)) != BTRFS_FIRST_FREE_OBJECTID) { + /* + * Snapshots must be made with the src_inode referring + * to the subvolume inode, otherwise the permission + * checking above is useless because we may have + * permission on a lower directory but not the subvol + * itself. 
+ */ + ret = -EINVAL; } else { ret = btrfs_mksubvol(&file->f_path, name, namelen, BTRFS_I(src_inode)->root, @@ -2110,7 +2119,7 @@ static noinline int key_in_sk(struct btrfs_key *key, static noinline int copy_to_sk(struct btrfs_path *path, struct btrfs_key *key, struct btrfs_ioctl_search_key *sk, - size_t *buf_size, + u64 *buf_size, char __user *ubuf, unsigned long *sk_offset, int *num_found) @@ -2242,7 +2251,7 @@ out: static noinline int search_ioctl(struct inode *inode, struct btrfs_ioctl_search_key *sk, - size_t *buf_size, + u64 *buf_size, char __user *ubuf) { struct btrfs_fs_info *info = btrfs_sb(inode->i_sb); @@ -2314,7 +2323,7 @@ static noinline int btrfs_ioctl_tree_search(struct file *file, struct btrfs_ioctl_search_key sk; struct inode *inode; int ret; - size_t buf_size; + u64 buf_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2348,8 +2357,8 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file, struct btrfs_ioctl_search_args_v2 args; struct inode *inode; int ret; - size_t buf_size; - const size_t buf_limit = SZ_16M; + u64 buf_size; + const u64 buf_limit = SZ_16M; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2824,6 +2833,8 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) } } + btrfs_free_path(path); + path = NULL; if (copy_to_user(argp, subvol_info, sizeof(*subvol_info))) ret = -EFAULT; @@ -2914,6 +2925,8 @@ static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp) } out: + btrfs_free_path(path); + if (!ret || ret == -EOVERFLOW) { rootrefs->num_items = found; /* update min_treeid for next search */ @@ -2925,7 +2938,6 @@ out: } kfree(rootrefs); - btrfs_free_path(path); return ret; } @@ -3045,7 +3057,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) { struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_ioctl_defrag_range_args *range; + struct btrfs_ioctl_defrag_range_args range = {0}; int ret; ret = mnt_want_write_file(file); @@ -3077,33 +3089,28 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) goto out; } - range = kzalloc(sizeof(*range), GFP_KERNEL); - if (!range) { - ret = -ENOMEM; - goto out; - } - if (argp) { - if (copy_from_user(range, argp, - sizeof(*range))) { + if (copy_from_user(&range, argp, sizeof(range))) { ret = -EFAULT; - kfree(range); + goto out; + } + if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) { + ret = -EOPNOTSUPP; goto out; } /* compression requires us to start the IO */ - if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { - range->flags |= BTRFS_DEFRAG_RANGE_START_IO; - range->extent_thresh = (u32)-1; + if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { + range.flags |= BTRFS_DEFRAG_RANGE_START_IO; + range.extent_thresh = (u32)-1; } } else { /* the rest are all set to zero by kzalloc */ - range->len = (u64)-1; + range.len = (u64)-1; } ret = btrfs_defrag_file(file_inode(file), file, - range, BTRFS_OLDEST_GENERATION, 0); + &range, BTRFS_OLDEST_GENERATION, 0); if (ret > 0) ret = 0; - kfree(range); break; default: ret = -EINVAL; @@ -3296,13 +3303,10 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, di_args->bytes_used = btrfs_device_get_bytes_used(dev); di_args->total_bytes = btrfs_device_get_total_bytes(dev); memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); - if (dev->name) { - strncpy(di_args->path, rcu_str_deref(dev->name), - sizeof(di_args->path) - 1); - di_args->path[sizeof(di_args->path) - 1] = 0; - } else { + if (dev->name) + strscpy(di_args->path, 
rcu_str_deref(dev->name), sizeof(di_args->path)); + else di_args->path[0] = '\0'; - } out: rcu_read_unlock(); @@ -4143,7 +4147,7 @@ static void get_block_group_info(struct list_head *groups_list, static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, void __user *arg) { - struct btrfs_ioctl_space_args space_args; + struct btrfs_ioctl_space_args space_args = { 0 }; struct btrfs_ioctl_space_info space; struct btrfs_ioctl_space_info *dest; struct btrfs_ioctl_space_info *dest_orig; @@ -4339,6 +4343,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) if (IS_ERR(sa)) return PTR_ERR(sa); + if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS) { + ret = -EOPNOTSUPP; + goto out; + } + if (!(sa->flags & BTRFS_SCRUB_READONLY)) { ret = mnt_want_write_file(file); if (ret) @@ -4515,6 +4524,8 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) ipath->fspath->val[i] = rel_ptr; } + btrfs_free_path(path); + path = NULL; ret = copy_to_user((void __user *)(unsigned long)ipa->fspath, ipath->fspath, size); if (ret) { @@ -4585,21 +4596,20 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, size = min_t(u32, loi->size, SZ_16M); } - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - inodes = init_data_container(size); if (IS_ERR(inodes)) { ret = PTR_ERR(inodes); - inodes = NULL; - goto out; + goto out_loi; } + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } ret = iterate_inodes_from_logical(loi->logical, fs_info, path, build_ino_list, inodes, ignore_offset); + btrfs_free_path(path); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) @@ -4611,7 +4621,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, ret = -EFAULT; out: - btrfs_free_path(path); kvfree(inodes); out_loi: kfree(loi); @@ -4912,7 +4921,9 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) } /* update qgroup status and info */ + mutex_lock(&fs_info->qgroup_ioctl_lock); err = btrfs_run_qgroups(trans); + mutex_unlock(&fs_info->qgroup_ioctl_lock); if (err < 0) btrfs_handle_fs_error(fs_info, err, "failed to update qgroup status and info"); @@ -4954,6 +4965,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) goto out; } + if (sa->create && is_fstree(sa->qgroupid)) { + ret = -EINVAL; + goto out; + } + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -5508,7 +5524,7 @@ static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat) if (compat) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) - struct btrfs_ioctl_send_args_32 args32; + struct btrfs_ioctl_send_args_32 args32 = { 0 }; ret = copy_from_user(&args32, argp, sizeof(args32)); if (ret) diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index f4edadf1067f..cec6c283a691 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -109,10 +109,10 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) pr_cont("shared data backref parent %llu count %u\n", offset, btrfs_shared_data_ref_count(eb, sref)); /* - * offset is supposed to be a tree block which - * must be aligned to nodesize. + * Offset is supposed to be a tree block which must be + * aligned to sectorsize. 
*/ - if (!IS_ALIGNED(offset, eb->fs_info->nodesize)) + if (!IS_ALIGNED(offset, eb->fs_info->sectorsize)) pr_info( "\t\t\t(parent %llu not aligned to sectorsize %u)\n", offset, eb->fs_info->sectorsize); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5a3006c75d63..d6184b6206ff 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -504,6 +504,49 @@ out: return ret < 0 ? ret : 0; } +static u64 btrfs_qgroup_subvolid(u64 qgroupid) +{ + return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1)); +} + +/* + * Called in close_ctree() when quota is still enabled. This verifies we don't + * leak some reserved space. + * + * Return false if no reserved space is left. + * Return true if some reserved space is leaked. + */ +bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) +{ + struct rb_node *node; + bool ret = false; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return ret; + /* + * Since we're unmounting, there is no race and no need to grab qgroup + * lock. And here we don't go post-order to provide a more user + * friendly sorted result. + */ + for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) { + struct btrfs_qgroup *qgroup; + int i; + + qgroup = rb_entry(node, struct btrfs_qgroup, node); + for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) { + if (qgroup->rsv.values[i]) { + ret = true; + btrfs_warn(fs_info, + "qgroup %llu/%llu has unreleased space, type %d rsv %llu", + btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid), + i, qgroup->rsv.values[i]); + } + } + } + return ret; +} + /* * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), * first two are in single-threaded paths.And for the third one, we have set @@ -1075,6 +1118,21 @@ out_add_root: fs_info->qgroup_rescan_running = true; btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); + } else { + /* + * We have set both BTRFS_FS_QUOTA_ENABLED and + * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with + * -EINPROGRESS. That can happen because someone started the + * rescan worker by calling quota rescan ioctl before we + * attempted to initialize the rescan worker. Failure due to + * quotas disabled in the meanwhile is not possible, because + * we are holding a write lock on fs_info->subvol_sem, which + * is also acquired when disabling quotas. + * Ignore such error, and any other error would need to undo + * everything we did in the transaction we just committed. + */ + ASSERT(ret == -EINPROGRESS); + ret = 0; } out_free_path: @@ -1106,12 +1164,23 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) int ret = 0; /* - * We need to have subvol_sem write locked, to prevent races between - * concurrent tasks trying to disable quotas, because we will unlock - * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes. + * We need to have subvol_sem write locked to prevent races with + * snapshot creation. */ lockdep_assert_held_write(&fs_info->subvol_sem); + /* + * Lock the cleaner mutex to prevent races with concurrent relocation, + * because relocation may be building backrefs for blocks of the quota + * root while we are deleting the root. This is like dropping fs roots + * of deleted snapshots/subvolumes, we need the same protection. + * + * This also prevents races between concurrent tasks trying to disable + * quotas, because we will unlock and relock qgroup_ioctl_lock across + * BTRFS_FS_QUOTA_ENABLED changes. 
+ */ + mutex_lock(&fs_info->cleaner_mutex); + mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) goto out; @@ -1174,7 +1243,9 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) goto out; } + spin_lock(&fs_info->trans_lock); list_del("a_root->dirty_list); + spin_unlock(&fs_info->trans_lock); btrfs_tree_lock(quota_root->node); btrfs_clean_tree_block(quota_root->node); @@ -1191,6 +1262,7 @@ out: btrfs_end_transaction(trans); else if (trans) ret = btrfs_end_transaction(trans); + mutex_unlock(&fs_info->cleaner_mutex); return ret; } @@ -1323,7 +1395,6 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; @@ -1339,9 +1410,8 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, return -ENOMEM; mutex_lock(&fs_info->qgroup_ioctl_lock); - quota_root = fs_info->quota_root; - if (!quota_root) { - ret = -EINVAL; + if (!fs_info->quota_root) { + ret = -ENOTCONN; goto out; } member = find_qgroup_rb(fs_info, src); @@ -1387,7 +1457,6 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; @@ -1400,9 +1469,8 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, if (!tmp) return -ENOMEM; - quota_root = fs_info->quota_root; - if (!quota_root) { - ret = -EINVAL; + if (!fs_info->quota_root) { + ret = -ENOTCONN; goto out; } @@ -1467,11 +1535,11 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) int ret = 0; mutex_lock(&fs_info->qgroup_ioctl_lock); - quota_root = fs_info->quota_root; - if (!quota_root) { - ret = -EINVAL; + if (!fs_info->quota_root) { + ret = -ENOTCONN; goto out; } + quota_root = fs_info->quota_root; qgroup = find_qgroup_rb(fs_info, qgroupid); if (qgroup) { ret = -EEXIST; @@ -1493,18 +1561,25 @@ out: return ret; } +static bool qgroup_has_usage(struct btrfs_qgroup *qgroup) +{ + return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 || + qgroup->excl > 0 || qgroup->excl_cmpr > 0 || + qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 || + qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0); +} + int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; struct btrfs_qgroup_list *list; int ret = 0; mutex_lock(&fs_info->qgroup_ioctl_lock); - quota_root = fs_info->quota_root; - if (!quota_root) { - ret = -EINVAL; + if (!fs_info->quota_root) { + ret = -ENOTCONN; goto out; } @@ -1514,6 +1589,11 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) goto out; } + if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) { + ret = -EBUSY; + goto out; + } + /* Check if there are no children of this qgroup */ if (!list_empty(&qgroup->members)) { ret = -EBUSY; @@ -1545,7 +1625,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, struct btrfs_qgroup_limit *limit) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; int ret = 0; /* Sometimes we would want to clear the limit on this qgroup. 
@@ -1555,9 +1634,8 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, const u64 CLEAR_VALUE = -1; mutex_lock(&fs_info->qgroup_ioctl_lock); - quota_root = fs_info->quota_root; - if (!quota_root) { - ret = -EINVAL; + if (!fs_info->quota_root) { + ret = -ENOTCONN; goto out; } @@ -2657,15 +2735,23 @@ cleanup: } /* - * called from commit_transaction. Writes all changed qgroups to disk. + * Writes all changed qgroups to disk. + * Called by the transaction commit path and the qgroup assign ioctl. */ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root = fs_info->quota_root; int ret = 0; - if (!quota_root) + /* + * In case we are called from the qgroup assign ioctl, assert that we + * are holding the qgroup_ioctl_lock, otherwise we can race with a quota + * disable operation (ioctl) and access a freed quota root. + */ + if (trans->transaction->state != TRANS_STATE_COMMIT_DOING) + lockdep_assert_held(&fs_info->qgroup_ioctl_lock); + + if (!fs_info->quota_root) return ret; spin_lock(&fs_info->qgroup_lock); @@ -2809,14 +2895,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, dstgroup->rsv_rfer = inherit->lim.rsv_rfer; dstgroup->rsv_excl = inherit->lim.rsv_excl; - ret = update_qgroup_limit_item(trans, dstgroup); - if (ret) { - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - btrfs_info(fs_info, - "unable to update quota limit for %llu", - dstgroup->qgroupid); - goto unlock; - } + qgroup_dirty(fs_info, dstgroup); } if (srcid) { @@ -2935,7 +3014,6 @@ static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, enum btrfs_qgroup_rsv_type type) { - struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; struct btrfs_fs_info *fs_info = root->fs_info; u64 ref_root = root->root_key.objectid; @@ -2954,8 +3032,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, enforce = false; spin_lock(&fs_info->qgroup_lock); - quota_root = fs_info->quota_root; - if (!quota_root) + if (!fs_info->quota_root) goto out; qgroup = find_qgroup_rb(fs_info, ref_root); @@ -3022,7 +3099,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes, enum btrfs_qgroup_rsv_type type) { - struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; struct ulist_node *unode; struct ulist_iterator uiter; @@ -3040,8 +3116,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, } spin_lock(&fs_info->qgroup_lock); - quota_root = fs_info->quota_root; - if (!quota_root) + if (!fs_info->quota_root) goto out; qgroup = find_qgroup_rb(fs_info, ref_root); @@ -3188,7 +3263,8 @@ out: static bool rescan_should_stop(struct btrfs_fs_info *fs_info) { return btrfs_fs_closing(fs_info) || - test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) || + !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); } static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) @@ -3200,6 +3276,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) int err = -ENOMEM; int ret = 0; bool stopped = false; + bool did_leaf_rescans = false; path = btrfs_alloc_path(); if (!path) @@ -3218,11 +3295,10 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) err = PTR_ERR(trans); break; } - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { - err = -EINTR; - } else { - err = 
qgroup_rescan_leaf(trans, path); - } + + err = qgroup_rescan_leaf(trans, path); + did_leaf_rescans = true; + if (err > 0) btrfs_commit_transaction(trans); else @@ -3236,22 +3312,29 @@ out: if (err > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - } else if (err < 0) { + } else if (err < 0 || stopped) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } mutex_unlock(&fs_info->qgroup_rescan_lock); /* - * only update status, since the previous part has already updated the - * qgroup info. + * Only update status, since the previous part has already updated the + * qgroup info, and only if we did any actual work. This also prevents + * race with a concurrent quota disable, which has already set + * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at + * btrfs_quota_disable(). */ - trans = btrfs_start_transaction(fs_info->quota_root, 1); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); + if (did_leaf_rescans) { + trans = btrfs_start_transaction(fs_info->quota_root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + trans = NULL; + btrfs_err(fs_info, + "fail to start transaction for status update: %d", + err); + } + } else { trans = NULL; - btrfs_err(fs_info, - "fail to start transaction for status update: %d", - err); } mutex_lock(&fs_info->qgroup_rescan_lock); @@ -3924,7 +4007,6 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, int num_bytes) { - struct btrfs_root *quota_root = fs_info->quota_root; struct btrfs_qgroup *qgroup; struct ulist_node *unode; struct ulist_iterator uiter; @@ -3932,7 +4014,7 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, if (num_bytes == 0) return; - if (!quota_root) + if (!fs_info->quota_root) return; spin_lock(&fs_info->qgroup_lock); @@ -3979,6 +4061,8 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) BTRFS_QGROUP_RSV_META_PREALLOC); trace_qgroup_meta_convert(root, num_bytes); qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes); + if (!sb_rdonly(fs_info->sb)) + add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); } /* @@ -4267,4 +4351,5 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) ulist_free(entry->old_roots); kfree(entry); } + *root = RB_ROOT; } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 0a2659685ad6..94bdfb89505e 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -416,5 +416,6 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb); void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); +bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 7ac679ed2b6c..226a17a335da 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -334,6 +334,9 @@ static void merge_rbio(struct btrfs_raid_bio *dest, { bio_list_merge(&dest->bio_list, &victim->bio_list); dest->bio_list_bytes += victim->bio_list_bytes; + /* Also inherit the bitmaps from @victim. 
@@ -878,6 +881,12 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
 if (rbio->generic_bio_cnt)
 btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
+ /*
+ * Clear the data bitmap, as the rbio may be cached for later usage.
+ * Do this before unlock_stripe() so there will be no new bio
+ * for this rbio.
+ */
+ bitmap_clear(rbio->dbitmap, 0, rbio->stripe_npages);
 /*
 * At this moment, rbio->bio_list is empty, however since rbio does not
@@ -1212,6 +1221,9 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 else
 BUG();
+ /* We should have at least one data sector. */
+ ASSERT(bitmap_weight(rbio->dbitmap, rbio->stripe_npages));
+
 /* at this point we either have a full stripe,
 * or we've read the full stripe from the drive.
 * recalculate the parity and write the new results.
@@ -1285,6 +1297,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
 struct page *page;
+
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(pagenr, rbio->dbitmap))
+ continue;
+
 if (stripe < rbio->nr_data) {
 page = page_in_rbio(rbio, stripe, pagenr, 1);
 if (!page)
@@ -1309,6 +1326,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
 struct page *page;
+
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(pagenr, rbio->dbitmap))
+ continue;
+
 if (stripe < rbio->nr_data) {
 page = page_in_rbio(rbio, stripe, pagenr, 1);
 if (!page)
@@ -1748,6 +1770,33 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
 run_plug(plug);
 }
+/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
+static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
+{
+ const struct btrfs_fs_info *fs_info = rbio->fs_info;
+ const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ const u64 full_stripe_start = rbio->bbio->raid_map[0];
+ const u32 orig_len = orig_bio->bi_iter.bi_size;
+ const u32 sectorsize = fs_info->sectorsize;
+ u64 cur_logical;
+
+ ASSERT(orig_logical >= full_stripe_start &&
+ orig_logical + orig_len <= full_stripe_start +
+ rbio->nr_data * rbio->stripe_len);
+
+ bio_list_add(&rbio->bio_list, orig_bio);
+ rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
+
+ /* Update the dbitmap. */
+ for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
+ cur_logical += sectorsize) {
+ int bit = ((u32)(cur_logical - full_stripe_start) >>
+ PAGE_SHIFT) % rbio->stripe_npages;
+
+ set_bit(bit, rbio->dbitmap);
+ }
+}
+
 /*
 * our main entry point for writes from the rest of the FS.
 */
@@ -1764,9 +1813,8 @@ int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
 btrfs_put_bbio(bbio);
 return PTR_ERR(rbio);
 }
- bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_iter.bi_size;
 rbio->operation = BTRFS_RBIO_WRITE;
+ rbio_add_bio(rbio, bio);
 btrfs_bio_counter_inc_noblocked(fs_info);
 rbio->generic_bio_cnt = 1;
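
rbio_add_bio() above maps every sector of the incoming bio to a bit inside the full stripe: take the byte offset from the stripe start, shift it down to a page number, then reduce it modulo the pages per stripe. A standalone sketch of the same arithmetic with hypothetical geometry (4 KiB pages and sectors, 16 pages per stripe, a made-up stripe start):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical geometry: 4 KiB pages/sectors, 16 pages per stripe. */
	const unsigned int page_shift = 12;
	const unsigned int stripe_npages = 16;
	const uint32_t sectorsize = 4096;
	const uint64_t full_stripe_start = 1048576;	/* made-up raid_map[0] */

	/* A made-up 16 KiB bio starting 8 KiB into the full stripe. */
	const uint64_t orig_logical = full_stripe_start + 8192;
	const uint32_t orig_len = 16384;
	uint64_t cur;

	for (cur = orig_logical; cur < orig_logical + orig_len; cur += sectorsize) {
		unsigned int bit = ((uint32_t)(cur - full_stripe_start) >>
				    page_shift) % stripe_npages;

		printf("logical %llu -> dbitmap bit %u\n",
		       (unsigned long long)cur, bit);
	}
	return 0;
}
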
@@ -2068,9 +2116,12 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
 atomic_set(&rbio->error, 0);
 /*
- * read everything that hasn't failed. Thanks to the
- * stripe cache, it is possible that some or all of these
- * pages are going to be uptodate.
+ * Read everything that hasn't failed. However, this time we will
+ * not trust any cached sector: we may read out stale data that the
+ * higher layer is not going to consume.
+ *
+ * So here we always re-read everything in the recovery path.
 */
 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
 if (rbio->faila == stripe || rbio->failb == stripe) {
 atomic_inc(&rbio->error);
 continue;
 }
 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *p;
-
- /*
- * the rmw code may have already read this
- * page in
- */
- p = rbio_stripe_page(rbio, stripe, pagenr);
- if (PageUptodate(p))
- continue;
-
 ret = rbio_add_io_page(rbio, &bio_list,
 rbio_stripe_page(rbio, stripe, pagenr),
 stripe, pagenr, rbio->stripe_len);
@@ -2170,8 +2211,7 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
 }
 rbio->operation = BTRFS_RBIO_READ_REBUILD;
- bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_iter.bi_size;
+ rbio_add_bio(rbio, bio);
 rbio->faila = find_logical_bio_stripe(rbio, bio);
 if (rbio->faila == -1) {
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
index a97dc74a4d3d..02f15321cecc 100644
--- a/fs/btrfs/rcu-string.h
+++ b/fs/btrfs/rcu-string.h
@@ -18,7 +18,11 @@ static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask)
 (len * sizeof(char)), mask);
 if (!ret)
 return ret;
- strncpy(ret->str, src, len);
+ /* Warn if the source got unexpectedly truncated. */
+ if (WARN_ON(strscpy(ret->str, src, len) < 0)) {
+ kfree(ret);
+ return NULL;
+ }
 return ret;
 }
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index bbd63535965c..d59e89ef3251 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -888,8 +888,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
 out_unlock:
 spin_unlock(&fs_info->ref_verify_lock);
 out:
- if (ret)
+ if (ret) {
+ btrfs_free_ref_cache(fs_info);
 btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+ }
 return ret;
 }
@@ -1018,8 +1020,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
 }
 }
 if (ret) {
- btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
 btrfs_free_ref_cache(fs_info);
+ btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
 }
 btrfs_free_path(path);
 return ret;
 }
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ba68b0b41dff..e603cc8c141e 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2511,7 +2511,7 @@ again:
 list_splice(&reloc_roots, &rc->reloc_roots);
 if (!err)
- btrfs_commit_transaction(trans);
+ err = btrfs_commit_transaction(trans);
 else
 btrfs_end_transaction(trans);
 return err;
@@ -4102,8 +4102,12 @@ int prepare_to_relocate(struct reloc_control *rc)
 */
 return PTR_ERR(trans);
 }
- btrfs_commit_transaction(trans);
- return 0;
+
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ unset_reloc_control(rc);
+
+ return ret;
 }
 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
@@ -4263,7 +4267,9 @@ restart:
 err = PTR_ERR(trans);
 goto out_free;
 }
- btrfs_commit_transaction(trans);
+ ret = btrfs_commit_transaction(trans);
+ if (ret && !err)
+ err = ret;
 out_free:
 ret = clean_dirty_subvols(rc);
 if (ret < 0 && !err)
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 0d07ebe511e7..ba4e198811a4 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -371,9 +371,10 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
 key.offset = ref_id;
again:
 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- if (ret < 0)
+ if (ret < 0) {
+ err = ret;
 goto out;
- if (ret == 0) {
+ } else if (ret == 0) {
 leaf = path->nodes[0];
 ref = btrfs_item_ptr(leaf, path->slots[0],
 struct btrfs_root_ref);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index e5db948daa12..45809f75692e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3849,6 +3849,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 int ret;
 struct btrfs_device *dev;
 unsigned int nofs_flag;
+ bool need_commit = false;
 if (btrfs_fs_closing(fs_info))
 return -EAGAIN;
@@ -3961,6 +3962,12 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 */
 nofs_flag = memalloc_nofs_save();
 if (!is_dev_replace) {
+ u64 old_super_errors;
+
+ spin_lock(&sctx->stat_lock);
+ old_super_errors = sctx->stat.super_errors;
+ spin_unlock(&sctx->stat_lock);
+
 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
 /*
 * by holding device list mutex, we can
@@ -3969,6 +3976,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 mutex_lock(&fs_info->fs_devices->device_list_mutex);
 ret = scrub_supers(sctx, dev);
 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ spin_lock(&sctx->stat_lock);
+ /*
+ * Super block errors were found, but we cannot commit a
+ * transaction in the current context, since
+ * btrfs_commit_transaction() needs to pause the currently
+ * running scrub (held by ourselves).
+ */
+ if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
+ need_commit = true;
+ spin_unlock(&sctx->stat_lock);
 }
 if (!ret)
@@ -3995,6 +4012,25 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 scrub_workers_put(fs_info);
 scrub_put_ctx(sctx);
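
The need_commit logic above snapshots sctx->stat.super_errors under the stat lock before scrub_supers() runs and compares afterwards, deferring the commit until scrub is torn down. A hedged userspace sketch of that snapshot-and-compare pattern (all names are invented; a pthread mutex stands in for the spinlock):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Invented stand-in for the scrub stats protected by sctx->stat_lock. */
struct scrub_stats {
	pthread_mutex_t lock;
	unsigned long long super_errors;
};

static void fake_scrub_supers(struct scrub_stats *st)
{
	pthread_mutex_lock(&st->lock);
	st->super_errors++;	/* pretend one bad super block copy was found */
	pthread_mutex_unlock(&st->lock);
}

int main(void)
{
	struct scrub_stats st = { PTHREAD_MUTEX_INITIALIZER, 0 };
	unsigned long long old_super_errors;
	bool need_commit;

	pthread_mutex_lock(&st.lock);
	old_super_errors = st.super_errors;	/* snapshot before the work */
	pthread_mutex_unlock(&st.lock);

	fake_scrub_supers(&st);

	pthread_mutex_lock(&st.lock);
	need_commit = st.super_errors > old_super_errors;
	pthread_mutex_unlock(&st.lock);

	printf("need commit: %s\n", need_commit ? "yes" : "no");
	return 0;
}
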
+
+ /*
+ * We found some super block errors before, now try to force a
+ * transaction commit, as scrub has finished.
+ */
+ if (need_commit) {
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_err(fs_info,
+ "scrub: failed to start transaction to fix super block errors: %d", ret);
+ return ret;
+ }
+ ret = btrfs_commit_transaction(trans);
+ if (ret < 0)
+ btrfs_err(fs_info,
+ "scrub: failed to commit transaction to fix super block errors: %d", ret);
+ }
 return ret;
 out:
 scrub_workers_put(fs_info);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e258fc484cea..576c027909f8 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -973,7 +973,15 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
 ret = PTR_ERR(start);
 goto out;
 }
- BUG_ON(start < p->buf);
+ if (unlikely(start < p->buf)) {
+ btrfs_err(root->fs_info,
+ "send: path ref buffer underflow for key (%llu %u %llu)",
+ found_key->objectid,
+ found_key->type,
+ found_key->offset);
+ ret = -EINVAL;
+ goto out;
+ }
 }
 p->start = start;
 } else {
@@ -5405,6 +5413,7 @@ static int clone_range(struct send_ctx *sctx,
 u64 ext_len;
 u64 clone_len;
 u64 clone_data_offset;
+ bool crossed_src_i_size = false;
 if (slot >= btrfs_header_nritems(leaf)) {
 ret = btrfs_next_leaf(clone_root->root, path);
@@ -5461,8 +5470,10 @@ static int clone_range(struct send_ctx *sctx,
 if (key.offset >= clone_src_i_size)
 break;
- if (key.offset + ext_len > clone_src_i_size)
+ if (key.offset + ext_len > clone_src_i_size) {
 ext_len = clone_src_i_size - key.offset;
+ crossed_src_i_size = true;
+ }
 clone_data_offset = btrfs_file_extent_offset(leaf, ei);
 if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
@@ -5522,6 +5533,25 @@ static int clone_range(struct send_ctx *sctx,
 ret = send_clone(sctx, offset, clone_len, clone_root);
 }
+ } else if (crossed_src_i_size && clone_len < len) {
+ /*
+ * If we are at i_size of the clone source inode and we
+ * cannot clone from it, terminate the loop. This is
+ * to avoid sending two write operations, one with a
+ * length matching clone_len and the final one after
+ * this loop with a length of len - clone_len.
+ *
+ * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
+ * was passed to the send ioctl), this helps avoid
+ * sending an encoded write for an offset that is not
+ * sector size aligned, in case the i_size of the source
+ * inode is not sector size aligned. That will make the
+ * receiver fall back to decompression of the data and
+ * writing it using regular buffered IO, therefore while
+ * not incorrect, it's not optimal due to decompression
+ * and possible re-compression at the receiver.
+ */
+ break;
 } else {
 ret = send_extent_data(sctx, offset, clone_len);
 }
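
The bound applied in the next hunk serves two purposes: it caps the clone-sources allocation at 8 MiB and it guarantees the later (count + 1) * sizeof(...) computation cannot wrap an unsigned long. A sketch of the pattern in plain C (the struct layout is an illustrative stand-in, not the real struct clone_root):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Illustrative stand-in; the real struct clone_root layout differs. */
struct clone_root_stub {
	uint64_t root;
	uint64_t ino;
	uint64_t offset;
	uint64_t found_refs;
};

#define SZ_8M	(8UL * 1024 * 1024)

int main(void)
{
	const uint64_t counts[] = {
		1000,
		SZ_8M / sizeof(struct clone_root_stub),	/* exactly at the cap */
		UINT64_MAX / 8,				/* would overflow naive math */
	};

	for (size_t i = 0; i < sizeof(counts) / sizeof(counts[0]); i++) {
		uint64_t count = counts[i];

		if (count > SZ_8M / sizeof(struct clone_root_stub)) {
			printf("count %llu: rejected (EINVAL)\n",
			       (unsigned long long)count);
			continue;
		}
		/* Bounded above, so the multiplication cannot wrap. */
		size_t alloc_size = (size_t)(count + 1) *
				    sizeof(struct clone_root_stub);
		printf("count %llu: alloc %zu bytes\n",
		       (unsigned long long)count, alloc_size);
	}
	return 0;
}
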
@@ -7325,10 +7355,10 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
 /*
 * Check that we don't overflow at later allocations, we request
 * clone_sources_count + 1 items, and compare to unsigned long inside
- * access_ok.
+ * access_ok. Also set an upper limit for allocation size so this can't
+ * easily exhaust memory. Max number of clone sources is about 200K.
 */
- if (arg->clone_sources_count >
- ULONG_MAX / sizeof(struct clone_root) - 1) {
+ if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) {
 ret = -EINVAL;
 goto out;
 }
@@ -7341,7 +7371,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
 }
 if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
- ret = -EINVAL;
+ ret = -EOPNOTSUPP;
 goto out;
 }
@@ -7359,7 +7389,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
 sctx->flags = arg->flags;
 sctx->send_filp = fget(arg->send_fd);
- if (!sctx->send_filp) {
+ if (!sctx->send_filp || !(sctx->send_filp->f_mode & FMODE_WRITE)) {
 ret = -EBADF;
 goto out;
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1a69bdb96fb2..ea8b5b2d859d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1567,8 +1567,6 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
 } else {
 snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
 btrfs_sb(s)->bdev_holder = fs_type;
- if (!strstr(crc32c_impl(), "generic"))
- set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
 error = btrfs_fill_super(s, fs_devices, data);
 }
 if (!error)
@@ -2137,7 +2135,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 * calculated f_bavail.
 */
 if (!mixed && block_rsv->space_info->full &&
- total_free_meta - thresh < block_rsv->size)
+ (total_free_meta < thresh || total_free_meta - thresh < block_rsv->size))
 buf->f_bavail = 0;
 buf->f_type = BTRFS_SUPER_MAGIC;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 5c299e1f2297..356f2274c8eb 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1154,8 +1154,11 @@ int __init btrfs_init_sysfs(void)
 #ifdef CONFIG_BTRFS_DEBUG
 ret = sysfs_create_group(&btrfs_kset->kobj,
 &btrfs_debug_feature_attr_group);
- if (ret)
- goto out2;
+ if (ret) {
+ sysfs_unmerge_group(&btrfs_kset->kobj,
+ &btrfs_static_feature_attr_group);
+ goto out_remove_group;
+ }
 #endif
 return 0;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 98f9684e7ffc..8e413c82190e 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -189,7 +189,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
 void btrfs_free_dummy_root(struct btrfs_root *root)
 {
- if (!root)
+ if (IS_ERR_OR_NULL(root))
 return;
 /* Will be freed by btrfs_free_fs_roots */
 if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index ac035a6fa003..f6169bece7c0 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -230,21 +230,21 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
 false);
 if (ret) {
- ulist_free(old_roots);
 test_err("couldn't find old roots: %d", ret);
 return ret;
 }
 ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
 BTRFS_FS_TREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
 return ret;
+ }
 ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
 false);
 if (ret) {
 ulist_free(old_roots);
- ulist_free(new_roots);
 test_err("couldn't find old roots: %d", ret);
 return ret;
 }
@@ -256,31 +256,33 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
 return ret;
 }
+ /* btrfs_qgroup_account_extent() always frees the ulists passed to it.
*/ + old_roots = NULL; + new_roots = NULL; + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, nodesize, nodesize)) { test_err("qgroup counts didn't match expected values"); return -EINVAL; } - old_roots = NULL; - new_roots = NULL; ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } ret = remove_extent_item(root, nodesize, nodesize); - if (ret) + if (ret) { + ulist_free(old_roots); return -EINVAL; + } ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } @@ -331,21 +333,21 @@ static int test_multiple_refs(struct btrfs_root *root, ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, BTRFS_FS_TREE_OBJECTID); - if (ret) + if (ret) { + ulist_free(old_roots); return ret; + } ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } @@ -366,21 +368,21 @@ static int test_multiple_refs(struct btrfs_root *root, ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } ret = add_tree_ref(root, nodesize, nodesize, 0, BTRFS_FIRST_FREE_OBJECTID); - if (ret) + if (ret) { + ulist_free(old_roots); return ret; + } ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } @@ -407,21 +409,21 @@ static int test_multiple_refs(struct btrfs_root *root, ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { - ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } ret = remove_extent_ref(root, nodesize, nodesize, 0, BTRFS_FIRST_FREE_OBJECTID); - if (ret) + if (ret) { + ulist_free(old_roots); return ret; + } ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); - ulist_free(new_roots); test_err("couldn't find old roots: %d", ret); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e6cb95b81787..89ffc0255406 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -190,10 +190,11 @@ loop: spin_unlock(&fs_info->trans_lock); /* - * If we are ATTACH, we just want to catch the current transaction, - * and commit it. If there is no transaction, just return ENOENT. + * If we are ATTACH or TRANS_JOIN_NOSTART, we just want to catch the + * current transaction, and commit it. If there is no transaction, just + * return ENOENT. 
*/ - if (type == TRANS_ATTACH) + if (type == TRANS_ATTACH || type == TRANS_JOIN_NOSTART) return -ENOENT; /* @@ -706,8 +707,13 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) trans = start_transaction(root, 0, TRANS_ATTACH, BTRFS_RESERVE_NO_FLUSH, true); - if (trans == ERR_PTR(-ENOENT)) - btrfs_wait_for_commit(root->fs_info, 0); + if (trans == ERR_PTR(-ENOENT)) { + int ret; + + ret = btrfs_wait_for_commit(root->fs_info, 0); + if (ret) + return ERR_PTR(ret); + } return trans; } @@ -771,6 +777,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) } wait_for_commit(cur_trans); + ret = cur_trans->aborted; btrfs_put_transaction(cur_trans); out: return ret; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 368c43c6cbd0..597362eaf300 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1019,7 +1019,8 @@ static void extent_err(const struct extent_buffer *eb, int slot, } static int check_extent_item(struct extent_buffer *leaf, - struct btrfs_key *key, int slot) + struct btrfs_key *key, int slot, + struct btrfs_key *prev_key) { struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_extent_item *ei; @@ -1164,7 +1165,7 @@ static int check_extent_item(struct extent_buffer *leaf, if (ptr + btrfs_extent_inline_ref_size(inline_type) > end) { extent_err(leaf, slot, "inline ref item overflows extent item, ptr %lu iref size %u end %lu", - ptr, inline_type, end); + ptr, btrfs_extent_inline_ref_size(inline_type), end); return -EUCLEAN; } @@ -1230,6 +1231,26 @@ static int check_extent_item(struct extent_buffer *leaf, total_refs, inline_refs); return -EUCLEAN; } + + if ((prev_key->type == BTRFS_EXTENT_ITEM_KEY) || + (prev_key->type == BTRFS_METADATA_ITEM_KEY)) { + u64 prev_end = prev_key->objectid; + + if (prev_key->type == BTRFS_METADATA_ITEM_KEY) + prev_end += fs_info->nodesize; + else + prev_end += prev_key->offset; + + if (unlikely(prev_end > key->objectid)) { + extent_err(leaf, slot, + "previous extent [%llu %u %llu] overlaps current extent [%llu %u %llu]", + prev_key->objectid, prev_key->type, + prev_key->offset, key->objectid, key->type, + key->offset); + return -EUCLEAN; + } + } + return 0; } @@ -1343,7 +1364,7 @@ static int check_leaf_item(struct extent_buffer *leaf, break; case BTRFS_EXTENT_ITEM_KEY: case BTRFS_METADATA_ITEM_KEY: - ret = check_extent_item(leaf, key, slot); + ret = check_extent_item(leaf, key, slot, prev_key); break; case BTRFS_TREE_BLOCK_REF_KEY: case BTRFS_SHARED_DATA_REF_KEY: diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9b703c0db979..f75333d7b78a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -918,8 +918,7 @@ static noinline int inode_in_dir(struct btrfs_root *root, di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, index, name, name_len, 0); if (IS_ERR(di)) { - if (PTR_ERR(di) != -ENOENT) - ret = PTR_ERR(di); + ret = PTR_ERR(di); goto out; } else if (di) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); @@ -1100,7 +1099,9 @@ again: extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, inode_objectid, parent_objectid, 0, 0); - if (!IS_ERR_OR_NULL(extref)) { + if (IS_ERR(extref)) { + return PTR_ERR(extref); + } else if (extref) { u32 item_size; u32 cur_offset = 0; unsigned long base; @@ -1169,8 +1170,7 @@ next: di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), ref_index, name, namelen, 0); if (IS_ERR(di)) { - if (PTR_ERR(di) != -ENOENT) - return PTR_ERR(di); + return PTR_ERR(di); } else if (di) { ret = drop_one_dir_item(trans, root, 
path, dir, di);
 if (ret)
@@ -2020,9 +2020,6 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 goto out;
 }
- if (dst_di == ERR_PTR(-ENOENT))
- dst_di = NULL;
-
 if (IS_ERR(dst_di)) {
 ret = PTR_ERR(dst_di);
 goto out;
@@ -2307,7 +2304,7 @@ again:
 dir_key->offset, name, name_len, 0);
 }
- if (!log_di || log_di == ERR_PTR(-ENOENT)) {
+ if (!log_di) {
 btrfs_dir_item_key_to_cpu(eb, di, &location);
 btrfs_release_path(path);
 btrfs_release_path(log_path);
@@ -3520,8 +3517,7 @@ out_unlock:
 if (err == -ENOSPC) {
 btrfs_set_log_full_commit(trans);
 err = 0;
- } else if (err < 0 && err != -ENOENT) {
- /* ENOENT can be returned if the entry hasn't been fsynced yet */
+ } else if (err < 0) {
 btrfs_abort_transaction(trans, err);
 }
@@ -4287,7 +4283,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
 struct extent_buffer *leaf;
 int slot;
 int ins_nr = 0;
- int start_slot;
+ int start_slot = 0;
 int ret;
 if (!(inode->flags & BTRFS_INODE_PREALLOC))
@@ -5295,6 +5291,18 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 }
 /*
+ * For symlinks, we must always log their content, which is stored in an
+ * inline extent, otherwise we could end up with an empty symlink after
+ * log replay, which is invalid on Linux (symlink(2) returns -ENOENT if
+ * one attempts to create an empty symlink).
+ * We don't need to worry about flushing delalloc, because we create the
+ * inline extent when the symlink is created (we never have delalloc for
+ * symlinks).
+ */
+ if (S_ISLNK(inode->vfs_inode.i_mode))
+ inode_only = LOG_INODE_ALL;
+
+ /*
 * a brute force approach to making sure we get the most uptodate
 * copies of everything.
 */
@@ -5707,7 +5715,7 @@ process_leaf:
 }
 ctx->log_new_dentries = false;
- if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
+ if (type == BTRFS_FT_DIR)
 log_mode = LOG_INODE_ALL;
 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
 log_mode, 0, LLONG_MAX, ctx);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8898682c9103..d7014b2b28d6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -354,6 +354,7 @@ void btrfs_free_device(struct btrfs_device *device)
 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
 {
 struct btrfs_device *device;
+ WARN_ON(fs_devices->opened);
 while (!list_empty(&fs_devices->devices)) {
 device = list_entry(fs_devices->devices.next,
@@ -713,15 +714,47 @@ static void pending_bios_fn(struct btrfs_work *work)
 {
 run_scheduled_bios(device);
 }
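
The device_matched() rewrite below compares the block devices that the stored name and the given path resolve to, instead of strcmp()ing the path strings, so two spellings of the same device still match. A userspace analogue using stat(2), with st_rdev standing in for the looked-up block device (illustrative only):

#include <stdio.h>
#include <stdbool.h>
#include <sys/stat.h>

/* Compare two paths by the block device they resolve to, not by string. */
static bool same_block_device(const char *a, const char *b)
{
	struct stat sa, sb;

	if (stat(a, &sa) != 0 || stat(b, &sb) != 0)
		return false;	/* on error, report "not the same device" */
	if (!S_ISBLK(sa.st_mode) || !S_ISBLK(sb.st_mode))
		return false;
	return sa.st_rdev == sb.st_rdev;
}

int main(int argc, char **argv)
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <path1> <path2>\n", argv[0]);
		return 1;
	}
	printf("%s\n", same_block_device(argv[1], argv[2]) ?
	       "same device" : "different devices");
	return 0;
}
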
-static bool device_path_matched(const char *path, struct btrfs_device *device)
+/*
+ * Check if the device at @path matches the given @device.
+ *
+ * Returns:
+ * true If it is the same device.
+ * false If it is not the same device or on error.
+ */
+static bool device_matched(const struct btrfs_device *device, const char *path)
 {
- int found;
+ char *device_name;
+ struct block_device *bdev_old;
+ struct block_device *bdev_new;
+
+ /*
+ * If we are looking for a device with a matching dev_t, skip devices
+ * without a name (missing devices).
+ */
+ if (!device->name)
+ return false;
+
+ device_name = kzalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
+ if (!device_name)
+ return false;
 rcu_read_lock();
- found = strcmp(rcu_str_deref(device->name), path);
+ scnprintf(device_name, BTRFS_PATH_NAME_MAX, "%s", rcu_str_deref(device->name));
 rcu_read_unlock();
- return found == 0;
+ bdev_old = lookup_bdev(device_name);
+ kfree(device_name);
+ if (IS_ERR(bdev_old))
+ return false;
+
+ bdev_new = lookup_bdev(path);
+ if (IS_ERR(bdev_new))
+ return false;
+
+ if (bdev_old == bdev_new)
+ return true;
+
+ return false;
 }
 /*
@@ -754,9 +787,7 @@ static int btrfs_free_stale_devices(const char *path,
 &fs_devices->devices, dev_list) {
 if (skip_device && skip_device == device)
 continue;
- if (path && !device->name)
- continue;
- if (path && !device_path_matched(path, device))
+ if (path && !device_matched(device, path))
 continue;
 if (fs_devices->opened) {
 /* for an already deleted device return 0 */
@@ -864,6 +895,14 @@ error_brelse:
 return -EINVAL;
 }
+u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb)
+{
+ bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) &
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
+
+ return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
+}
+
 /*
 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
 * being created with a disk that has already completed its fsid change.
 */
@@ -1371,6 +1410,17 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 if (!fs_devices->opened) {
 seed_devices = fs_devices->seed;
 fs_devices->seed = NULL;
+
+ /*
+ * If the struct btrfs_fs_devices is not assembled with any
+ * other device, it can be re-initialized during the next mount
+ * without needing the device-scan step. Therefore, it can be
+ * fully freed.
+ */
+ if (fs_devices->num_devices == 1) {
+ list_del(&fs_devices->fs_list);
+ free_fs_devices(fs_devices);
+ }
 }
 mutex_unlock(&uuid_mutex);
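
The btrfs_scan_one_device() change below drops FMODE_EXCL because a scan only ever reads the super block to learn which filesystem a device belongs to. A hedged userspace sketch that reads the primary super block copy at 64 KiB and prints the fsid; the magic-at-byte-64 and fsid-at-byte-32 offsets follow my reading of the on-disk format and should be treated as assumptions:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#define SUPER_OFFSET	65536	/* primary btrfs super block copy */
#define MAGIC_OFFSET	64	/* "_BHRfS_M" inside the super block (assumed) */
#define FSID_OFFSET	32	/* fsid inside the super block (assumed) */

int main(int argc, char **argv)
{
	unsigned char sb[4096];
	int fd, i;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <device>\n", argv[0]);
		return 1;
	}
	/* Read-only, non-exclusive open: a concurrent mount is not blocked. */
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;
	if (pread(fd, sb, sizeof(sb), SUPER_OFFSET) != (ssize_t)sizeof(sb)) {
		close(fd);
		return 1;
	}
	close(fd);

	if (memcmp(sb + MAGIC_OFFSET, "_BHRfS_M", 8) != 0) {
		fprintf(stderr, "not a btrfs super block\n");
		return 1;
	}
	printf("fsid: ");
	for (i = 0; i < 16; i++)
		printf("%02x", sb[FSID_OFFSET + i]);
	printf("\n");
	return 0;
}
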
@@ -1537,8 +1587,17 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
 */
 bytenr = btrfs_sb_offset(0);
- flags |= FMODE_EXCL;
+ /*
+ * Avoid using flags |= FMODE_EXCL here, as systemd-udev may initiate
+ * a device scan that races with the user's mount or mkfs command,
+ * resulting in failure.
+ * Since the device scan is solely for reading purposes, there is no
+ * need for FMODE_EXCL. Additionally, the devices are read again during
+ * the mount process. It is OK to get some inconsistent values
+ * temporarily, as the device paths of the fsid are the only required
+ * information for assembling the volume.
+ */
 bdev = blkdev_get_by_path(path, flags, holder);
 if (IS_ERR(bdev))
 return ERR_CAST(bdev);
@@ -1579,7 +1638,7 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
 if (in_range(physical_start, *start, len) ||
 in_range(*start, physical_start,
- physical_end - physical_start)) {
+ physical_end + 1 - physical_start)) {
 *start = physical_end + 1;
 return true;
 }
@@ -1671,7 +1730,7 @@ again:
 goto out;
 }
- while (1) {
+ while (search_start < search_end) {
 l = path->nodes[0];
 slot = path->slots[0];
 if (slot >= btrfs_header_nritems(l)) {
@@ -1694,6 +1753,9 @@ again:
 if (key.type != BTRFS_DEV_EXTENT_KEY)
 goto next;
+ if (key.offset > search_end)
+ break;
+
 if (key.offset > search_start) {
 hole_size = key.offset - search_start;
@@ -1764,6 +1826,7 @@ next:
 else
 ret = 0;
+ ASSERT(max_hole_start + max_hole_size <= search_end);
 out:
 btrfs_free_path(path);
 *start = max_hole_start;
@@ -3027,15 +3090,16 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
 read_unlock(&em_tree->lock);
 if (!em) {
- btrfs_crit(fs_info, "unable to find logical %llu length %llu",
+ btrfs_crit(fs_info,
+ "unable to find chunk map for logical %llu length %llu",
 logical, length);
 return ERR_PTR(-EINVAL);
 }
- if (em->start > logical || em->start + em->len < logical) {
+ if (em->start > logical || em->start + em->len <= logical) {
 btrfs_crit(fs_info,
- "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
- logical, length, em->start, em->start + em->len);
+ "found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
+ logical, logical + length, em->start, em->start + em->len);
 free_extent_map(em);
 return ERR_PTR(-EINVAL);
 }
@@ -3204,7 +3268,17 @@ again:
 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 goto error;
 }
- BUG_ON(ret == 0); /* Corruption */
+ if (ret == 0) {
+ /*
+ * On the first search we would find chunk tree with
+ * offset -1, which is not possible. On subsequent
+ * loops this would find an existing item on an invalid
+ * offset (one less than the previous one, wrong
+ * alignment and size).
+ */
+ ret = -EUCLEAN;
+ goto error;
+ }
 ret = btrfs_previous_item(chunk_root, path, key.objectid,
 key.type);
@@ -4503,8 +4577,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
 }
 }
- BUG_ON(fs_info->balance_ctl ||
- test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
+ ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
 atomic_dec(&fs_info->balance_cancel_req);
 mutex_unlock(&fs_info->balance_mutex);
 return 0;
@@ -7383,12 +7456,12 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
 * do another round of validation checks.
*/ if (total_dev != fs_info->fs_devices->total_devices) { - btrfs_err(fs_info, - "super_num_devices %llu mismatch with num_devices %llu found here", + btrfs_warn(fs_info, +"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit", btrfs_super_num_devices(fs_info->super_copy), total_dev); - ret = -EINVAL; - goto error; + fs_info->fs_devices->total_devices = total_dev; + btrfs_set_super_num_devices(fs_info->super_copy, total_dev); } if (btrfs_super_total_bytes(fs_info->super_copy) < fs_info->fs_devices->total_rw_bytes) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index aa6a6d7b2978..762c0a375498 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -581,4 +581,7 @@ int btrfs_bg_type_to_factor(u64 flags); const char *btrfs_bg_type_to_raid_name(u64 flags); int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); +bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); +u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb); + #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 48858510739b..cd7ddf24157a 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -387,6 +387,9 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler, const char *name, const void *buffer, size_t size, int flags) { + if (btrfs_root_readonly(BTRFS_I(inode)->root)) + return -EROFS; + name = xattr_full_name(handler, name); return btrfs_setxattr_trans(inode, name, buffer, size, flags); } diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index df1aace5df50..9385d0bb276d 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -74,7 +74,7 @@ static struct list_head *zlib_alloc_workspace(unsigned int level) workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), zlib_inflate_workspacesize()); - workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL); + workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL); workspace->level = level; workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!workspace->strm.workspace || !workspace->buf) |