aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/backref.c116
-rw-r--r--fs/btrfs/block-group.c1
-rw-r--r--fs/btrfs/block-rsv.c5
-rw-r--r--fs/btrfs/block-rsv.h16
-rw-r--r--fs/btrfs/ctree.c34
-rw-r--r--fs/btrfs/ctree.h4
-rw-r--r--fs/btrfs/delalloc-space.c3
-rw-r--r--fs/btrfs/delayed-inode.c22
-rw-r--r--fs/btrfs/dev-replace.c29
-rw-r--r--fs/btrfs/dir-item.c122
-rw-r--r--fs/btrfs/disk-io.c62
-rw-r--r--fs/btrfs/export.c11
-rw-r--r--fs/btrfs/export.h2
-rw-r--r--fs/btrfs/extent-tree.c19
-rw-r--r--fs/btrfs/extent_io.c7
-rw-r--r--fs/btrfs/file-item.c4
-rw-r--r--fs/btrfs/free-space-cache.c7
-rw-r--r--fs/btrfs/inode.c60
-rw-r--r--fs/btrfs/ioctl.c94
-rw-r--r--fs/btrfs/print-tree.c6
-rw-r--r--fs/btrfs/qgroup.c197
-rw-r--r--fs/btrfs/qgroup.h1
-rw-r--r--fs/btrfs/raid56.c74
-rw-r--r--fs/btrfs/rcu-string.h6
-rw-r--r--fs/btrfs/ref-verify.c6
-rw-r--r--fs/btrfs/relocation.c14
-rw-r--r--fs/btrfs/root-tree.c5
-rw-r--r--fs/btrfs/scrub.c36
-rw-r--r--fs/btrfs/send.c44
-rw-r--r--fs/btrfs/super.c4
-rw-r--r--fs/btrfs/sysfs.c7
-rw-r--r--fs/btrfs/tests/btrfs-tests.c2
-rw-r--r--fs/btrfs/tests/qgroup-tests.c36
-rw-r--r--fs/btrfs/transaction.c17
-rw-r--r--fs/btrfs/tree-checker.c27
-rw-r--r--fs/btrfs/tree-log.c34
-rw-r--r--fs/btrfs/volumes.c115
-rw-r--r--fs/btrfs/volumes.h3
-rw-r--r--fs/btrfs/xattr.c3
-rw-r--r--fs/btrfs/zlib.c2
40 files changed, 913 insertions, 344 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index c701a19fac53..16e5e07cb5d1 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -136,6 +136,7 @@ struct share_check {
u64 root_objectid;
u64 inum;
int share_count;
+ bool have_delayed_delete_refs;
};
static inline int extent_is_shared(struct share_check *sc)
@@ -286,8 +287,10 @@ static void prelim_release(struct preftree *preftree)
struct prelim_ref *ref, *next_ref;
rbtree_postorder_for_each_entry_safe(ref, next_ref,
- &preftree->root.rb_root, rbnode)
+ &preftree->root.rb_root, rbnode) {
+ free_inode_elem_list(ref->inode_list);
free_pref(ref);
+ }
preftree->root = RB_ROOT_CACHED;
preftree->count = 0;
@@ -428,6 +431,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
u64 wanted_disk_byte = ref->wanted_disk_byte;
u64 count = 0;
u64 data_offset;
+ u8 type;
if (level != 0) {
eb = path->nodes[level];
@@ -482,6 +486,9 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
continue;
}
fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(eb, fi);
+ if (type == BTRFS_FILE_EXTENT_INLINE)
+ goto next;
disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
data_offset = btrfs_file_extent_offset(eb, fi);
@@ -641,6 +648,18 @@ unode_aux_to_inode_list(struct ulist_node *node)
return (struct extent_inode_elem *)(uintptr_t)node->aux;
}
+static void free_leaf_list(struct ulist *ulist)
+{
+ struct ulist_node *node;
+ struct ulist_iterator uiter;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((node = ulist_next(ulist, &uiter)))
+ free_inode_elem_list(unode_aux_to_inode_list(node));
+
+ ulist_free(ulist);
+}
+
/*
* We maintain three separate rbtrees: one for direct refs, one for
* indirect refs which have a key, and one for indirect refs which do not
@@ -755,7 +774,11 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
cond_resched();
}
out:
- ulist_free(parents);
+ /*
+ * We may have inode lists attached to refs in the parents ulist, so we
+ * must free them before freeing the ulist and its refs.
+ */
+ free_leaf_list(parents);
return ret;
}
@@ -812,16 +835,11 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
struct preftrees *preftrees, struct share_check *sc)
{
struct btrfs_delayed_ref_node *node;
- struct btrfs_delayed_extent_op *extent_op = head->extent_op;
struct btrfs_key key;
- struct btrfs_key tmp_op_key;
struct rb_node *n;
int count;
int ret = 0;
- if (extent_op && extent_op->update_key)
- btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
-
spin_lock(&head->lock);
for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) {
node = rb_entry(n, struct btrfs_delayed_ref_node,
@@ -847,10 +865,16 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
case BTRFS_TREE_BLOCK_REF_KEY: {
/* NORMAL INDIRECT METADATA backref */
struct btrfs_delayed_tree_ref *ref;
+ struct btrfs_key *key_ptr = NULL;
+
+ if (head->extent_op && head->extent_op->update_key) {
+ btrfs_disk_key_to_cpu(&key, &head->extent_op->key);
+ key_ptr = &key;
+ }
ref = btrfs_delayed_node_to_tree_ref(node);
ret = add_indirect_ref(fs_info, preftrees, ref->root,
- &tmp_op_key, ref->level + 1,
+ key_ptr, ref->level + 1,
node->bytenr, count, sc,
GFP_ATOMIC);
break;
@@ -876,13 +900,22 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
key.offset = ref->offset;
/*
- * Found a inum that doesn't match our known inum, we
- * know it's shared.
+ * If we have a share check context and a reference for
+ * another inode, we can't exit immediately. This is
+ * because even if this is a BTRFS_ADD_DELAYED_REF
+ * reference we may find next a BTRFS_DROP_DELAYED_REF
+ * which cancels out this ADD reference.
+ *
+ * If this is a DROP reference and there was no previous
+ * ADD reference, then we need to signal that when we
+ * process references from the extent tree (through
+ * add_inline_refs() and add_keyed_refs()), we should
+ * not exit early if we find a reference for another
+ * inode, because one of the delayed DROP references
+ * may cancel that reference in the extent tree.
*/
- if (sc && sc->inum && ref->objectid != sc->inum) {
- ret = BACKREF_FOUND_SHARED;
- goto out;
- }
+ if (sc && count < 0)
+ sc->have_delayed_delete_refs = true;
ret = add_indirect_ref(fs_info, preftrees, ref->root,
&key, 0, node->bytenr, count, sc,
@@ -912,7 +945,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
}
if (!ret)
ret = extent_is_shared(sc);
-out:
+
spin_unlock(&head->lock);
return ret;
}
@@ -1015,7 +1048,8 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (sc && sc->inum && key.objectid != sc->inum) {
+ if (sc && sc->inum && key.objectid != sc->inum &&
+ !sc->have_delayed_delete_refs) {
ret = BACKREF_FOUND_SHARED;
break;
}
@@ -1025,6 +1059,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
ret = add_indirect_ref(fs_info, preftrees, root,
&key, 0, bytenr, count,
sc, GFP_NOFS);
+
break;
}
default:
@@ -1114,7 +1149,8 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (sc && sc->inum && key.objectid != sc->inum) {
+ if (sc && sc->inum && key.objectid != sc->inum &&
+ !sc->have_delayed_delete_refs) {
ret = BACKREF_FOUND_SHARED;
break;
}
@@ -1353,6 +1389,12 @@ again:
if (ret < 0)
goto out;
ref->inode_list = eie;
+ /*
+ * We transferred the list ownership to the ref,
+ * so set to NULL to avoid a double free in case
+ * an error happens after this.
+ */
+ eie = NULL;
}
ret = ulist_add_merge_ptr(refs, ref->parent,
ref->inode_list,
@@ -1378,6 +1420,14 @@ again:
eie->next = ref->inode_list;
}
eie = NULL;
+ /*
+ * We have transferred the inode list ownership from
+ * this ref to the ref we added to the 'refs' ulist.
+ * So set this ref's inode list to NULL to avoid
+ * use-after-free when our caller uses it or double
+ * frees in case an error happens before we return.
+ */
+ ref->inode_list = NULL;
}
cond_resched();
}
@@ -1394,24 +1444,6 @@ out:
return ret;
}
-static void free_leaf_list(struct ulist *blocks)
-{
- struct ulist_node *node = NULL;
- struct extent_inode_elem *eie;
- struct ulist_iterator uiter;
-
- ULIST_ITER_INIT(&uiter);
- while ((node = ulist_next(blocks, &uiter))) {
- if (!node->aux)
- continue;
- eie = unode_aux_to_inode_list(node);
- free_inode_elem_list(eie);
- node->aux = 0;
- }
-
- ulist_free(blocks);
-}
-
/*
* Finds all leafs with a reference to the specified combination of bytenr and
* offset. key_list_head will point to a list of corresponding keys (caller must
@@ -1537,6 +1569,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
.root_objectid = root->root_key.objectid,
.inum = inum,
.share_count = 0,
+ .have_delayed_delete_refs = false,
};
ulist_init(roots);
@@ -1571,6 +1604,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
break;
bytenr = node->val;
shared.share_count = 0;
+ shared.have_delayed_delete_refs = false;
cond_resched();
}
@@ -2257,20 +2291,14 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)
size_t alloc_bytes;
alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
- data = kvmalloc(alloc_bytes, GFP_KERNEL);
+ data = kvzalloc(alloc_bytes, GFP_KERNEL);
if (!data)
return ERR_PTR(-ENOMEM);
- if (total_bytes >= sizeof(*data)) {
+ if (total_bytes >= sizeof(*data))
data->bytes_left = total_bytes - sizeof(*data);
- data->bytes_missing = 0;
- } else {
+ else
data->bytes_missing = sizeof(*data) - total_bytes;
- data->bytes_left = 0;
- }
-
- data->elem_cnt = 0;
- data->elem_missed = 0;
return data;
}
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index bcf19dfb0af3..278933cd3a09 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2938,6 +2938,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
* attempt.
*/
wait_for_alloc = true;
+ force = CHUNK_ALLOC_NO_FORCE;
spin_unlock(&space_info->lock);
mutex_lock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->chunk_mutex);
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 343400d49bd1..63205d2f4d84 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -29,7 +29,8 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
} else {
num_bytes = 0;
}
- if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
+ if (qgroup_to_release_ret &&
+ block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
qgroup_to_release = block_rsv->qgroup_rsv_reserved -
block_rsv->qgroup_rsv_size;
block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
@@ -391,7 +392,7 @@ struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
block_rsv = get_block_rsv(trans, root);
- if (unlikely(block_rsv->size == 0))
+ if (unlikely(btrfs_block_rsv_size(block_rsv) == 0))
goto try_reserve;
again:
ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index d1428bb73fc5..69770360917c 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -98,4 +98,20 @@ static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
btrfs_block_rsv_release(fs_info, block_rsv, 0);
}
+/*
+ * Get the size of a block reserve in a context where getting a stale value is
+ * acceptable, instead of accessing it directly and trigger data race warning
+ * from KCSAN.
+ */
+static inline u64 btrfs_block_rsv_size(struct btrfs_block_rsv *rsv)
+{
+ u64 ret;
+
+ spin_lock(&rsv->lock);
+ ret = rsv->size;
+ spin_unlock(&rsv->lock);
+
+ return ret;
+}
+
#endif /* BTRFS_BLOCK_RSV_H */
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 822c615840e8..608e41b61689 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -3589,6 +3589,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
if (ret) {
+ btrfs_tree_unlock(split);
+ free_extent_buffer(split);
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -5138,10 +5140,12 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
struct btrfs_key key;
+ struct btrfs_key orig_key;
struct btrfs_disk_key found_key;
int ret;
btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
+ orig_key = key;
if (key.offset > 0) {
key.offset--;
@@ -5158,8 +5162,36 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
+ if (ret <= 0)
return ret;
+
+ /*
+ * Previous key not found. Even if we were at slot 0 of the leaf we had
+ * before releasing the path and calling btrfs_search_slot(), we now may
+ * be in a slot pointing to the same original key - this can happen if
+ * after we released the path, one of more items were moved from a
+ * sibling leaf into the front of the leaf we had due to an insertion
+ * (see push_leaf_right()).
+ * If we hit this case and our slot is > 0 and just decrement the slot
+ * so that the caller does not process the same key again, which may or
+ * may not break the caller, depending on its logic.
+ */
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+ btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
+ ret = comp_keys(&found_key, &orig_key);
+ if (ret == 0) {
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ return 0;
+ }
+ /*
+ * At slot 0, same key as before, it means orig_key is
+ * the lowest, leftmost, key in the tree. We're done.
+ */
+ return 1;
+ }
+ }
+
btrfs_item_key(path->nodes[0], &found_key, 0);
ret = comp_keys(&found_key, &key);
/*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index cd77c0621a55..b141a7ba4507 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -482,8 +482,6 @@ struct btrfs_swapfile_pin {
bool is_block_group;
};
-bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
-
enum {
BTRFS_FS_BARRIER,
BTRFS_FS_CLOSING_START,
@@ -2727,7 +2725,7 @@ struct btrfs_dir_item *
btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
- u64 objectid, const char *name, int name_len,
+ u64 index, const char *name, int name_len,
int mod);
struct btrfs_dir_item *
btrfs_search_dir_index_item(struct btrfs_root *root,
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index f4f531c4aa96..d3f16dc33d0f 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -324,9 +324,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
} else {
if (current->journal_info)
flush = BTRFS_RESERVE_FLUSH_LIMIT;
-
- if (btrfs_transaction_in_commit(fs_info))
- schedule_timeout(1);
}
if (delalloc_lock)
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index e96890475bac..95afe5ef7500 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1137,6 +1137,9 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
if (ret)
return ret;
+ ret = btrfs_record_root_in_trans(trans, node->root);
+ if (ret)
+ return ret;
ret = btrfs_update_delayed_inode(trans, node->root, path, node);
return ret;
}
@@ -1175,20 +1178,33 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
ret = __btrfs_commit_inode_delayed_items(trans, path,
curr_node);
if (ret) {
- btrfs_release_delayed_node(curr_node);
- curr_node = NULL;
btrfs_abort_transaction(trans, ret);
break;
}
prev_node = curr_node;
curr_node = btrfs_next_delayed_node(curr_node);
+ /*
+ * See the comment below about releasing path before releasing
+ * node. If the commit of delayed items was successful the path
+ * should always be released, but in case of an error, it may
+ * point to locked extent buffers (a leaf at the very least).
+ */
+ ASSERT(path->nodes[0] == NULL);
btrfs_release_delayed_node(prev_node);
}
+ /*
+ * Release the path to avoid a potential deadlock and lockdep splat when
+ * releasing the delayed node, as that requires taking the delayed node's
+ * mutex. If another task starts running delayed items before we take
+ * the mutex, it will first lock the mutex and then it may try to lock
+ * the same btree path (leaf).
+ */
+ btrfs_free_path(path);
+
if (curr_node)
btrfs_release_delayed_node(curr_node);
- btrfs_free_path(path);
trans->block_rsv = block_rsv;
return ret;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 1cb7f5d79765..4abc0db6527e 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -125,7 +125,7 @@ no_valid_dev_replace_entry_found:
if (btrfs_find_device(fs_info->fs_devices,
BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) {
btrfs_err(fs_info,
- "replace devid present without an active replace item");
+"replace without active item, run 'device scan --forget' on the target device");
ret = -EUCLEAN;
} else {
dev_replace->srcdev = NULL;
@@ -535,6 +535,23 @@ leave:
return ret;
}
+static int btrfs_check_replace_dev_names(struct btrfs_ioctl_dev_replace_args *args)
+{
+ if (args->start.srcdevid == 0) {
+ if (memchr(args->start.srcdev_name, 0,
+ sizeof(args->start.srcdev_name)) == NULL)
+ return -ENAMETOOLONG;
+ } else {
+ args->start.srcdev_name[0] = 0;
+ }
+
+ if (memchr(args->start.tgtdev_name, 0,
+ sizeof(args->start.tgtdev_name)) == NULL)
+ return -ENAMETOOLONG;
+
+ return 0;
+}
+
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args)
{
@@ -547,10 +564,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
default:
return -EINVAL;
}
-
- if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
- args->start.tgtdev_name[0] == '\0')
- return -EINVAL;
+ ret = btrfs_check_replace_dev_names(args);
+ if (ret < 0)
+ return ret;
ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
args->start.srcdevid,
@@ -918,8 +934,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
up_write(&dev_replace->rwsem);
/* Scrub for replace must not be running in suspended state */
- ret = btrfs_scrub_cancel(fs_info);
- ASSERT(ret != -ENOTCONN);
+ btrfs_scrub_cancel(fs_info);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 863367c2c620..98c6faa8ce15 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -171,10 +171,40 @@ out_free:
return 0;
}
+static struct btrfs_dir_item *btrfs_lookup_match_dir(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *key, const char *name,
+ int name_len, int mod)
+{
+ const int ins_len = (mod < 0 ? -1 : 0);
+ const int cow = (mod != 0);
+ int ret;
+
+ ret = btrfs_search_slot(trans, root, key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return ERR_PTR(-ENOENT);
+
+ return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+}
+
/*
- * lookup a directory item based on name. 'dir' is the objectid
- * we're searching in, and 'mod' tells us if you plan on deleting the
- * item (use mod < 0) or changing the options (use mod > 0)
+ * Lookup for a directory item by name.
+ *
+ * @trans: The transaction handle to use. Can be NULL if @mod is 0.
+ * @root: The root of the target tree.
+ * @path: Path to use for the search.
+ * @dir: The inode number (objectid) of the directory.
+ * @name: The name associated to the directory entry we are looking for.
+ * @name_len: The length of the name.
+ * @mod: Used to indicate if the tree search is meant for a read only
+ * lookup, for a modification lookup or for a deletion lookup, so
+ * its value should be 0, 1 or -1, respectively.
+ *
+ * Returns: NULL if the dir item does not exists, an error pointer if an error
+ * happened, or a pointer to a dir item if a dir item exists for the given name.
*/
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -182,23 +212,18 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
const char *name, int name_len,
int mod)
{
- int ret;
struct btrfs_key key;
- int ins_len = mod < 0 ? -1 : 0;
- int cow = mod != 0;
+ struct btrfs_dir_item *di;
key.objectid = dir;
key.type = BTRFS_DIR_ITEM_KEY;
-
key.offset = btrfs_name_hash(name, name_len);
- ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
return NULL;
- return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+ return di;
}
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
@@ -212,7 +237,6 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
int slot;
struct btrfs_path *path;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -221,20 +245,20 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-
- /* return back any errors */
- if (ret < 0)
- goto out;
+ di = btrfs_lookup_match_dir(NULL, root, path, &key, name, name_len, 0);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ /* Nothing found, we're safe */
+ if (ret == -ENOENT) {
+ ret = 0;
+ goto out;
+ }
- /* nothing found, we're safe */
- if (ret > 0) {
- ret = 0;
- goto out;
+ if (ret < 0)
+ goto out;
}
/* we found an item, look for our name in the item */
- di = btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
if (di) {
/* our exact name was found */
ret = -EEXIST;
@@ -261,35 +285,42 @@ out:
}
/*
- * lookup a directory item based on index. 'dir' is the objectid
- * we're searching in, and 'mod' tells us if you plan on deleting the
- * item (use mod < 0) or changing the options (use mod > 0)
+ * Lookup for a directory index item by name and index number.
+ *
+ * @trans: The transaction handle to use. Can be NULL if @mod is 0.
+ * @root: The root of the target tree.
+ * @path: Path to use for the search.
+ * @dir: The inode number (objectid) of the directory.
+ * @index: The index number.
+ * @name: The name associated to the directory entry we are looking for.
+ * @name_len: The length of the name.
+ * @mod: Used to indicate if the tree search is meant for a read only
+ * lookup, for a modification lookup or for a deletion lookup, so
+ * its value should be 0, 1 or -1, respectively.
*
- * The name is used to make sure the index really points to the name you were
- * looking for.
+ * Returns: NULL if the dir index item does not exists, an error pointer if an
+ * error happened, or a pointer to a dir item if the dir index item exists and
+ * matches the criteria (name and index number).
*/
struct btrfs_dir_item *
btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
- u64 objectid, const char *name, int name_len,
+ u64 index, const char *name, int name_len,
int mod)
{
- int ret;
+ struct btrfs_dir_item *di;
struct btrfs_key key;
- int ins_len = mod < 0 ? -1 : 0;
- int cow = mod != 0;
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
- key.offset = objectid;
+ key.offset = index;
- ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
- return ERR_PTR(-ENOENT);
- return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ if (di == ERR_PTR(-ENOENT))
+ return NULL;
+
+ return di;
}
struct btrfs_dir_item *
@@ -346,21 +377,18 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
const char *name, u16 name_len,
int mod)
{
- int ret;
struct btrfs_key key;
- int ins_len = mod < 0 ? -1 : 0;
- int cow = mod != 0;
+ struct btrfs_dir_item *di;
key.objectid = dir;
key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
- ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
+
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
return NULL;
- return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+ return di;
}
/*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f18c6d97932e..b4ed11b5f148 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2239,6 +2239,23 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
fs_info->csum_shash = csum_shash;
+ /*
+ * Check if the checksum implementation is a fast accelerated one.
+ * As-is this is a bit of a hack and should be replaced once the csum
+ * implementations provide that information themselves.
+ */
+ switch (csum_type) {
+ case BTRFS_CSUM_TYPE_CRC32:
+ if (!strstr(crypto_shash_driver_name(csum_shash), "generic"))
+ set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
+ break;
+ default:
+ break;
+ }
+
+ btrfs_info(fs_info, "using %s (%s) checksum algorithm",
+ btrfs_super_csum_name(csum_type),
+ crypto_shash_driver_name(csum_shash));
return 0;
}
@@ -2463,21 +2480,18 @@ static int validate_super(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
- if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
- BTRFS_FSID_SIZE)) {
+ if (memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
"superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
- fs_info->super_copy->fsid, fs_info->fs_devices->fsid);
+ sb->fsid, fs_info->fs_devices->fsid);
ret = -EINVAL;
}
- if (btrfs_fs_incompat(fs_info, METADATA_UUID) &&
- memcmp(fs_info->fs_devices->metadata_uuid,
- fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) {
+ if (memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb),
+ BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
- fs_info->super_copy->metadata_uuid,
- fs_info->fs_devices->metadata_uuid);
+ btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid);
ret = -EINVAL;
}
@@ -2927,7 +2941,7 @@ int open_ctree(struct super_block *sb,
~BTRFS_FEATURE_INCOMPAT_SUPP;
if (features) {
btrfs_err(fs_info,
- "cannot mount because of unsupported optional features (%llx)",
+ "cannot mount because of unsupported optional features (0x%llx)",
features);
err = -EINVAL;
goto fail_csum;
@@ -2965,11 +2979,25 @@ int open_ctree(struct super_block *sb,
~BTRFS_FEATURE_COMPAT_RO_SUPP;
if (!sb_rdonly(sb) && features) {
btrfs_err(fs_info,
- "cannot mount read-write because of unsupported optional features (%llx)",
+ "cannot mount read-write because of unsupported optional features (0x%llx)",
features);
err = -EINVAL;
goto fail_csum;
}
+ /*
+ * We have unsupported RO compat features, although RO mounted, we
+ * should not cause any metadata write, including log replay.
+ * Or we could screw up whatever the new feature requires.
+ */
+ if (unlikely(features && btrfs_super_log_root(disk_super) &&
+ !btrfs_test_opt(fs_info, NOLOGREPLAY))) {
+ btrfs_err(fs_info,
+"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
+ features);
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+
ret = btrfs_init_workqueues(fs_info, fs_devices);
if (ret) {
@@ -4075,6 +4103,11 @@ void close_ctree(struct btrfs_fs_info *fs_info)
ASSERT(list_empty(&fs_info->delayed_iputs));
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
+ if (btrfs_check_quota_leak(fs_info)) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_err(fs_info, "qgroup reserved space leaked");
+ }
+
btrfs_free_qgroup_config(fs_info);
ASSERT(list_empty(&fs_info->delalloc_roots));
@@ -4377,7 +4410,11 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
*/
inode = igrab(&btrfs_inode->vfs_inode);
if (inode) {
+ unsigned int nofs_flag;
+
+ nofs_flag = memalloc_nofs_save();
invalidate_inode_pages2(inode->i_mapping);
+ memalloc_nofs_restore(nofs_flag);
iput(inode);
}
spin_lock(&root->delalloc_lock);
@@ -4495,7 +4532,12 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache)
inode = cache->io_ctl.inode;
if (inode) {
+ unsigned int nofs_flag;
+
+ nofs_flag = memalloc_nofs_save();
invalidate_inode_pages2(inode->i_mapping);
+ memalloc_nofs_restore(nofs_flag);
+
BTRFS_I(inode)->generation = 0;
cache->io_ctl.inode = NULL;
iput(inode);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 93cceeba484c..08d1d456e2f0 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -58,7 +58,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
}
struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation,
+ u64 root_objectid, u64 generation,
int check_generation)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -182,8 +182,15 @@ struct dentry *btrfs_get_parent(struct dentry *child)
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto fail;
+ if (ret == 0) {
+ /*
+ * Key with offset of -1 found, there would have to exist an
+ * inode with such number or a root with such id.
+ */
+ ret = -EUCLEAN;
+ goto fail;
+ }
- BUG_ON(ret == 0); /* Key with offset of -1 found */
if (path->slots[0] == 0) {
ret = -ENOENT;
goto fail;
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
index f32f4113c976..5afb7ca42828 100644
--- a/fs/btrfs/export.h
+++ b/fs/btrfs/export.h
@@ -19,7 +19,7 @@ struct btrfs_fid {
} __attribute__ ((packed));
struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation,
+ u64 root_objectid, u64 generation,
int check_generation);
struct dentry *btrfs_get_parent(struct dentry *child);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 19d2104c0462..a28b0eafb65a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -895,6 +895,11 @@ again:
err = -ENOENT;
goto out;
} else if (WARN_ON(ret)) {
+ btrfs_print_leaf(path->nodes[0]);
+ btrfs_err(fs_info,
+"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
+ bytenr, num_bytes, parent, root_objectid, owner,
+ offset);
err = -EIO;
goto out;
}
@@ -1238,7 +1243,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
u64 bytes_left, end;
u64 aligned_start = ALIGN(start, 1 << 9);
- if (WARN_ON(start != aligned_start)) {
+ /* Adjust the range to be aligned to 512B sectors if necessary. */
+ if (start != aligned_start) {
len -= aligned_start - start;
len = round_down(len, 1 << 9);
start = aligned_start;
@@ -1676,12 +1682,12 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
parent = ref->parent;
ref_root = ref->root;
- if (node->ref_mod != 1) {
+ if (unlikely(node->ref_mod != 1)) {
btrfs_err(trans->fs_info,
- "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
+ "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu",
node->bytenr, node->ref_mod, node->action, ref_root,
parent);
- return -EIO;
+ return -EUCLEAN;
}
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
BUG_ON(!extent_op || !extent_op->update_flags);
@@ -3989,8 +3995,11 @@ have_block_group:
ret = 0;
}
- if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
+ if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
+ if (!cache_block_group_error)
+ cache_block_group_error = -EIO;
goto loop;
+ }
/*
* Ok we want to try and use the cluster allocator, so
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 95ddeb477797..04788940afaf 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4024,11 +4024,12 @@ retry:
free_extent_buffer(eb);
/*
- * the filesystem may choose to bump up nr_to_write.
+ * The filesystem may choose to bump up nr_to_write.
* We have to make sure to honor the new nr_to_write
- * at any time
+ * at any time.
*/
- nr_to_write_done = wbc->nr_to_write <= 0;
+ nr_to_write_done = (wbc->sync_mode == WB_SYNC_NONE &&
+ wbc->nr_to_write <= 0);
}
pagevec_release(&pvec);
cond_resched();
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 61b82c69eed5..1a7183cdfe95 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -499,7 +499,9 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
sums = kvzalloc(btrfs_ordered_sum_size(fs_info,
bytes_left), GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
- BUG_ON(!sums); /* -ENOMEM */
+ if (!sums)
+ return BLK_STS_RESOURCE;
+
sums->len = bytes_left;
ordered = btrfs_lookup_ordered_extent(inode,
offset);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d2d32fed8f2e..0cb93f73acb2 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -784,15 +784,16 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
}
spin_lock(&ctl->tree_lock);
ret = link_free_space(ctl, e);
- ctl->total_bitmaps++;
- ctl->op->recalc_thresholds(ctl);
- spin_unlock(&ctl->tree_lock);
if (ret) {
+ spin_unlock(&ctl->tree_lock);
btrfs_err(fs_info,
"Duplicate entries in free space cache, dumping");
kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
}
+ ctl->total_bitmaps++;
+ ctl->op->recalc_thresholds(ctl);
+ spin_unlock(&ctl->tree_lock);
list_add_tail(&e->list, &bitmaps);
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7755a0362a3a..c89e85a7da7d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6830,7 +6830,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_ino(root, &objectid);
+ err = btrfs_find_free_objectid(root, &objectid);
if (err)
goto out_unlock;
@@ -6894,7 +6894,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_ino(root, &objectid);
+ err = btrfs_find_free_objectid(root, &objectid);
if (err)
goto out_unlock;
@@ -7039,7 +7039,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_ino(root, &objectid);
+ err = btrfs_find_free_objectid(root, &objectid);
if (err)
goto out_fail;
@@ -9751,8 +9751,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(root);
- root_log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
@@ -9768,8 +9766,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(dest);
- dest_log_pinned = true;
ret = btrfs_insert_inode_ref(trans, root,
old_dentry->d_name.name,
old_dentry->d_name.len,
@@ -9797,6 +9793,29 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode), 1);
}
+ /*
+ * Now pin the logs of the roots. We do it to ensure that no other task
+ * can sync the logs while we are in progress with the rename, because
+ * that could result in an inconsistency in case any of the inodes that
+ * are part of this rename operation were logged before.
+ *
+ * We pin the logs even if at this precise moment none of the inodes was
+ * logged before. This is because right after we checked for that, some
+ * other task fsyncing some other inode not involved with this rename
+ * operation could log that one of our inodes exists.
+ *
+ * We don't need to pin the logs before the above calls to
+ * btrfs_insert_inode_ref(), since those don't ever need to change a log.
+ */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_pin_log_trans(root);
+ root_log_pinned = true;
+ }
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_pin_log_trans(dest);
+ dest_log_pinned = true;
+ }
+
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
@@ -9911,7 +9930,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
u64 objectid;
u64 index;
- ret = btrfs_find_free_ino(root, &objectid);
+ ret = btrfs_find_free_objectid(root, &objectid);
if (ret)
return ret;
@@ -10046,8 +10065,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(root);
- log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
@@ -10071,6 +10088,25 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else {
+ /*
+ * Now pin the log. We do it to ensure that no other task can
+ * sync the log while we are in progress with the rename, as
+ * that could result in an inconsistency in case any of the
+ * inodes that are part of this rename operation were logged
+ * before.
+ *
+ * We pin the log even if at this precise moment none of the
+ * inodes was logged before. This is because right after we
+ * checked for that, some other task fsyncing some other inode
+ * not involved with this rename operation could log that one of
+ * our inodes exists.
+ *
+ * We don't need to pin the logs before the above call to
+ * btrfs_insert_inode_ref(), since that does not need to change
+ * a log.
+ */
+ btrfs_pin_log_trans(root);
+ log_pinned = true;
ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
old_dentry->d_name.name,
@@ -10380,7 +10416,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_ino(root, &objectid);
+ err = btrfs_find_free_objectid(root, &objectid);
if (err)
goto out_unlock;
@@ -10663,7 +10699,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_find_free_ino(root, &objectid);
+ ret = btrfs_find_free_objectid(root, &objectid);
if (ret)
goto out;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 675112aa998f..674d774eb662 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1847,6 +1847,15 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
* are limited to own subvolumes only
*/
ret = -EPERM;
+ } else if (btrfs_ino(BTRFS_I(src_inode)) != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * Snapshots must be made with the src_inode referring
+ * to the subvolume inode, otherwise the permission
+ * checking above is useless because we may have
+ * permission on a lower directory but not the subvol
+ * itself.
+ */
+ ret = -EINVAL;
} else {
ret = btrfs_mksubvol(&file->f_path, name, namelen,
BTRFS_I(src_inode)->root,
@@ -2110,7 +2119,7 @@ static noinline int key_in_sk(struct btrfs_key *key,
static noinline int copy_to_sk(struct btrfs_path *path,
struct btrfs_key *key,
struct btrfs_ioctl_search_key *sk,
- size_t *buf_size,
+ u64 *buf_size,
char __user *ubuf,
unsigned long *sk_offset,
int *num_found)
@@ -2242,7 +2251,7 @@ out:
static noinline int search_ioctl(struct inode *inode,
struct btrfs_ioctl_search_key *sk,
- size_t *buf_size,
+ u64 *buf_size,
char __user *ubuf)
{
struct btrfs_fs_info *info = btrfs_sb(inode->i_sb);
@@ -2314,7 +2323,7 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,
struct btrfs_ioctl_search_key sk;
struct inode *inode;
int ret;
- size_t buf_size;
+ u64 buf_size;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -2348,8 +2357,8 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
struct btrfs_ioctl_search_args_v2 args;
struct inode *inode;
int ret;
- size_t buf_size;
- const size_t buf_limit = SZ_16M;
+ u64 buf_size;
+ const u64 buf_limit = SZ_16M;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -2824,6 +2833,8 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
}
}
+ btrfs_free_path(path);
+ path = NULL;
if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
ret = -EFAULT;
@@ -2914,6 +2925,8 @@ static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp)
}
out:
+ btrfs_free_path(path);
+
if (!ret || ret == -EOVERFLOW) {
rootrefs->num_items = found;
/* update min_treeid for next search */
@@ -2925,7 +2938,6 @@ out:
}
kfree(rootrefs);
- btrfs_free_path(path);
return ret;
}
@@ -3045,7 +3057,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
{
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_ioctl_defrag_range_args *range;
+ struct btrfs_ioctl_defrag_range_args range = {0};
int ret;
ret = mnt_want_write_file(file);
@@ -3077,33 +3089,28 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
goto out;
}
- range = kzalloc(sizeof(*range), GFP_KERNEL);
- if (!range) {
- ret = -ENOMEM;
- goto out;
- }
-
if (argp) {
- if (copy_from_user(range, argp,
- sizeof(*range))) {
+ if (copy_from_user(&range, argp, sizeof(range))) {
ret = -EFAULT;
- kfree(range);
+ goto out;
+ }
+ if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) {
+ ret = -EOPNOTSUPP;
goto out;
}
/* compression requires us to start the IO */
- if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
- range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
- range->extent_thresh = (u32)-1;
+ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+ range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
+ range.extent_thresh = (u32)-1;
}
} else {
/* the rest are all set to zero by kzalloc */
- range->len = (u64)-1;
+ range.len = (u64)-1;
}
ret = btrfs_defrag_file(file_inode(file), file,
- range, BTRFS_OLDEST_GENERATION, 0);
+ &range, BTRFS_OLDEST_GENERATION, 0);
if (ret > 0)
ret = 0;
- kfree(range);
break;
default:
ret = -EINVAL;
@@ -3296,13 +3303,10 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
di_args->bytes_used = btrfs_device_get_bytes_used(dev);
di_args->total_bytes = btrfs_device_get_total_bytes(dev);
memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
- if (dev->name) {
- strncpy(di_args->path, rcu_str_deref(dev->name),
- sizeof(di_args->path) - 1);
- di_args->path[sizeof(di_args->path) - 1] = 0;
- } else {
+ if (dev->name)
+ strscpy(di_args->path, rcu_str_deref(dev->name), sizeof(di_args->path));
+ else
di_args->path[0] = '\0';
- }
out:
rcu_read_unlock();
@@ -4143,7 +4147,7 @@ static void get_block_group_info(struct list_head *groups_list,
static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
void __user *arg)
{
- struct btrfs_ioctl_space_args space_args;
+ struct btrfs_ioctl_space_args space_args = { 0 };
struct btrfs_ioctl_space_info space;
struct btrfs_ioctl_space_info *dest;
struct btrfs_ioctl_space_info *dest_orig;
@@ -4339,6 +4343,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
if (IS_ERR(sa))
return PTR_ERR(sa);
+ if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
ret = mnt_want_write_file(file);
if (ret)
@@ -4515,6 +4524,8 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
ipath->fspath->val[i] = rel_ptr;
}
+ btrfs_free_path(path);
+ path = NULL;
ret = copy_to_user((void __user *)(unsigned long)ipa->fspath,
ipath->fspath, size);
if (ret) {
@@ -4585,21 +4596,20 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
size = min_t(u32, loi->size, SZ_16M);
}
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
-
inodes = init_data_container(size);
if (IS_ERR(inodes)) {
ret = PTR_ERR(inodes);
- inodes = NULL;
- goto out;
+ goto out_loi;
}
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
build_ino_list, inodes, ignore_offset);
+ btrfs_free_path(path);
if (ret == -EINVAL)
ret = -ENOENT;
if (ret < 0)
@@ -4611,7 +4621,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
ret = -EFAULT;
out:
- btrfs_free_path(path);
kvfree(inodes);
out_loi:
kfree(loi);
@@ -4912,7 +4921,9 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
}
/* update qgroup status and info */
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
err = btrfs_run_qgroups(trans);
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (err < 0)
btrfs_handle_fs_error(fs_info, err,
"failed to update qgroup status and info");
@@ -4954,6 +4965,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
goto out;
}
+ if (sa->create && is_fstree(sa->qgroupid)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -5508,7 +5524,7 @@ static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat)
if (compat) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
- struct btrfs_ioctl_send_args_32 args32;
+ struct btrfs_ioctl_send_args_32 args32 = { 0 };
ret = copy_from_user(&args32, argp, sizeof(args32));
if (ret)
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index f4edadf1067f..cec6c283a691 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -109,10 +109,10 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
pr_cont("shared data backref parent %llu count %u\n",
offset, btrfs_shared_data_ref_count(eb, sref));
/*
- * offset is supposed to be a tree block which
- * must be aligned to nodesize.
+ * Offset is supposed to be a tree block which must be
+ * aligned to sectorsize.
*/
- if (!IS_ALIGNED(offset, eb->fs_info->nodesize))
+ if (!IS_ALIGNED(offset, eb->fs_info->sectorsize))
pr_info(
"\t\t\t(parent %llu not aligned to sectorsize %u)\n",
offset, eb->fs_info->sectorsize);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 5a3006c75d63..d6184b6206ff 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -504,6 +504,49 @@ out:
return ret < 0 ? ret : 0;
}
+static u64 btrfs_qgroup_subvolid(u64 qgroupid)
+{
+ return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
+}
+
+/*
+ * Called in close_ctree() when quota is still enabled. This verifies we don't
+ * leak some reserved space.
+ *
+ * Return false if no reserved space is left.
+ * Return true if some reserved space is leaked.
+ */
+bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *node;
+ bool ret = false;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return ret;
+ /*
+ * Since we're unmounting, there is no race and no need to grab qgroup
+ * lock. And here we don't go post-order to provide a more user
+ * friendly sorted result.
+ */
+ for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
+ struct btrfs_qgroup *qgroup;
+ int i;
+
+ qgroup = rb_entry(node, struct btrfs_qgroup, node);
+ for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
+ if (qgroup->rsv.values[i]) {
+ ret = true;
+ btrfs_warn(fs_info,
+ "qgroup %llu/%llu has unreleased space, type %d rsv %llu",
+ btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid),
+ i, qgroup->rsv.values[i]);
+ }
+ }
+ }
+ return ret;
+}
+
/*
* This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
* first two are in single-threaded paths.And for the third one, we have set
@@ -1075,6 +1118,21 @@ out_add_root:
fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
+ } else {
+ /*
+ * We have set both BTRFS_FS_QUOTA_ENABLED and
+ * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with
+ * -EINPROGRESS. That can happen because someone started the
+ * rescan worker by calling quota rescan ioctl before we
+ * attempted to initialize the rescan worker. Failure due to
+ * quotas disabled in the meanwhile is not possible, because
+ * we are holding a write lock on fs_info->subvol_sem, which
+ * is also acquired when disabling quotas.
+ * Ignore such error, and any other error would need to undo
+ * everything we did in the transaction we just committed.
+ */
+ ASSERT(ret == -EINPROGRESS);
+ ret = 0;
}
out_free_path:
@@ -1106,12 +1164,23 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
int ret = 0;
/*
- * We need to have subvol_sem write locked, to prevent races between
- * concurrent tasks trying to disable quotas, because we will unlock
- * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
+ * We need to have subvol_sem write locked to prevent races with
+ * snapshot creation.
*/
lockdep_assert_held_write(&fs_info->subvol_sem);
+ /*
+ * Lock the cleaner mutex to prevent races with concurrent relocation,
+ * because relocation may be building backrefs for blocks of the quota
+ * root while we are deleting the root. This is like dropping fs roots
+ * of deleted snapshots/subvolumes, we need the same protection.
+ *
+ * This also prevents races between concurrent tasks trying to disable
+ * quotas, because we will unlock and relock qgroup_ioctl_lock across
+ * BTRFS_FS_QUOTA_ENABLED changes.
+ */
+ mutex_lock(&fs_info->cleaner_mutex);
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
goto out;
@@ -1174,7 +1243,9 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
goto out;
}
+ spin_lock(&fs_info->trans_lock);
list_del(&quota_root->dirty_list);
+ spin_unlock(&fs_info->trans_lock);
btrfs_tree_lock(quota_root->node);
btrfs_clean_tree_block(quota_root->node);
@@ -1191,6 +1262,7 @@ out:
btrfs_end_transaction(trans);
else if (trans)
ret = btrfs_end_transaction(trans);
+ mutex_unlock(&fs_info->cleaner_mutex);
return ret;
}
@@ -1323,7 +1395,6 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
u64 dst)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root;
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
@@ -1339,9 +1410,8 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
return -ENOMEM;
mutex_lock(&fs_info->qgroup_ioctl_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root) {
- ret = -EINVAL;
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
goto out;
}
member = find_qgroup_rb(fs_info, src);
@@ -1387,7 +1457,6 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
u64 dst)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root;
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
@@ -1400,9 +1469,8 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
if (!tmp)
return -ENOMEM;
- quota_root = fs_info->quota_root;
- if (!quota_root) {
- ret = -EINVAL;
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
goto out;
}
@@ -1467,11 +1535,11 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
int ret = 0;
mutex_lock(&fs_info->qgroup_ioctl_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root) {
- ret = -EINVAL;
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
goto out;
}
+ quota_root = fs_info->quota_root;
qgroup = find_qgroup_rb(fs_info, qgroupid);
if (qgroup) {
ret = -EEXIST;
@@ -1493,18 +1561,25 @@ out:
return ret;
}
+static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
+{
+ return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 ||
+ qgroup->excl > 0 || qgroup->excl_cmpr > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0);
+}
+
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
struct btrfs_qgroup_list *list;
int ret = 0;
mutex_lock(&fs_info->qgroup_ioctl_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root) {
- ret = -EINVAL;
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
goto out;
}
@@ -1514,6 +1589,11 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
goto out;
}
+ if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
/* Check if there are no children of this qgroup */
if (!list_empty(&qgroup->members)) {
ret = -EBUSY;
@@ -1545,7 +1625,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
struct btrfs_qgroup_limit *limit)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
int ret = 0;
/* Sometimes we would want to clear the limit on this qgroup.
@@ -1555,9 +1634,8 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
const u64 CLEAR_VALUE = -1;
mutex_lock(&fs_info->qgroup_ioctl_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root) {
- ret = -EINVAL;
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
goto out;
}
@@ -2657,15 +2735,23 @@ cleanup:
}
/*
- * called from commit_transaction. Writes all changed qgroups to disk.
+ * Writes all changed qgroups to disk.
+ * Called by the transaction commit path and the qgroup assign ioctl.
*/
int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root = fs_info->quota_root;
int ret = 0;
- if (!quota_root)
+ /*
+ * In case we are called from the qgroup assign ioctl, assert that we
+ * are holding the qgroup_ioctl_lock, otherwise we can race with a quota
+ * disable operation (ioctl) and access a freed quota root.
+ */
+ if (trans->transaction->state != TRANS_STATE_COMMIT_DOING)
+ lockdep_assert_held(&fs_info->qgroup_ioctl_lock);
+
+ if (!fs_info->quota_root)
return ret;
spin_lock(&fs_info->qgroup_lock);
@@ -2809,14 +2895,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
dstgroup->rsv_excl = inherit->lim.rsv_excl;
- ret = update_qgroup_limit_item(trans, dstgroup);
- if (ret) {
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- btrfs_info(fs_info,
- "unable to update quota limit for %llu",
- dstgroup->qgroupid);
- goto unlock;
- }
+ qgroup_dirty(fs_info, dstgroup);
}
if (srcid) {
@@ -2935,7 +3014,6 @@ static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
enum btrfs_qgroup_rsv_type type)
{
- struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 ref_root = root->root_key.objectid;
@@ -2954,8 +3032,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
enforce = false;
spin_lock(&fs_info->qgroup_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root)
+ if (!fs_info->quota_root)
goto out;
qgroup = find_qgroup_rb(fs_info, ref_root);
@@ -3022,7 +3099,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes,
enum btrfs_qgroup_rsv_type type)
{
- struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -3040,8 +3116,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
}
spin_lock(&fs_info->qgroup_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root)
+ if (!fs_info->quota_root)
goto out;
qgroup = find_qgroup_rb(fs_info, ref_root);
@@ -3188,7 +3263,8 @@ out:
static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
{
return btrfs_fs_closing(fs_info) ||
- test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+ test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
+ !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
}
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
@@ -3200,6 +3276,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
int err = -ENOMEM;
int ret = 0;
bool stopped = false;
+ bool did_leaf_rescans = false;
path = btrfs_alloc_path();
if (!path)
@@ -3218,11 +3295,10 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
err = PTR_ERR(trans);
break;
}
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
- err = -EINTR;
- } else {
- err = qgroup_rescan_leaf(trans, path);
- }
+
+ err = qgroup_rescan_leaf(trans, path);
+ did_leaf_rescans = true;
+
if (err > 0)
btrfs_commit_transaction(trans);
else
@@ -3236,22 +3312,29 @@ out:
if (err > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- } else if (err < 0) {
+ } else if (err < 0 || stopped) {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
}
mutex_unlock(&fs_info->qgroup_rescan_lock);
/*
- * only update status, since the previous part has already updated the
- * qgroup info.
+ * Only update status, since the previous part has already updated the
+ * qgroup info, and only if we did any actual work. This also prevents
+ * race with a concurrent quota disable, which has already set
+ * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at
+ * btrfs_quota_disable().
*/
- trans = btrfs_start_transaction(fs_info->quota_root, 1);
- if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ if (did_leaf_rescans) {
+ trans = btrfs_start_transaction(fs_info->quota_root, 1);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ trans = NULL;
+ btrfs_err(fs_info,
+ "fail to start transaction for status update: %d",
+ err);
+ }
+ } else {
trans = NULL;
- btrfs_err(fs_info,
- "fail to start transaction for status update: %d",
- err);
}
mutex_lock(&fs_info->qgroup_rescan_lock);
@@ -3924,7 +4007,6 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
int num_bytes)
{
- struct btrfs_root *quota_root = fs_info->quota_root;
struct btrfs_qgroup *qgroup;
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -3932,7 +4014,7 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
if (num_bytes == 0)
return;
- if (!quota_root)
+ if (!fs_info->quota_root)
return;
spin_lock(&fs_info->qgroup_lock);
@@ -3979,6 +4061,8 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
BTRFS_QGROUP_RSV_META_PREALLOC);
trace_qgroup_meta_convert(root, num_bytes);
qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
+ if (!sb_rdonly(fs_info->sb))
+ add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS);
}
/*
@@ -4267,4 +4351,5 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
ulist_free(entry->old_roots);
kfree(entry);
}
+ *root = RB_ROOT;
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 0a2659685ad6..94bdfb89505e 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -416,5 +416,6 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
+bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 7ac679ed2b6c..226a17a335da 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -334,6 +334,9 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
{
bio_list_merge(&dest->bio_list, &victim->bio_list);
dest->bio_list_bytes += victim->bio_list_bytes;
+ /* Also inherit the bitmaps from @victim. */
+ bitmap_or(dest->dbitmap, victim->dbitmap, dest->dbitmap,
+ dest->stripe_npages);
dest->generic_bio_cnt += victim->generic_bio_cnt;
bio_list_init(&victim->bio_list);
}
@@ -878,6 +881,12 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
if (rbio->generic_bio_cnt)
btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
+ /*
+ * Clear the data bitmap, as the rbio may be cached for later usage.
+ * do this before before unlock_stripe() so there will be no new bio
+ * for this bio.
+ */
+ bitmap_clear(rbio->dbitmap, 0, rbio->stripe_npages);
/*
* At this moment, rbio->bio_list is empty, however since rbio does not
@@ -1212,6 +1221,9 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
else
BUG();
+ /* We should have at least one data sector. */
+ ASSERT(bitmap_weight(rbio->dbitmap, rbio->stripe_npages));
+
/* at this point we either have a full stripe,
* or we've read the full stripe from the drive.
* recalculate the parity and write the new results.
@@ -1285,6 +1297,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *page;
+
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(pagenr, rbio->dbitmap))
+ continue;
+
if (stripe < rbio->nr_data) {
page = page_in_rbio(rbio, stripe, pagenr, 1);
if (!page)
@@ -1309,6 +1326,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *page;
+
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(pagenr, rbio->dbitmap))
+ continue;
+
if (stripe < rbio->nr_data) {
page = page_in_rbio(rbio, stripe, pagenr, 1);
if (!page)
@@ -1748,6 +1770,33 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
run_plug(plug);
}
+/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
+static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
+{
+ const struct btrfs_fs_info *fs_info = rbio->fs_info;
+ const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ const u64 full_stripe_start = rbio->bbio->raid_map[0];
+ const u32 orig_len = orig_bio->bi_iter.bi_size;
+ const u32 sectorsize = fs_info->sectorsize;
+ u64 cur_logical;
+
+ ASSERT(orig_logical >= full_stripe_start &&
+ orig_logical + orig_len <= full_stripe_start +
+ rbio->nr_data * rbio->stripe_len);
+
+ bio_list_add(&rbio->bio_list, orig_bio);
+ rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
+
+ /* Update the dbitmap. */
+ for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
+ cur_logical += sectorsize) {
+ int bit = ((u32)(cur_logical - full_stripe_start) >>
+ PAGE_SHIFT) % rbio->stripe_npages;
+
+ set_bit(bit, rbio->dbitmap);
+ }
+}
+
/*
* our main entry point for writes from the rest of the FS.
*/
@@ -1764,9 +1813,8 @@ int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
btrfs_put_bbio(bbio);
return PTR_ERR(rbio);
}
- bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_iter.bi_size;
rbio->operation = BTRFS_RBIO_WRITE;
+ rbio_add_bio(rbio, bio);
btrfs_bio_counter_inc_noblocked(fs_info);
rbio->generic_bio_cnt = 1;
@@ -2068,9 +2116,12 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
atomic_set(&rbio->error, 0);
/*
- * read everything that hasn't failed. Thanks to the
- * stripe cache, it is possible that some or all of these
- * pages are going to be uptodate.
+ * Read everything that hasn't failed. However this time we will
+ * not trust any cached sector.
+ * As we may read out some stale data but higher layer is not reading
+ * that stale part.
+ *
+ * So here we always re-read everything in recovery path.
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
if (rbio->faila == stripe || rbio->failb == stripe) {
@@ -2079,16 +2130,6 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
}
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *p;
-
- /*
- * the rmw code may have already read this
- * page in
- */
- p = rbio_stripe_page(rbio, stripe, pagenr);
- if (PageUptodate(p))
- continue;
-
ret = rbio_add_io_page(rbio, &bio_list,
rbio_stripe_page(rbio, stripe, pagenr),
stripe, pagenr, rbio->stripe_len);
@@ -2170,8 +2211,7 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
}
rbio->operation = BTRFS_RBIO_READ_REBUILD;
- bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_iter.bi_size;
+ rbio_add_bio(rbio, bio);
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
index a97dc74a4d3d..02f15321cecc 100644
--- a/fs/btrfs/rcu-string.h
+++ b/fs/btrfs/rcu-string.h
@@ -18,7 +18,11 @@ static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask)
(len * sizeof(char)), mask);
if (!ret)
return ret;
- strncpy(ret->str, src, len);
+ /* Warn if the source got unexpectedly truncated. */
+ if (WARN_ON(strscpy(ret->str, src, len) < 0)) {
+ kfree(ret);
+ return NULL;
+ }
return ret;
}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index bbd63535965c..d59e89ef3251 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -888,8 +888,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
out_unlock:
spin_unlock(&fs_info->ref_verify_lock);
out:
- if (ret)
+ if (ret) {
+ btrfs_free_ref_cache(fs_info);
btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+ }
return ret;
}
@@ -1018,8 +1020,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
}
}
if (ret) {
- btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
btrfs_free_ref_cache(fs_info);
+ btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
}
btrfs_free_path(path);
return ret;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ba68b0b41dff..e603cc8c141e 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2511,7 +2511,7 @@ again:
list_splice(&reloc_roots, &rc->reloc_roots);
if (!err)
- btrfs_commit_transaction(trans);
+ err = btrfs_commit_transaction(trans);
else
btrfs_end_transaction(trans);
return err;
@@ -4102,8 +4102,12 @@ int prepare_to_relocate(struct reloc_control *rc)
*/
return PTR_ERR(trans);
}
- btrfs_commit_transaction(trans);
- return 0;
+
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ unset_reloc_control(rc);
+
+ return ret;
}
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
@@ -4263,7 +4267,9 @@ restart:
err = PTR_ERR(trans);
goto out_free;
}
- btrfs_commit_transaction(trans);
+ ret = btrfs_commit_transaction(trans);
+ if (ret && !err)
+ err = ret;
out_free:
ret = clean_dirty_subvols(rc);
if (ret < 0 && !err)
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 0d07ebe511e7..ba4e198811a4 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -371,9 +371,10 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
key.offset = ref_id;
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- if (ret < 0)
+ if (ret < 0) {
+ err = ret;
goto out;
- if (ret == 0) {
+ } else if (ret == 0) {
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_root_ref);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index e5db948daa12..45809f75692e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3849,6 +3849,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
int ret;
struct btrfs_device *dev;
unsigned int nofs_flag;
+ bool need_commit = false;
if (btrfs_fs_closing(fs_info))
return -EAGAIN;
@@ -3961,6 +3962,12 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
*/
nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
+ u64 old_super_errors;
+
+ spin_lock(&sctx->stat_lock);
+ old_super_errors = sctx->stat.super_errors;
+ spin_unlock(&sctx->stat_lock);
+
btrfs_info(fs_info, "scrub: started on devid %llu", devid);
/*
* by holding device list mutex, we can
@@ -3969,6 +3976,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_lock(&fs_info->fs_devices->device_list_mutex);
ret = scrub_supers(sctx, dev);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ spin_lock(&sctx->stat_lock);
+ /*
+ * Super block errors found, but we can not commit transaction
+ * at current context, since btrfs_commit_transaction() needs
+ * to pause the current running scrub (hold by ourselves).
+ */
+ if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
+ need_commit = true;
+ spin_unlock(&sctx->stat_lock);
}
if (!ret)
@@ -3995,6 +4012,25 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
scrub_workers_put(fs_info);
scrub_put_ctx(sctx);
+ /*
+ * We found some super block errors before, now try to force a
+ * transaction commit, as scrub has finished.
+ */
+ if (need_commit) {
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_err(fs_info,
+ "scrub: failed to start transaction to fix super block errors: %d", ret);
+ return ret;
+ }
+ ret = btrfs_commit_transaction(trans);
+ if (ret < 0)
+ btrfs_err(fs_info,
+ "scrub: failed to commit transaction to fix super block errors: %d", ret);
+ }
return ret;
out:
scrub_workers_put(fs_info);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e258fc484cea..576c027909f8 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -973,7 +973,15 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
ret = PTR_ERR(start);
goto out;
}
- BUG_ON(start < p->buf);
+ if (unlikely(start < p->buf)) {
+ btrfs_err(root->fs_info,
+ "send: path ref buffer underflow for key (%llu %u %llu)",
+ found_key->objectid,
+ found_key->type,
+ found_key->offset);
+ ret = -EINVAL;
+ goto out;
+ }
}
p->start = start;
} else {
@@ -5405,6 +5413,7 @@ static int clone_range(struct send_ctx *sctx,
u64 ext_len;
u64 clone_len;
u64 clone_data_offset;
+ bool crossed_src_i_size = false;
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(clone_root->root, path);
@@ -5461,8 +5470,10 @@ static int clone_range(struct send_ctx *sctx,
if (key.offset >= clone_src_i_size)
break;
- if (key.offset + ext_len > clone_src_i_size)
+ if (key.offset + ext_len > clone_src_i_size) {
ext_len = clone_src_i_size - key.offset;
+ crossed_src_i_size = true;
+ }
clone_data_offset = btrfs_file_extent_offset(leaf, ei);
if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
@@ -5522,6 +5533,25 @@ static int clone_range(struct send_ctx *sctx,
ret = send_clone(sctx, offset, clone_len,
clone_root);
}
+ } else if (crossed_src_i_size && clone_len < len) {
+ /*
+ * If we are at i_size of the clone source inode and we
+ * can not clone from it, terminate the loop. This is
+ * to avoid sending two write operations, one with a
+ * length matching clone_len and the final one after
+ * this loop with a length of len - clone_len.
+ *
+ * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
+ * was passed to the send ioctl), this helps avoid
+ * sending an encoded write for an offset that is not
+ * sector size aligned, in case the i_size of the source
+ * inode is not sector size aligned. That will make the
+ * receiver fallback to decompression of the data and
+ * writing it using regular buffered IO, therefore while
+ * not incorrect, it's not optimal due decompression and
+ * possible re-compression at the receiver.
+ */
+ break;
} else {
ret = send_extent_data(sctx, offset, clone_len);
}
@@ -7325,10 +7355,10 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
/*
* Check that we don't overflow at later allocations, we request
* clone_sources_count + 1 items, and compare to unsigned long inside
- * access_ok.
+ * access_ok. Also set an upper limit for allocation size so this can't
+ * easily exhaust memory. Max number of clone sources is about 200K.
*/
- if (arg->clone_sources_count >
- ULONG_MAX / sizeof(struct clone_root) - 1) {
+ if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) {
ret = -EINVAL;
goto out;
}
@@ -7341,7 +7371,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
}
if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
- ret = -EINVAL;
+ ret = -EOPNOTSUPP;
goto out;
}
@@ -7359,7 +7389,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
sctx->flags = arg->flags;
sctx->send_filp = fget(arg->send_fd);
- if (!sctx->send_filp) {
+ if (!sctx->send_filp || !(sctx->send_filp->f_mode & FMODE_WRITE)) {
ret = -EBADF;
goto out;
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1a69bdb96fb2..ea8b5b2d859d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1567,8 +1567,6 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
} else {
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
btrfs_sb(s)->bdev_holder = fs_type;
- if (!strstr(crc32c_impl(), "generic"))
- set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
error = btrfs_fill_super(s, fs_devices, data);
}
if (!error)
@@ -2137,7 +2135,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
* calculated f_bavail.
*/
if (!mixed && block_rsv->space_info->full &&
- total_free_meta - thresh < block_rsv->size)
+ (total_free_meta < thresh || total_free_meta - thresh < block_rsv->size))
buf->f_bavail = 0;
buf->f_type = BTRFS_SUPER_MAGIC;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 5c299e1f2297..356f2274c8eb 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1154,8 +1154,11 @@ int __init btrfs_init_sysfs(void)
#ifdef CONFIG_BTRFS_DEBUG
ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group);
- if (ret)
- goto out2;
+ if (ret) {
+ sysfs_unmerge_group(&btrfs_kset->kobj,
+ &btrfs_static_feature_attr_group);
+ goto out_remove_group;
+ }
#endif
return 0;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 98f9684e7ffc..8e413c82190e 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -189,7 +189,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
void btrfs_free_dummy_root(struct btrfs_root *root)
{
- if (!root)
+ if (IS_ERR_OR_NULL(root))
return;
/* Will be freed by btrfs_free_fs_roots */
if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index ac035a6fa003..f6169bece7c0 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -230,21 +230,21 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FS_TREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -256,31 +256,33 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
return ret;
}
+ /* btrfs_qgroup_account_extent() always frees the ulists passed to it. */
+ old_roots = NULL;
+ new_roots = NULL;
+
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
nodesize, nodesize)) {
test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
- old_roots = NULL;
- new_roots = NULL;
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = remove_extent_item(root, nodesize, nodesize);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return -EINVAL;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -331,21 +333,21 @@ static int test_multiple_refs(struct btrfs_root *root,
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FS_TREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -366,21 +368,21 @@ static int test_multiple_refs(struct btrfs_root *root,
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = add_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FIRST_FREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -407,21 +409,21 @@ static int test_multiple_refs(struct btrfs_root *root,
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = remove_extent_ref(root, nodesize, nodesize, 0,
BTRFS_FIRST_FREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e6cb95b81787..89ffc0255406 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -190,10 +190,11 @@ loop:
spin_unlock(&fs_info->trans_lock);
/*
- * If we are ATTACH, we just want to catch the current transaction,
- * and commit it. If there is no transaction, just return ENOENT.
+ * If we are ATTACH or TRANS_JOIN_NOSTART, we just want to catch the
+ * current transaction, and commit it. If there is no transaction, just
+ * return ENOENT.
*/
- if (type == TRANS_ATTACH)
+ if (type == TRANS_ATTACH || type == TRANS_JOIN_NOSTART)
return -ENOENT;
/*
@@ -706,8 +707,13 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
trans = start_transaction(root, 0, TRANS_ATTACH,
BTRFS_RESERVE_NO_FLUSH, true);
- if (trans == ERR_PTR(-ENOENT))
- btrfs_wait_for_commit(root->fs_info, 0);
+ if (trans == ERR_PTR(-ENOENT)) {
+ int ret;
+
+ ret = btrfs_wait_for_commit(root->fs_info, 0);
+ if (ret)
+ return ERR_PTR(ret);
+ }
return trans;
}
@@ -771,6 +777,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
}
wait_for_commit(cur_trans);
+ ret = cur_trans->aborted;
btrfs_put_transaction(cur_trans);
out:
return ret;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 368c43c6cbd0..597362eaf300 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1019,7 +1019,8 @@ static void extent_err(const struct extent_buffer *eb, int slot,
}
static int check_extent_item(struct extent_buffer *leaf,
- struct btrfs_key *key, int slot)
+ struct btrfs_key *key, int slot,
+ struct btrfs_key *prev_key)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_extent_item *ei;
@@ -1164,7 +1165,7 @@ static int check_extent_item(struct extent_buffer *leaf,
if (ptr + btrfs_extent_inline_ref_size(inline_type) > end) {
extent_err(leaf, slot,
"inline ref item overflows extent item, ptr %lu iref size %u end %lu",
- ptr, inline_type, end);
+ ptr, btrfs_extent_inline_ref_size(inline_type), end);
return -EUCLEAN;
}
@@ -1230,6 +1231,26 @@ static int check_extent_item(struct extent_buffer *leaf,
total_refs, inline_refs);
return -EUCLEAN;
}
+
+ if ((prev_key->type == BTRFS_EXTENT_ITEM_KEY) ||
+ (prev_key->type == BTRFS_METADATA_ITEM_KEY)) {
+ u64 prev_end = prev_key->objectid;
+
+ if (prev_key->type == BTRFS_METADATA_ITEM_KEY)
+ prev_end += fs_info->nodesize;
+ else
+ prev_end += prev_key->offset;
+
+ if (unlikely(prev_end > key->objectid)) {
+ extent_err(leaf, slot,
+ "previous extent [%llu %u %llu] overlaps current extent [%llu %u %llu]",
+ prev_key->objectid, prev_key->type,
+ prev_key->offset, key->objectid, key->type,
+ key->offset);
+ return -EUCLEAN;
+ }
+ }
+
return 0;
}
@@ -1343,7 +1364,7 @@ static int check_leaf_item(struct extent_buffer *leaf,
break;
case BTRFS_EXTENT_ITEM_KEY:
case BTRFS_METADATA_ITEM_KEY:
- ret = check_extent_item(leaf, key, slot);
+ ret = check_extent_item(leaf, key, slot, prev_key);
break;
case BTRFS_TREE_BLOCK_REF_KEY:
case BTRFS_SHARED_DATA_REF_KEY:
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9b703c0db979..f75333d7b78a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -918,8 +918,7 @@ static noinline int inode_in_dir(struct btrfs_root *root,
di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
index, name, name_len, 0);
if (IS_ERR(di)) {
- if (PTR_ERR(di) != -ENOENT)
- ret = PTR_ERR(di);
+ ret = PTR_ERR(di);
goto out;
} else if (di) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
@@ -1100,7 +1099,9 @@ again:
extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
inode_objectid, parent_objectid, 0,
0);
- if (!IS_ERR_OR_NULL(extref)) {
+ if (IS_ERR(extref)) {
+ return PTR_ERR(extref);
+ } else if (extref) {
u32 item_size;
u32 cur_offset = 0;
unsigned long base;
@@ -1169,8 +1170,7 @@ next:
di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
ref_index, name, namelen, 0);
if (IS_ERR(di)) {
- if (PTR_ERR(di) != -ENOENT)
- return PTR_ERR(di);
+ return PTR_ERR(di);
} else if (di) {
ret = drop_one_dir_item(trans, root, path, dir, di);
if (ret)
@@ -2020,9 +2020,6 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
goto out;
}
- if (dst_di == ERR_PTR(-ENOENT))
- dst_di = NULL;
-
if (IS_ERR(dst_di)) {
ret = PTR_ERR(dst_di);
goto out;
@@ -2307,7 +2304,7 @@ again:
dir_key->offset,
name, name_len, 0);
}
- if (!log_di || log_di == ERR_PTR(-ENOENT)) {
+ if (!log_di) {
btrfs_dir_item_key_to_cpu(eb, di, &location);
btrfs_release_path(path);
btrfs_release_path(log_path);
@@ -3520,8 +3517,7 @@ out_unlock:
if (err == -ENOSPC) {
btrfs_set_log_full_commit(trans);
err = 0;
- } else if (err < 0 && err != -ENOENT) {
- /* ENOENT can be returned if the entry hasn't been fsynced yet */
+ } else if (err < 0) {
btrfs_abort_transaction(trans, err);
}
@@ -4287,7 +4283,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
int slot;
int ins_nr = 0;
- int start_slot;
+ int start_slot = 0;
int ret;
if (!(inode->flags & BTRFS_INODE_PREALLOC))
@@ -5295,6 +5291,18 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
}
/*
+ * For symlinks, we must always log their content, which is stored in an
+ * inline extent, otherwise we could end up with an empty symlink after
+ * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
+ * one attempts to create an empty symlink).
+ * We don't need to worry about flushing delalloc, because when we create
+ * the inline extent when the symlink is created (we never have delalloc
+ * for symlinks).
+ */
+ if (S_ISLNK(inode->vfs_inode.i_mode))
+ inode_only = LOG_INODE_ALL;
+
+ /*
* a brute force approach to making sure we get the most uptodate
* copies of everything.
*/
@@ -5707,7 +5715,7 @@ process_leaf:
}
ctx->log_new_dentries = false;
- if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
+ if (type == BTRFS_FT_DIR)
log_mode = LOG_INODE_ALL;
ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
log_mode, 0, LLONG_MAX, ctx);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8898682c9103..d7014b2b28d6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -354,6 +354,7 @@ void btrfs_free_device(struct btrfs_device *device)
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device;
+
WARN_ON(fs_devices->opened);
while (!list_empty(&fs_devices->devices)) {
device = list_entry(fs_devices->devices.next,
@@ -713,15 +714,47 @@ static void pending_bios_fn(struct btrfs_work *work)
run_scheduled_bios(device);
}
-static bool device_path_matched(const char *path, struct btrfs_device *device)
+/*
+ * Check if the device in the path matches the device in the given struct device.
+ *
+ * Returns:
+ * true If it is the same device.
+ * false If it is not the same device or on error.
+ */
+static bool device_matched(const struct btrfs_device *device, const char *path)
{
- int found;
+ char *device_name;
+ struct block_device *bdev_old;
+ struct block_device *bdev_new;
+
+ /*
+ * If we are looking for a device with the matching dev_t, then skip
+ * device without a name (a missing device).
+ */
+ if (!device->name)
+ return false;
+
+ device_name = kzalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
+ if (!device_name)
+ return false;
rcu_read_lock();
- found = strcmp(rcu_str_deref(device->name), path);
+ scnprintf(device_name, BTRFS_PATH_NAME_MAX, "%s", rcu_str_deref(device->name));
rcu_read_unlock();
- return found == 0;
+ bdev_old = lookup_bdev(device_name);
+ kfree(device_name);
+ if (IS_ERR(bdev_old))
+ return false;
+
+ bdev_new = lookup_bdev(path);
+ if (IS_ERR(bdev_new))
+ return false;
+
+ if (bdev_old == bdev_new)
+ return true;
+
+ return false;
}
/*
@@ -754,9 +787,7 @@ static int btrfs_free_stale_devices(const char *path,
&fs_devices->devices, dev_list) {
if (skip_device && skip_device == device)
continue;
- if (path && !device->name)
- continue;
- if (path && !device_path_matched(path, device))
+ if (path && !device_matched(device, path))
continue;
if (fs_devices->opened) {
/* for an already deleted device return 0 */
@@ -864,6 +895,14 @@ error_brelse:
return -EINVAL;
}
+u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb)
+{
+ bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) &
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
+
+ return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
+}
+
/*
* Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
* being created with a disk that has already completed its fsid change.
@@ -1371,6 +1410,17 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
if (!fs_devices->opened) {
seed_devices = fs_devices->seed;
fs_devices->seed = NULL;
+
+ /*
+ * If the struct btrfs_fs_devices is not assembled with any
+ * other device, it can be re-initialized during the next mount
+ * without the needing device-scan step. Therefore, it can be
+ * fully freed.
+ */
+ if (fs_devices->num_devices == 1) {
+ list_del(&fs_devices->fs_list);
+ free_fs_devices(fs_devices);
+ }
}
mutex_unlock(&uuid_mutex);
@@ -1537,8 +1587,17 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
bytenr = btrfs_sb_offset(0);
- flags |= FMODE_EXCL;
+ /*
+ * Avoid using flag |= FMODE_EXCL here, as the systemd-udev may
+ * initiate the device scan which may race with the user's mount
+ * or mkfs command, resulting in failure.
+ * Since the device scan is solely for reading purposes, there is
+ * no need for FMODE_EXCL. Additionally, the devices are read again
+ * during the mount process. It is ok to get some inconsistent
+ * values temporarily, as the device paths of the fsid are the only
+ * required information for assembling the volume.
+ */
bdev = blkdev_get_by_path(path, flags, holder);
if (IS_ERR(bdev))
return ERR_CAST(bdev);
@@ -1579,7 +1638,7 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
if (in_range(physical_start, *start, len) ||
in_range(*start, physical_start,
- physical_end - physical_start)) {
+ physical_end + 1 - physical_start)) {
*start = physical_end + 1;
return true;
}
@@ -1671,7 +1730,7 @@ again:
goto out;
}
- while (1) {
+ while (search_start < search_end) {
l = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(l)) {
@@ -1694,6 +1753,9 @@ again:
if (key.type != BTRFS_DEV_EXTENT_KEY)
goto next;
+ if (key.offset > search_end)
+ break;
+
if (key.offset > search_start) {
hole_size = key.offset - search_start;
@@ -1764,6 +1826,7 @@ next:
else
ret = 0;
+ ASSERT(max_hole_start + max_hole_size <= search_end);
out:
btrfs_free_path(path);
*start = max_hole_start;
@@ -3027,15 +3090,16 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
read_unlock(&em_tree->lock);
if (!em) {
- btrfs_crit(fs_info, "unable to find logical %llu length %llu",
+ btrfs_crit(fs_info,
+ "unable to find chunk map for logical %llu length %llu",
logical, length);
return ERR_PTR(-EINVAL);
}
- if (em->start > logical || em->start + em->len < logical) {
+ if (em->start > logical || em->start + em->len <= logical) {
btrfs_crit(fs_info,
- "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
- logical, length, em->start, em->start + em->len);
+ "found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
+ logical, logical + length, em->start, em->start + em->len);
free_extent_map(em);
return ERR_PTR(-EINVAL);
}
@@ -3204,7 +3268,17 @@ again:
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto error;
}
- BUG_ON(ret == 0); /* Corruption */
+ if (ret == 0) {
+ /*
+ * On the first search we would find chunk tree with
+ * offset -1, which is not possible. On subsequent
+ * loops this would find an existing item on an invalid
+ * offset (one less than the previous one, wrong
+ * alignment and size).
+ */
+ ret = -EUCLEAN;
+ goto error;
+ }
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
@@ -4503,8 +4577,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
}
}
- BUG_ON(fs_info->balance_ctl ||
- test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
+ ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
atomic_dec(&fs_info->balance_cancel_req);
mutex_unlock(&fs_info->balance_mutex);
return 0;
@@ -7383,12 +7456,12 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* do another round of validation checks.
*/
if (total_dev != fs_info->fs_devices->total_devices) {
- btrfs_err(fs_info,
- "super_num_devices %llu mismatch with num_devices %llu found here",
+ btrfs_warn(fs_info,
+"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
btrfs_super_num_devices(fs_info->super_copy),
total_dev);
- ret = -EINVAL;
- goto error;
+ fs_info->fs_devices->total_devices = total_dev;
+ btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
}
if (btrfs_super_total_bytes(fs_info->super_copy) <
fs_info->fs_devices->total_rw_bytes) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index aa6a6d7b2978..762c0a375498 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -581,4 +581,7 @@ int btrfs_bg_type_to_factor(u64 flags);
const char *btrfs_bg_type_to_raid_name(u64 flags);
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
+u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb);
+
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 48858510739b..cd7ddf24157a 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -387,6 +387,9 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
const char *name, const void *buffer,
size_t size, int flags)
{
+ if (btrfs_root_readonly(BTRFS_I(inode)->root))
+ return -EROFS;
+
name = xattr_full_name(handler, name);
return btrfs_setxattr_trans(inode, name, buffer, size, flags);
}
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index df1aace5df50..9385d0bb276d 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -74,7 +74,7 @@ static struct list_head *zlib_alloc_workspace(unsigned int level)
workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
zlib_inflate_workspacesize());
- workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL);
+ workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL);
workspace->level = level;
workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!workspace->strm.workspace || !workspace->buf)