aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/accessors.c98
-rw-r--r--fs/btrfs/accessors.h4
-rw-r--r--fs/btrfs/bio.c17
-rw-r--r--fs/btrfs/bio.h4
-rw-r--r--fs/btrfs/block-group.c249
-rw-r--r--fs/btrfs/block-group.h13
-rw-r--r--fs/btrfs/block-rsv.c2
-rw-r--r--fs/btrfs/block-rsv.h32
-rw-r--r--fs/btrfs/btrfs_inode.h10
-rw-r--r--fs/btrfs/compression.c162
-rw-r--r--fs/btrfs/compression.h9
-rw-r--r--fs/btrfs/ctree.c63
-rw-r--r--fs/btrfs/ctree.h17
-rw-r--r--fs/btrfs/defrag.c15
-rw-r--r--fs/btrfs/delalloc-space.c29
-rw-r--r--fs/btrfs/delayed-inode.c109
-rw-r--r--fs/btrfs/dev-replace.c52
-rw-r--r--fs/btrfs/disk-io.c184
-rw-r--r--fs/btrfs/disk-io.h5
-rw-r--r--fs/btrfs/extent-io-tree.c119
-rw-r--r--fs/btrfs/extent-io-tree.h18
-rw-r--r--fs/btrfs/extent-tree.c157
-rw-r--r--fs/btrfs/extent_io.c1273
-rw-r--r--fs/btrfs/extent_io.h80
-rw-r--r--fs/btrfs/extent_map.c195
-rw-r--r--fs/btrfs/extent_map.h77
-rw-r--r--fs/btrfs/file-item.c15
-rw-r--r--fs/btrfs/file.c27
-rw-r--r--fs/btrfs/free-space-cache.c4
-rw-r--r--fs/btrfs/fs.h18
-rw-r--r--fs/btrfs/inode.c225
-rw-r--r--fs/btrfs/ioctl.c26
-rw-r--r--fs/btrfs/lru_cache.c2
-rw-r--r--fs/btrfs/lzo.c38
-rw-r--r--fs/btrfs/messages.c2
-rw-r--r--fs/btrfs/messages.h2
-rw-r--r--fs/btrfs/ordered-data.c5
-rw-r--r--fs/btrfs/ordered-data.h7
-rw-r--r--fs/btrfs/qgroup.c16
-rw-r--r--fs/btrfs/raid56.c7
-rw-r--r--fs/btrfs/raid56.h2
-rw-r--r--fs/btrfs/ref-verify.c6
-rw-r--r--fs/btrfs/reflink.c6
-rw-r--r--fs/btrfs/relocation.c7
-rw-r--r--fs/btrfs/scrub.c99
-rw-r--r--fs/btrfs/send.c23
-rw-r--r--fs/btrfs/space-info.c26
-rw-r--r--fs/btrfs/subpage.c374
-rw-r--r--fs/btrfs/subpage.h82
-rw-r--r--fs/btrfs/super.c2299
-rw-r--r--fs/btrfs/super.h5
-rw-r--r--fs/btrfs/sysfs.c4
-rw-r--r--fs/btrfs/tests/btrfs-tests.c5
-rw-r--r--fs/btrfs/tests/btrfs-tests.h1
-rw-r--r--fs/btrfs/tests/extent-io-tests.c4
-rw-r--r--fs/btrfs/tests/extent-map-tests.c143
-rw-r--r--fs/btrfs/tests/inode-tests.c60
-rw-r--r--fs/btrfs/transaction.c40
-rw-r--r--fs/btrfs/tree-checker.c2
-rw-r--r--fs/btrfs/tree-checker.h2
-rw-r--r--fs/btrfs/tree-log.c17
-rw-r--r--fs/btrfs/volumes.c932
-rw-r--r--fs/btrfs/volumes.h47
-rw-r--r--fs/btrfs/xattr.c55
-rw-r--r--fs/btrfs/zlib.c79
-rw-r--r--fs/btrfs/zoned.c107
-rw-r--r--fs/btrfs/zoned.h14
-rw-r--r--fs/btrfs/zstd.c7
68 files changed, 4427 insertions, 3408 deletions
diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c
index 206cf1612c1d..1925a0919ca6 100644
--- a/fs/btrfs/accessors.c
+++ b/fs/btrfs/accessors.c
@@ -27,7 +27,7 @@ static bool check_setget_bounds(const struct extent_buffer *eb,
void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb)
{
token->eb = eb;
- token->kaddr = page_address(eb->pages[0]);
+ token->kaddr = folio_address(eb->folios[0]);
token->offset = 0;
}
@@ -50,7 +50,7 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e
* an offset into the extent buffer page array, cast to a specific type. This
* gives us all the type checking.
*
- * The extent buffer pages stored in the array pages do not form a contiguous
+ * The extent buffer pages stored in the array folios may not form a contiguous
* phyusical range, but the API functions assume the linear offset to the range
* from 0 to metadata node size.
*/
@@ -60,28 +60,30 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
const void *ptr, unsigned long off) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = get_eb_page_index(member_offset); \
- const unsigned long oip = get_eb_offset_in_page(token->eb, \
- member_offset); \
+ const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
+ const unsigned long oil = get_eb_offset_in_folio(token->eb, \
+ member_offset);\
+ const int unit_size = folio_size(token->eb->folios[0]); \
+ const int unit_shift = folio_shift(token->eb->folios[0]); \
const int size = sizeof(u##bits); \
u8 lebytes[sizeof(u##bits)]; \
- const int part = PAGE_SIZE - oip; \
+ const int part = unit_size - oil; \
\
ASSERT(token); \
ASSERT(token->kaddr); \
ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
if (token->offset <= member_offset && \
- member_offset + size <= token->offset + PAGE_SIZE) { \
- return get_unaligned_le##bits(token->kaddr + oip); \
+ member_offset + size <= token->offset + unit_size) { \
+ return get_unaligned_le##bits(token->kaddr + oil); \
} \
- token->kaddr = page_address(token->eb->pages[idx]); \
- token->offset = idx << PAGE_SHIFT; \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE ) \
- return get_unaligned_le##bits(token->kaddr + oip); \
+ token->kaddr = folio_address(token->eb->folios[idx]); \
+ token->offset = idx << unit_shift; \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
+ return get_unaligned_le##bits(token->kaddr + oil); \
\
- memcpy(lebytes, token->kaddr + oip, part); \
- token->kaddr = page_address(token->eb->pages[idx + 1]); \
- token->offset = (idx + 1) << PAGE_SHIFT; \
+ memcpy(lebytes, token->kaddr + oil, part); \
+ token->kaddr = folio_address(token->eb->folios[idx + 1]); \
+ token->offset = (idx + 1) << unit_shift; \
memcpy(lebytes + part, token->kaddr, size - part); \
return get_unaligned_le##bits(lebytes); \
} \
@@ -89,19 +91,21 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
const void *ptr, unsigned long off) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
- const unsigned long idx = get_eb_page_index(member_offset); \
- char *kaddr = page_address(eb->pages[idx]); \
+ const unsigned long idx = get_eb_folio_index(eb, member_offset);\
+ const unsigned long oil = get_eb_offset_in_folio(eb, \
+ member_offset);\
+ const int unit_size = folio_size(eb->folios[0]); \
+ char *kaddr = folio_address(eb->folios[idx]); \
const int size = sizeof(u##bits); \
- const int part = PAGE_SIZE - oip; \
+ const int part = unit_size - oil; \
u8 lebytes[sizeof(u##bits)]; \
\
ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \
- return get_unaligned_le##bits(kaddr + oip); \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
+ return get_unaligned_le##bits(kaddr + oil); \
\
- memcpy(lebytes, kaddr + oip, part); \
- kaddr = page_address(eb->pages[idx + 1]); \
+ memcpy(lebytes, kaddr + oil, part); \
+ kaddr = folio_address(eb->folios[idx + 1]); \
memcpy(lebytes + part, kaddr, size - part); \
return get_unaligned_le##bits(lebytes); \
} \
@@ -110,53 +114,59 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \
u##bits val) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = get_eb_page_index(member_offset); \
- const unsigned long oip = get_eb_offset_in_page(token->eb, \
- member_offset); \
+ const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
+ const unsigned long oil = get_eb_offset_in_folio(token->eb, \
+ member_offset);\
+ const int unit_size = folio_size(token->eb->folios[0]); \
+ const int unit_shift = folio_shift(token->eb->folios[0]); \
const int size = sizeof(u##bits); \
u8 lebytes[sizeof(u##bits)]; \
- const int part = PAGE_SIZE - oip; \
+ const int part = unit_size - oil; \
\
ASSERT(token); \
ASSERT(token->kaddr); \
ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
if (token->offset <= member_offset && \
- member_offset + size <= token->offset + PAGE_SIZE) { \
- put_unaligned_le##bits(val, token->kaddr + oip); \
+ member_offset + size <= token->offset + unit_size) { \
+ put_unaligned_le##bits(val, token->kaddr + oil); \
return; \
} \
- token->kaddr = page_address(token->eb->pages[idx]); \
- token->offset = idx << PAGE_SHIFT; \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
- put_unaligned_le##bits(val, token->kaddr + oip); \
+ token->kaddr = folio_address(token->eb->folios[idx]); \
+ token->offset = idx << unit_shift; \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || \
+ oil + size <= unit_size) { \
+ put_unaligned_le##bits(val, token->kaddr + oil); \
return; \
} \
put_unaligned_le##bits(val, lebytes); \
- memcpy(token->kaddr + oip, lebytes, part); \
- token->kaddr = page_address(token->eb->pages[idx + 1]); \
- token->offset = (idx + 1) << PAGE_SHIFT; \
+ memcpy(token->kaddr + oil, lebytes, part); \
+ token->kaddr = folio_address(token->eb->folios[idx + 1]); \
+ token->offset = (idx + 1) << unit_shift; \
memcpy(token->kaddr, lebytes + part, size - part); \
} \
void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
unsigned long off, u##bits val) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
- const unsigned long idx = get_eb_page_index(member_offset); \
- char *kaddr = page_address(eb->pages[idx]); \
+ const unsigned long idx = get_eb_folio_index(eb, member_offset);\
+ const unsigned long oil = get_eb_offset_in_folio(eb, \
+ member_offset);\
+ const int unit_size = folio_size(eb->folios[0]); \
+ char *kaddr = folio_address(eb->folios[idx]); \
const int size = sizeof(u##bits); \
- const int part = PAGE_SIZE - oip; \
+ const int part = unit_size - oil; \
u8 lebytes[sizeof(u##bits)]; \
\
ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
- put_unaligned_le##bits(val, kaddr + oip); \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || \
+ oil + size <= unit_size) { \
+ put_unaligned_le##bits(val, kaddr + oil); \
return; \
} \
\
put_unaligned_le##bits(val, lebytes); \
- memcpy(kaddr + oip, lebytes, part); \
- kaddr = page_address(eb->pages[idx + 1]); \
+ memcpy(kaddr + oil, lebytes, part); \
+ kaddr = folio_address(eb->folios[idx + 1]); \
memcpy(kaddr, lebytes + part, size - part); \
}
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index aa0844535644..ed7aa32972ad 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -90,14 +90,14 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
{ \
- const type *p = page_address(eb->pages[0]) + \
+ const type *p = folio_address(eb->folios[0]) + \
offset_in_page(eb->start); \
return get_unaligned_le##bits(&p->member); \
} \
static inline void btrfs_set_##name(const struct extent_buffer *eb, \
u##bits val) \
{ \
- type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \
+ type *p = folio_address(eb->folios[0]) + offset_in_page(eb->start); \
put_unaligned_le##bits(val, &p->member); \
}
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 4f3b693a16b1..928f512cdb4a 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -194,6 +194,12 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
int mirror = repair_bbio->mirror_num;
+ /*
+ * We can only trigger this for data bio, which doesn't support larger
+ * folios yet.
+ */
+ ASSERT(folio_order(page_folio(bv->bv_page)) == 0);
+
if (repair_bbio->bio.bi_status ||
!btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
@@ -215,7 +221,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
repair_bbio->file_offset, fs_info->sectorsize,
repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
- bv->bv_page, bv->bv_offset, mirror);
+ page_folio(bv->bv_page), bv->bv_offset, mirror);
} while (mirror != fbio->bbio->mirror_num);
done:
@@ -626,7 +632,7 @@ static bool should_async_write(struct btrfs_bio *bbio)
/*
* Submit bio to an async queue.
*
- * Return true if the work has been succesfuly submitted, else false.
+ * Return true if the work has been successfully submitted, else false.
*/
static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
struct btrfs_io_context *bioc,
@@ -767,8 +773,8 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
* freeing the bio.
*/
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num)
+ u64 length, u64 logical, struct folio *folio,
+ unsigned int folio_offset, int mirror_num)
{
struct btrfs_io_stripe smap = { 0 };
struct bio_vec bvec;
@@ -799,7 +805,8 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
- __bio_add_page(&bio, page, length, pg_offset);
+ ret = bio_add_folio(&bio, folio, length, folio_offset);
+ ASSERT(ret);
ret = submit_bio_wait(&bio);
if (ret) {
/* try to remap that extent elsewhere? */
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index ca79decee060..bbaed317161a 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -105,7 +105,7 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status);
void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num);
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace);
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num);
+ u64 length, u64 logical, struct folio *folio,
+ unsigned int folio_offset, int mirror_num);
#endif
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 6e5dc68ff661..378d9103a207 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -168,7 +168,7 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
cache);
kfree(cache->free_space_ctl);
- kfree(cache->physical_map);
+ btrfs_free_chunk_map(cache->physical_map);
kfree(cache);
}
}
@@ -1047,7 +1047,7 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans,
}
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- u64 group_start, struct extent_map *em)
+ struct btrfs_chunk_map *map)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *path;
@@ -1059,10 +1059,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
int index;
int factor;
struct btrfs_caching_control *caching_ctl = NULL;
- bool remove_em;
+ bool remove_map;
bool remove_rsv = false;
- block_group = btrfs_lookup_block_group(fs_info, group_start);
+ block_group = btrfs_lookup_block_group(fs_info, map->start);
BUG_ON(!block_group);
BUG_ON(!block_group->ro);
@@ -1252,7 +1252,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* entries because we already removed them all when we called
* btrfs_remove_free_space_cache().
*
- * And we must not remove the extent map from the fs_info->mapping_tree
+ * And we must not remove the chunk map from the fs_info->mapping_tree
* to prevent the same logical address range and physical device space
* ranges from being reused for a new block group. This is needed to
* avoid races with trimming and scrub.
@@ -1268,19 +1268,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* in place until the extents have been discarded completely when
* the transaction commit has completed.
*/
- remove_em = (atomic_read(&block_group->frozen) == 0);
+ remove_map = (atomic_read(&block_group->frozen) == 0);
spin_unlock(&block_group->lock);
- if (remove_em) {
- struct extent_map_tree *em_tree;
-
- em_tree = &fs_info->mapping_tree;
- write_lock(&em_tree->lock);
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
- /* once for the tree */
- free_extent_map(em);
- }
+ if (remove_map)
+ btrfs_remove_chunk_map(fs_info, map);
out:
/* Once for the lookup reference */
@@ -1295,15 +1287,12 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
struct btrfs_root *root = btrfs_block_group_root(fs_info);
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
unsigned int num_items;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- read_unlock(&em_tree->lock);
- ASSERT(em && em->start == chunk_offset);
+ map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
+ ASSERT(map != NULL);
+ ASSERT(map->start == chunk_offset);
/*
* We need to reserve 3 + N units from the metadata space info in order
@@ -1324,9 +1313,8 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
* more device items and remove one chunk item), but this is done at
* btrfs_remove_chunk() through a call to check_system_chunk().
*/
- map = em->map_lookup;
num_items = 3 + map->num_stripes;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return btrfs_start_transaction_fallback_global_rsv(root, num_items);
}
@@ -1467,6 +1455,7 @@ out:
*/
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
+ LIST_HEAD(retry_list);
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
@@ -1488,6 +1477,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->unused_bgs)) {
+ u64 used;
int trimming;
block_group = list_first_entry(&fs_info->unused_bgs,
@@ -1523,9 +1513,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
goto next;
}
+ spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- if (block_group->reserved || block_group->pinned ||
- block_group->used || block_group->ro ||
+ if (btrfs_is_block_group_used(block_group) || block_group->ro ||
list_is_singular(&block_group->list)) {
/*
* We want to bail if we made new allocations or have
@@ -1535,10 +1525,49 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
*/
trace_btrfs_skip_unused_block_group(block_group);
spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+
+ /*
+ * The block group may be unused but there may be space reserved
+ * accounting with the existence of that block group, that is,
+ * space_info->bytes_may_use was incremented by a task but no
+ * space was yet allocated from the block group by the task.
+ * That space may or may not be allocated, as we are generally
+ * pessimistic about space reservation for metadata as well as
+ * for data when using compression (as we reserve space based on
+ * the worst case, when data can't be compressed, and before
+ * actually attempting compression, before starting writeback).
+ *
+ * So check if the total space of the space_info minus the size
+ * of this block group is less than the used space of the
+ * space_info - if that's the case, then it means we have tasks
+ * that might be relying on the block group in order to allocate
+ * extents, and add back the block group to the unused list when
+ * we finish, so that we retry later in case no tasks ended up
+ * needing to allocate extents from the block group.
+ */
+ used = btrfs_space_info_used(space_info, true);
+ if (space_info->total_bytes - block_group->length < used) {
+ /*
+ * Add a reference for the list, compensate for the ref
+ * drop under the "next" label for the
+ * fs_info->unused_bgs list.
+ */
+ btrfs_get_block_group(block_group);
+ list_add_tail(&block_group->bg_list, &retry_list);
+
+ trace_btrfs_skip_unused_block_group(block_group);
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
up_write(&space_info->groups_sem);
goto next;
}
+
spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
/* We don't want to force the issue, only flip if it's ok. */
ret = inc_block_group_ro(block_group, 0);
@@ -1662,12 +1691,16 @@ next:
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
+ list_splice_tail(&retry_list, &fs_info->unused_bgs);
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
return;
flip_async:
btrfs_end_transaction(trans);
+ spin_lock(&fs_info->unused_bgs_lock);
+ list_splice_tail(&retry_list, &fs_info->unused_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_put_block_group(block_group);
btrfs_discard_punt_unused_bgs_list(fs_info);
@@ -1927,8 +1960,7 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
struct btrfs_path *path)
{
- struct extent_map_tree *em_tree;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
struct btrfs_block_group_item bg;
struct extent_buffer *leaf;
int slot;
@@ -1938,23 +1970,20 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
slot = path->slots[0];
leaf = path->nodes[0];
- em_tree = &fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
- read_unlock(&em_tree->lock);
- if (!em) {
+ map = btrfs_find_chunk_map(fs_info, key->objectid, key->offset);
+ if (!map) {
btrfs_err(fs_info,
"logical %llu len %llu found bg but no related chunk",
key->objectid, key->offset);
return -ENOENT;
}
- if (em->start != key->objectid || em->len != key->offset) {
+ if (map->start != key->objectid || map->chunk_len != key->offset) {
btrfs_err(fs_info,
"block group %llu len %llu mismatch with chunk %llu len %llu",
- key->objectid, key->offset, em->start, em->len);
+ key->objectid, key->offset, map->start, map->chunk_len);
ret = -EUCLEAN;
- goto out_free_em;
+ goto out_free_map;
}
read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
@@ -1962,16 +1991,16 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
flags = btrfs_stack_block_group_flags(&bg) &
BTRFS_BLOCK_GROUP_TYPE_MASK;
- if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(fs_info,
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
key->objectid, key->offset, flags,
- (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
+ (BTRFS_BLOCK_GROUP_TYPE_MASK & map->type));
ret = -EUCLEAN;
}
-out_free_em:
- free_extent_map(em);
+out_free_map:
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -2024,8 +2053,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
u64 physical, u64 **logical, int *naddrs, int *stripe_len)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 *buf;
u64 bytenr;
u64 data_stripe_length;
@@ -2033,14 +2061,13 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
int i, nr = 0;
int ret = 0;
- em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
- if (IS_ERR(em))
+ map = btrfs_get_chunk_map(fs_info, chunk_start, 1);
+ if (IS_ERR(map))
return -EIO;
- map = em->map_lookup;
- data_stripe_length = em->orig_block_len;
+ data_stripe_length = map->stripe_size;
io_stripe_size = BTRFS_STRIPE_LEN;
- chunk_start = em->start;
+ chunk_start = map->start;
/* For RAID5/6 adjust to a full IO stripe length */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
@@ -2094,7 +2121,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
*naddrs = nr;
*stripe_len = io_stripe_size;
out:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -2199,49 +2226,47 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
*/
static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct btrfs_block_group *bg;
u64 start = 0;
int ret = 0;
while (1) {
- read_lock(&map_tree->lock);
+ struct btrfs_chunk_map *map;
+ struct btrfs_block_group *bg;
+
/*
- * lookup_extent_mapping will return the first extent map
- * intersecting the range, so setting @len to 1 is enough to
+ * btrfs_find_chunk_map() will return the first chunk map
+ * intersecting the range, so setting @length to 1 is enough to
* get the first chunk.
*/
- em = lookup_extent_mapping(map_tree, start, 1);
- read_unlock(&map_tree->lock);
- if (!em)
+ map = btrfs_find_chunk_map(fs_info, start, 1);
+ if (!map)
break;
- bg = btrfs_lookup_block_group(fs_info, em->start);
+ bg = btrfs_lookup_block_group(fs_info, map->start);
if (!bg) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu doesn't have corresponding block group",
- em->start, em->len);
+ map->start, map->chunk_len);
ret = -EUCLEAN;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
break;
}
- if (bg->start != em->start || bg->length != em->len ||
+ if (bg->start != map->start || bg->length != map->chunk_len ||
(bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
- (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
- em->start, em->len,
- em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
+ map->start, map->chunk_len,
+ map->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
bg->start, bg->length,
bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
ret = -EUCLEAN;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
btrfs_put_block_group(bg);
break;
}
- start = em->start + em->len;
- free_extent_map(em);
+ start = map->start + map->chunk_len;
+ btrfs_free_chunk_map(map);
btrfs_put_block_group(bg);
}
return ret;
@@ -2369,28 +2394,25 @@ error:
static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct rb_node *node;
int ret = 0;
- for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
- struct extent_map *em;
- struct map_lookup *map;
+ for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
+ struct btrfs_chunk_map *map;
struct btrfs_block_group *bg;
- em = rb_entry(node, struct extent_map, rb_node);
- map = em->map_lookup;
- bg = btrfs_create_block_group_cache(fs_info, em->start);
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ bg = btrfs_create_block_group_cache(fs_info, map->start);
if (!bg) {
ret = -ENOMEM;
break;
}
/* Fill dummy cache as FULL */
- bg->length = em->len;
+ bg->length = map->chunk_len;
bg->flags = map->type;
bg->cached = BTRFS_CACHE_FINISHED;
- bg->used = em->len;
+ bg->used = map->chunk_len;
bg->flags = map->type;
ret = btrfs_add_block_group_cache(fs_info, bg);
/*
@@ -2618,19 +2640,14 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_device *device;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 dev_offset;
- u64 stripe_size;
int i;
int ret = 0;
- em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
- if (IS_ERR(em))
- return PTR_ERR(em);
-
- map = em->map_lookup;
- stripe_size = em->orig_block_len;
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
/*
* Take the device list mutex to prevent races with the final phase of
@@ -2647,13 +2664,13 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans,
dev_offset = map->stripes[i].physical;
ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
- stripe_size);
+ map->stripe_size);
if (ret)
break;
}
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -2712,6 +2729,37 @@ next:
btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
list_del_init(&block_group->bg_list);
clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
+
+ /*
+ * If the block group is still unused, add it to the list of
+ * unused block groups. The block group may have been created in
+ * order to satisfy a space reservation, in which case the
+ * extent allocation only happens later. But often we don't
+ * actually need to allocate space that we previously reserved,
+ * so the block group may become unused for a long time. For
+ * example for metadata we generally reserve space for a worst
+ * possible scenario, but then don't end up allocating all that
+ * space or none at all (due to no need to COW, extent buffers
+ * were already COWed in the current transaction and still
+ * unwritten, tree heights lower than the maximum possible
+ * height, etc). For data we generally reserve the axact amount
+ * of space we are going to allocate later, the exception is
+ * when using compression, as we must reserve space based on the
+ * uncompressed data size, because the compression is only done
+ * when writeback triggered and we don't know how much space we
+ * are actually going to need, so we reserve the uncompressed
+ * size because the data may be uncompressible in the worst case.
+ */
+ if (ret == 0) {
+ bool used;
+
+ spin_lock(&block_group->lock);
+ used = btrfs_is_block_group_used(block_group);
+ spin_unlock(&block_group->lock);
+
+ if (!used)
+ btrfs_mark_bg_unused(block_group);
+ }
}
btrfs_trans_release_chunk_metadata(trans);
}
@@ -2910,7 +2958,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
goto unlock_out;
/*
- * Skip chunk alloction if the bg is SYSTEM, this is to avoid system
+ * Skip chunk allocation if the bg is SYSTEM, this is to avoid system
* chunk allocation storm to exhaust the system chunk array. Otherwise
* we still want to try our best to mark the block group read-only.
*/
@@ -4406,8 +4454,6 @@ void btrfs_freeze_block_group(struct btrfs_block_group *cache)
void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
bool cleanup;
spin_lock(&block_group->lock);
@@ -4416,17 +4462,16 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
spin_unlock(&block_group->lock);
if (cleanup) {
- em_tree = &fs_info->mapping_tree;
- write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, block_group->start,
- 1);
- BUG_ON(!em); /* logic error, can't happen */
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
-
- /* once for us and once for the tree */
- free_extent_map(em);
- free_extent_map(em);
+ struct btrfs_chunk_map *map;
+
+ map = btrfs_find_chunk_map(fs_info, block_group->start, 1);
+ /* Logic error, can't happen. */
+ ASSERT(map);
+
+ btrfs_remove_chunk_map(fs_info, map);
+
+ /* Once for our lookup reference. */
+ btrfs_free_chunk_map(map);
/*
* We may have left one free space entry and other possible
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 2bdbcb834f95..962b11983901 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -5,6 +5,8 @@
#include "free-space-cache.h"
+struct btrfs_chunk_map;
+
enum btrfs_disk_cache_state {
BTRFS_DC_WRITTEN,
BTRFS_DC_ERROR,
@@ -243,7 +245,7 @@ struct btrfs_block_group {
u64 zone_unusable;
u64 zone_capacity;
u64 meta_write_pointer;
- struct map_lookup *physical_map;
+ struct btrfs_chunk_map *physical_map;
struct list_head active_bg_list;
struct work_struct zone_finish_work;
struct extent_buffer *last_eb;
@@ -255,6 +257,13 @@ static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
return (block_group->start + block_group->length);
}
+static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
+{
+ lockdep_assert_held(&bg->lock);
+
+ return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
+}
+
static inline bool btrfs_is_block_group_data_only(
struct btrfs_block_group *block_group)
{
@@ -297,7 +306,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info,
const u64 chunk_offset);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- u64 group_start, struct extent_map *em);
+ struct btrfs_chunk_map *map);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
void btrfs_reclaim_bgs_work(struct work_struct *work);
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index ceb5f586a2d5..1043a8142351 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -494,7 +494,7 @@ struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
block_rsv = get_block_rsv(trans, root);
- if (unlikely(block_rsv->size == 0))
+ if (unlikely(btrfs_block_rsv_size(block_rsv) == 0))
goto try_reserve;
again:
ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index b0bd12b8652f..43a9a6b5a79f 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -101,4 +101,36 @@ static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv)
return data_race(rsv->full);
}
+/*
+ * Get the reserved mount of a block reserve in a context where getting a stale
+ * value is acceptable, instead of accessing it directly and trigger data race
+ * warning from KCSAN.
+ */
+static inline u64 btrfs_block_rsv_reserved(struct btrfs_block_rsv *rsv)
+{
+ u64 ret;
+
+ spin_lock(&rsv->lock);
+ ret = rsv->reserved;
+ spin_unlock(&rsv->lock);
+
+ return ret;
+}
+
+/*
+ * Get the size of a block reserve in a context where getting a stale value is
+ * acceptable, instead of accessing it directly and trigger data race warning
+ * from KCSAN.
+ */
+static inline u64 btrfs_block_rsv_size(struct btrfs_block_rsv *rsv)
+{
+ u64 ret;
+
+ spin_lock(&rsv->lock);
+ ret = rsv->size;
+ spin_unlock(&rsv->lock);
+
+ return ret;
+}
+
#endif /* BTRFS_BLOCK_RSV_H */
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5572ae52444e..7f7c5a92d2b8 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -69,6 +69,8 @@ enum {
BTRFS_INODE_VERITY_IN_PROGRESS,
/* Set when this inode is a free space inode. */
BTRFS_INODE_FREE_SPACE_INODE,
+ /* Set when there are no capabilities in XATTs for the inode. */
+ BTRFS_INODE_NO_CAP_XATTR,
};
/* in memory btrfs inode */
@@ -107,9 +109,11 @@ struct btrfs_inode {
/*
* Keep track of where the inode has extent items mapped in order to
- * make sure the i_size adjustments are accurate
+ * make sure the i_size adjustments are accurate. Not required when the
+ * filesystem is NO_HOLES, the status can't be set while mounted as
+ * it's a mkfs-time feature.
*/
- struct extent_io_tree file_extent_tree;
+ struct extent_io_tree *file_extent_tree;
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;
@@ -487,7 +491,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset,
- u64 start, u64 end);
+ u64 start, u64 len);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 19b22b4653c8..68345f73d429 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -20,6 +20,7 @@
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/log2.h>
+#include <linux/shrinker.h>
#include <crypto/hash.h>
#include "misc.h"
#include "ctree.h"
@@ -140,16 +141,16 @@ static int compression_decompress_bio(struct list_head *ws,
}
static int compression_decompress(int type, struct list_head *ws,
- const u8 *data_in, struct page *dest_page,
- unsigned long start_byte, size_t srclen, size_t destlen)
+ const u8 *data_in, struct page *dest_page,
+ unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
switch (type) {
case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_NONE:
default:
/*
@@ -163,13 +164,107 @@ static int compression_decompress(int type, struct list_head *ws,
static void btrfs_free_compressed_pages(struct compressed_bio *cb)
{
for (unsigned int i = 0; i < cb->nr_pages; i++)
- put_page(cb->compressed_pages[i]);
+ btrfs_free_compr_page(cb->compressed_pages[i]);
kfree(cb->compressed_pages);
}
static int btrfs_decompress_bio(struct compressed_bio *cb);
-static void end_compressed_bio_read(struct btrfs_bio *bbio)
+/*
+ * Global cache of last unused pages for compression/decompression.
+ */
+static struct btrfs_compr_pool {
+ struct shrinker *shrinker;
+ spinlock_t lock;
+ struct list_head list;
+ int count;
+ int thresh;
+} compr_pool;
+
+static unsigned long btrfs_compr_pool_count(struct shrinker *sh, struct shrink_control *sc)
+{
+ int ret;
+
+ /*
+ * We must not read the values more than once if 'ret' gets expanded in
+ * the return statement so we don't accidentally return a negative
+ * number, even if the first condition finds it positive.
+ */
+ ret = READ_ONCE(compr_pool.count) - READ_ONCE(compr_pool.thresh);
+
+ return ret > 0 ? ret : 0;
+}
+
+static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_control *sc)
+{
+ struct list_head remove;
+ struct list_head *tmp, *next;
+ int freed;
+
+ if (compr_pool.count == 0)
+ return SHRINK_STOP;
+
+ INIT_LIST_HEAD(&remove);
+
+ /* For now, just simply drain the whole list. */
+ spin_lock(&compr_pool.lock);
+ list_splice_init(&compr_pool.list, &remove);
+ freed = compr_pool.count;
+ compr_pool.count = 0;
+ spin_unlock(&compr_pool.lock);
+
+ list_for_each_safe(tmp, next, &remove) {
+ struct page *page = list_entry(tmp, struct page, lru);
+
+ ASSERT(page_ref_count(page) == 1);
+ put_page(page);
+ }
+
+ return freed;
+}
+
+/*
+ * Common wrappers for page allocation from compression wrappers
+ */
+struct page *btrfs_alloc_compr_page(void)
+{
+ struct page *page = NULL;
+
+ spin_lock(&compr_pool.lock);
+ if (compr_pool.count > 0) {
+ page = list_first_entry(&compr_pool.list, struct page, lru);
+ list_del_init(&page->lru);
+ compr_pool.count--;
+ }
+ spin_unlock(&compr_pool.lock);
+
+ if (page)
+ return page;
+
+ return alloc_page(GFP_NOFS);
+}
+
+void btrfs_free_compr_page(struct page *page)
+{
+ bool do_free = false;
+
+ spin_lock(&compr_pool.lock);
+ if (compr_pool.count > compr_pool.thresh) {
+ do_free = true;
+ } else {
+ list_add(&page->lru, &compr_pool.list);
+ compr_pool.count++;
+ }
+ spin_unlock(&compr_pool.lock);
+
+ if (!do_free)
+ return;
+
+ ASSERT(page_ref_count(page) == 1);
+ put_page(page);
+}
+
+static void end_bbio_comprssed_read(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = to_compressed_bio(bbio);
blk_status_t status = bbio->bio.bi_status;
@@ -211,8 +306,8 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb)
for (i = 0; i < ret; i++) {
struct folio *folio = fbatch.folios[i];
- btrfs_page_clamp_clear_writeback(fs_info, &folio->page,
- cb->start, cb->len);
+ btrfs_folio_clamp_clear_writeback(fs_info, folio,
+ cb->start, cb->len);
}
folio_batch_release(&fbatch);
}
@@ -242,7 +337,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
* This also calls the writeback end hooks for the file pages so that metadata
* and checksums can be updated in the file.
*/
-static void end_compressed_bio_write(struct btrfs_bio *bbio)
+static void end_bbio_comprssed_write(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = to_compressed_bio(bbio);
struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
@@ -289,7 +384,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
cb = alloc_compressed_bio(inode, ordered->file_offset,
REQ_OP_WRITE | write_flags,
- end_compressed_bio_write);
+ end_bbio_comprssed_write);
cb->start = ordered->file_offset;
cb->len = ordered->num_bytes;
cb->compressed_pages = compressed_pages;
@@ -446,7 +541,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* subpage::readers and to unlock the page.
*/
if (fs_info->sectorsize < PAGE_SIZE)
- btrfs_subpage_start_reader(fs_info, page, cur, add_size);
+ btrfs_subpage_start_reader(fs_info, page_folio(page),
+ cur, add_size);
put_page(page);
cur += add_size;
}
@@ -489,11 +585,11 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
goto out;
}
- ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
+ ASSERT(extent_map_is_compressed(em));
compressed_len = em->block_len;
cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ,
- end_compressed_bio_read);
+ end_bbio_comprssed_read);
cb->start = em->orig_start;
em_len = em->len;
@@ -501,7 +597,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
cb->len = bbio->bio.bi_iter.bi_size;
cb->compressed_len = compressed_len;
- cb->compress_type = em->compress_type;
+ cb->compress_type = extent_map_compression(em);
cb->orig_bbio = bbio;
free_extent_map(em);
@@ -513,7 +609,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
goto out_free_bio;
}
- ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages);
+ ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages, 0);
if (ret2) {
ret = BLK_STS_RESOURCE;
goto out_free_compressed_pages;
@@ -941,14 +1037,23 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
* start_byte tells us the offset into the compressed data we're interested in
*/
int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
- unsigned long start_byte, size_t srclen, size_t destlen)
+ unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb);
struct list_head *workspace;
+ const u32 sectorsize = fs_info->sectorsize;
int ret;
+ /*
+ * The full destination page range should not exceed the page size.
+ * And the @destlen should not exceed sectorsize, as this is only called for
+ * inline file extents, which should not exceed sectorsize.
+ */
+ ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize);
+
workspace = get_workspace(type, 0);
ret = compression_decompress(type, workspace, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
put_workspace(type, workspace);
return ret;
@@ -960,15 +1065,36 @@ int __init btrfs_init_compress(void)
offsetof(struct compressed_bio, bbio.bio),
BIOSET_NEED_BVECS))
return -ENOMEM;
+
+ compr_pool.shrinker = shrinker_alloc(SHRINKER_NONSLAB, "btrfs-compr-pages");
+ if (!compr_pool.shrinker)
+ return -ENOMEM;
+
btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
zstd_init_workspace_manager();
+
+ spin_lock_init(&compr_pool.lock);
+ INIT_LIST_HEAD(&compr_pool.list);
+ compr_pool.count = 0;
+ /* 128K / 4K = 32, for 8 threads is 256 pages. */
+ compr_pool.thresh = BTRFS_MAX_COMPRESSED / PAGE_SIZE * 8;
+ compr_pool.shrinker->count_objects = btrfs_compr_pool_count;
+ compr_pool.shrinker->scan_objects = btrfs_compr_pool_scan;
+ compr_pool.shrinker->batch = 32;
+ compr_pool.shrinker->seeks = DEFAULT_SEEKS;
+ shrinker_register(compr_pool.shrinker);
+
return 0;
}
void __cold btrfs_exit_compress(void)
{
+ /* For now scan drains all pages and does not touch the parameters. */
+ btrfs_compr_pool_scan(NULL, NULL);
+ shrinker_free(compr_pool.shrinker);
+
btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 03bb9d143fa7..afd7e50d073d 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -32,6 +32,8 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
#define BTRFS_ZLIB_DEFAULT_LEVEL 3
+struct page;
+
struct compressed_bio {
/* Number of compressed pages in the array */
unsigned int nr_pages;
@@ -96,6 +98,9 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio);
unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
+struct page *btrfs_alloc_compr_page(void);
+void btrfs_free_compr_page(struct page *page);
+
enum btrfs_compression_type {
BTRFS_COMPRESS_NONE = 0,
BTRFS_COMPRESS_ZLIB = 1,
@@ -143,7 +148,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
unsigned long *total_in, unsigned long *total_out);
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zlib_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
struct list_head *zlib_alloc_workspace(unsigned int level);
void zlib_free_workspace(struct list_head *ws);
@@ -154,7 +159,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
unsigned long *total_in, unsigned long *total_out);
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
struct list_head *lzo_alloc_workspace(unsigned int level);
void lzo_free_workspace(struct list_head *ws);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 35c1d24d4a78..e65e012bac55 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -370,33 +370,41 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
/*
* check if the tree block can be shared by multiple trees
*/
-int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf)
+bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf)
{
+ const u64 buf_gen = btrfs_header_generation(buf);
+
/*
* Tree blocks not in shareable trees and tree roots are never shared.
* If a block was allocated after the last snapshot and the block was
* not allocated by tree relocation, we know the block is not shared.
*/
- if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
- buf != root->node &&
- (btrfs_header_generation(buf) <=
- btrfs_root_last_snapshot(&root->root_item) ||
- btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) {
- if (buf != root->commit_root)
- return 1;
- /*
- * An extent buffer that used to be the commit root may still be
- * shared because the tree height may have increased and it
- * became a child of a higher level root. This can happen when
- * snapshotting a subvolume created in the current transaction.
- */
- if (btrfs_header_generation(buf) == trans->transid)
- return 1;
- }
- return 0;
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ return false;
+
+ if (buf == root->node)
+ return false;
+
+ if (buf_gen > btrfs_root_last_snapshot(&root->root_item) &&
+ !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
+ return false;
+
+ if (buf != root->commit_root)
+ return true;
+
+ /*
+ * An extent buffer that used to be the commit root may still be shared
+ * because the tree height may have increased and it became a child of a
+ * higher level root. This can happen when snapshotting a subvolume
+ * created in the current transaction.
+ */
+ if (buf_gen == trans->transid)
+ return true;
+
+ return false;
}
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
@@ -812,7 +820,8 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
}
while (low < high) {
- unsigned long oip;
+ const int unit_size = folio_size(eb->folios[0]);
+ unsigned long oil;
unsigned long offset;
struct btrfs_disk_key *tmp;
struct btrfs_disk_key unaligned;
@@ -820,14 +829,14 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
mid = (low + high) / 2;
offset = p + mid * item_size;
- oip = offset_in_page(offset);
+ oil = get_eb_offset_in_folio(eb, offset);
- if (oip + key_size <= PAGE_SIZE) {
- const unsigned long idx = get_eb_page_index(offset);
- char *kaddr = page_address(eb->pages[idx]);
+ if (oil + key_size <= unit_size) {
+ const unsigned long idx = get_eb_folio_index(eb, offset);
+ char *kaddr = folio_address(eb->folios[idx]);
- oip = get_eb_offset_in_page(eb, offset);
- tmp = (struct btrfs_disk_key *)(kaddr + oip);
+ oil = get_eb_offset_in_folio(eb, offset);
+ tmp = (struct btrfs_disk_key *)(kaddr + oil);
} else {
read_extent_buffer(eb, &unaligned, offset, key_size);
tmp = &unaligned;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 196c005c31f6..70e828d33177 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -212,8 +212,6 @@ struct btrfs_root {
u64 last_trans;
- u32 type;
-
u64 free_objectid;
struct btrfs_key defrag_progress;
@@ -224,18 +222,15 @@ struct btrfs_root {
struct list_head root_list;
- spinlock_t log_extents_lock[2];
- struct list_head logged_list[2];
-
spinlock_t inode_lock;
/* red-black tree that keeps track of in-memory inodes */
struct rb_root inode_tree;
/*
- * radix tree that keeps track of delayed nodes of every inode,
- * protected by inode_lock
+ * Xarray that keeps track of delayed nodes of every inode, protected
+ * by @inode_lock.
*/
- struct radix_tree_root delayed_nodes_tree;
+ struct xarray delayed_nodes;
/*
* right now this just gets used so that a root has its own devid
* for stat. It may be used for more later
@@ -561,9 +556,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
struct extent_buffer **cow_ret, u64 new_root_objectid);
-int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf);
+bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf);
int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int level, int slot);
void btrfs_extend_item(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 5244561e2016..5b0b64571418 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -775,7 +775,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
* this em, as either we don't care about the generation, or the
* merged extent map will be rejected anyway.
*/
- if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
+ if (em && (em->flags & EXTENT_FLAG_MERGED) &&
newer_than && em->generation >= newer_than) {
free_extent_map(em);
em = NULL;
@@ -802,7 +802,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
const struct extent_map *em)
{
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ if (extent_map_is_compressed(em))
return BTRFS_MAX_COMPRESSED;
return fs_info->max_extent_size;
}
@@ -828,7 +828,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
/* No more em or hole */
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
goto out;
- if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
+ if (next->flags & EXTENT_FLAG_PREALLOC)
goto out;
/*
* If the next extent is at its max capacity, defragging current extent
@@ -996,10 +996,9 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
em->len <= inode->root->fs_info->max_inline)
goto next;
- /* Skip hole/delalloc/preallocated extents */
+ /* Skip holes and preallocated extents. */
if (em->block_start == EXTENT_MAP_HOLE ||
- em->block_start == EXTENT_MAP_DELALLOC ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ (em->flags & EXTENT_FLAG_PREALLOC))
goto next;
/* Skip older extent */
@@ -1047,7 +1046,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
goto add;
/* Skip too large extent */
- if (range_len >= extent_thresh)
+ if (em->len >= extent_thresh)
goto next;
/*
@@ -1190,7 +1189,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
/* Update the page status */
for (i = start_index - first_index; i <= last_index - first_index; i++) {
ClearPageChecked(pages[i]);
- btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
+ btrfs_folio_clamp_set_dirty(fs_info, page_folio(pages[i]), start, len);
}
btrfs_delalloc_release_extents(inode, len);
extent_changeset_free(data_reserved);
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 2833e8ef4c09..acf9f4b6c044 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -245,7 +245,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
u64 reserve_size = 0;
u64 qgroup_rsv_size = 0;
- u64 csum_leaves;
unsigned outstanding_extents;
lockdep_assert_held(&inode->lock);
@@ -260,10 +259,12 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
outstanding_extents);
reserve_size += btrfs_calc_metadata_size(fs_info, 1);
}
- csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
- inode->csum_bytes);
- reserve_size += btrfs_calc_insert_metadata_size(fs_info,
- csum_leaves);
+ if (!(inode->flags & BTRFS_INODE_NODATASUM)) {
+ u64 csum_leaves;
+
+ csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
+ reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves);
+ }
/*
* For qgroup rsv, the calculation is very simple:
* account one nodesize for each outstanding extent
@@ -278,14 +279,20 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
spin_unlock(&block_rsv->lock);
}
-static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
+static void calc_inode_reservations(struct btrfs_inode *inode,
u64 num_bytes, u64 disk_num_bytes,
u64 *meta_reserve, u64 *qgroup_reserve)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 nr_extents = count_max_extents(fs_info, num_bytes);
- u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
+ u64 csum_leaves;
u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
+ if (inode->flags & BTRFS_INODE_NODATASUM)
+ csum_leaves = 0;
+ else
+ csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
+
*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
nr_extents + csum_leaves);
@@ -337,7 +344,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
* everything out and try again, which is bad. This way we just
* over-reserve slightly, and clean up the mess when we are done.
*/
- calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
+ calc_inode_reservations(inode, num_bytes, disk_num_bytes,
&meta_reserve, &qgroup_reserve);
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
noflush);
@@ -359,7 +366,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
nr_extents = count_max_extents(fs_info, num_bytes);
spin_lock(&inode->lock);
btrfs_mod_outstanding_extents(inode, nr_extents);
- inode->csum_bytes += disk_num_bytes;
+ if (!(inode->flags & BTRFS_INODE_NODATASUM))
+ inode->csum_bytes += disk_num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
@@ -393,7 +401,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
spin_lock(&inode->lock);
- inode->csum_bytes -= num_bytes;
+ if (!(inode->flags & BTRFS_INODE_NODATASUM))
+ inode->csum_bytes -= num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 7381241334e8..08102883f560 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -71,7 +71,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
}
spin_lock(&root->inode_lock);
- node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+ node = xa_load(&root->delayed_nodes, ino);
if (node) {
if (btrfs_inode->delayed_node) {
@@ -83,9 +83,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
/*
* It's possible that we're racing into the middle of removing
- * this node from the radix tree. In this case, the refcount
+ * this node from the xarray. In this case, the refcount
* was zero and it should never go back to one. Just return
- * NULL like it was never in the radix at all; our release
+ * NULL like it was never in the xarray at all; our release
* function is in the process of removing it.
*
* Some implementations of refcount_inc refuse to bump the
@@ -93,7 +93,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
* here, refcount_inc() may decide to just WARN_ONCE() instead
* of actually bumping the refcount.
*
- * If this node is properly in the radix, we want to bump the
+ * If this node is properly in the xarray, we want to bump the
* refcount twice, once for the inode and once for this get
* operation.
*/
@@ -120,6 +120,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
struct btrfs_root *root = btrfs_inode->root;
u64 ino = btrfs_ino(btrfs_inode);
int ret;
+ void *ptr;
again:
node = btrfs_get_delayed_node(btrfs_inode);
@@ -131,26 +132,30 @@ again:
return ERR_PTR(-ENOMEM);
btrfs_init_delayed_node(node, root, ino);
- /* cached in the btrfs inode and can be accessed */
+ /* Cached in the inode and can be accessed. */
refcount_set(&node->refs, 2);
- ret = radix_tree_preload(GFP_NOFS);
- if (ret) {
+ /* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */
+ ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS);
+ if (ret == -ENOMEM) {
kmem_cache_free(delayed_node_cache, node);
- return ERR_PTR(ret);
+ return ERR_PTR(-ENOMEM);
}
-
spin_lock(&root->inode_lock);
- ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
- if (ret == -EEXIST) {
+ ptr = xa_load(&root->delayed_nodes, ino);
+ if (ptr) {
+ /* Somebody inserted it, go back and read it. */
spin_unlock(&root->inode_lock);
kmem_cache_free(delayed_node_cache, node);
- radix_tree_preload_end();
+ node = NULL;
goto again;
}
+ ptr = xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC);
+ ASSERT(xa_err(ptr) != -EINVAL);
+ ASSERT(xa_err(ptr) != -ENOMEM);
+ ASSERT(ptr == NULL);
btrfs_inode->delayed_node = node;
spin_unlock(&root->inode_lock);
- radix_tree_preload_end();
return node;
}
@@ -269,8 +274,7 @@ static void __btrfs_release_delayed_node(
* back up. We can delete it now.
*/
ASSERT(refcount_read(&delayed_node->refs) == 0);
- radix_tree_delete(&root->delayed_nodes_tree,
- delayed_node->inode_id);
+ xa_erase(&root->delayed_nodes, delayed_node->inode_id);
spin_unlock(&root->inode_lock);
kmem_cache_free(delayed_node_cache, delayed_node);
}
@@ -1036,14 +1040,33 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
goto out;
- path->slots[0]++;
- if (path->slots[0] >= btrfs_header_nritems(leaf))
- goto search;
-again:
+ /*
+ * Now we're going to delete the INODE_REF/EXTREF, which should be the
+ * only one ref left. Check if the next item is an INODE_REF/EXTREF.
+ *
+ * But if we're the last item already, release and search for the last
+ * INODE_REF/EXTREF.
+ */
+ if (path->slots[0] + 1 >= btrfs_header_nritems(leaf)) {
+ key.objectid = node->inode_id;
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = (u64)-1;
+
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto err_out;
+ ASSERT(ret > 0);
+ ASSERT(path->slots[0] > 0);
+ ret = 0;
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ } else {
+ path->slots[0]++;
+ }
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != node->inode_id)
goto out;
-
if (key.type != BTRFS_INODE_REF_KEY &&
key.type != BTRFS_INODE_EXTREF_KEY)
goto out;
@@ -1070,22 +1093,6 @@ err_out:
btrfs_abort_transaction(trans, ret);
return ret;
-
-search:
- btrfs_release_path(path);
-
- key.type = BTRFS_INODE_EXTREF_KEY;
- key.offset = -1;
-
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0)
- goto err_out;
- ASSERT(ret);
-
- ret = 0;
- leaf = path->nodes[0];
- path->slots[0]--;
- goto again;
}
static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
@@ -2035,34 +2042,36 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
- u64 inode_id = 0;
+ unsigned long index = 0;
struct btrfs_delayed_node *delayed_nodes[8];
- int i, n;
while (1) {
+ struct btrfs_delayed_node *node;
+ int count;
+
spin_lock(&root->inode_lock);
- n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
- (void **)delayed_nodes, inode_id,
- ARRAY_SIZE(delayed_nodes));
- if (!n) {
+ if (xa_empty(&root->delayed_nodes)) {
spin_unlock(&root->inode_lock);
- break;
+ return;
}
- inode_id = delayed_nodes[n - 1]->inode_id + 1;
- for (i = 0; i < n; i++) {
+ count = 0;
+ xa_for_each_start(&root->delayed_nodes, index, node, index) {
/*
* Don't increase refs in case the node is dead and
* about to be removed from the tree in the loop below
*/
- if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
- delayed_nodes[i] = NULL;
+ if (refcount_inc_not_zero(&node->refs)) {
+ delayed_nodes[count] = node;
+ count++;
+ }
+ if (count >= ARRAY_SIZE(delayed_nodes))
+ break;
}
spin_unlock(&root->inode_lock);
+ index++;
- for (i = 0; i < n; i++) {
- if (!delayed_nodes[i])
- continue;
+ for (int i = 0; i < count; i++) {
__btrfs_kill_delayed_node(delayed_nodes[i]);
btrfs_release_delayed_node(delayed_nodes[i]);
}
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index f9544fda38e9..79c4293ddf37 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -550,8 +550,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
u64 physical)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 chunk_offset = cache->start;
int num_extents, cur_extent;
int i;
@@ -567,9 +566,8 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
}
spin_unlock(&cache->lock);
- em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
- ASSERT(!IS_ERR(em));
- map = em->map_lookup;
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ ASSERT(!IS_ERR(map));
num_extents = 0;
cur_extent = 0;
@@ -583,7 +581,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
cur_extent = i;
}
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
if (num_extents > 1 && cur_extent < num_extents - 1) {
/*
@@ -727,6 +725,23 @@ leave:
return ret;
}
+static int btrfs_check_replace_dev_names(struct btrfs_ioctl_dev_replace_args *args)
+{
+ if (args->start.srcdevid == 0) {
+ if (memchr(args->start.srcdev_name, 0,
+ sizeof(args->start.srcdev_name)) == NULL)
+ return -ENAMETOOLONG;
+ } else {
+ args->start.srcdev_name[0] = 0;
+ }
+
+ if (memchr(args->start.tgtdev_name, 0,
+ sizeof(args->start.tgtdev_name)) == NULL)
+ return -ENAMETOOLONG;
+
+ return 0;
+}
+
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args)
{
@@ -739,10 +754,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
default:
return -EINVAL;
}
-
- if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
- args->start.tgtdev_name[0] == '\0')
- return -EINVAL;
+ ret = btrfs_check_replace_dev_names(args);
+ if (ret < 0)
+ return ret;
ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
args->start.srcdevid,
@@ -812,25 +826,23 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
struct btrfs_device *srcdev,
struct btrfs_device *tgtdev)
{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
u64 start = 0;
int i;
- write_lock(&em_tree->lock);
+ write_lock(&fs_info->mapping_tree_lock);
do {
- em = lookup_extent_mapping(em_tree, start, (u64)-1);
- if (!em)
+ struct btrfs_chunk_map *map;
+
+ map = btrfs_find_chunk_map_nolock(fs_info, start, U64_MAX);
+ if (!map)
break;
- map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++)
if (srcdev == map->stripes[i].dev)
map->stripes[i].dev = tgtdev;
- start = em->start + em->len;
- free_extent_map(em);
+ start = map->start + map->chunk_len;
+ btrfs_free_chunk_map(map);
} while (start);
- write_unlock(&em_tree->lock);
+ write_unlock(&fs_info->mapping_tree_lock);
}
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 62cb97f7c94f..c843563914ca 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -74,20 +74,37 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
- const int num_pages = num_extent_pages(buf);
- const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
+ int num_pages;
+ u32 first_page_part;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
char *kaddr;
int i;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
- kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
+
+ if (buf->addr) {
+ /* Pages are contiguous, handle them as a big one. */
+ kaddr = buf->addr;
+ first_page_part = fs_info->nodesize;
+ num_pages = 1;
+ } else {
+ kaddr = folio_address(buf->folios[0]);
+ first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
+ num_pages = num_extent_pages(buf);
+ }
+
crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
first_page_part - BTRFS_CSUM_SIZE);
+ /*
+ * Multiple single-page folios case would reach here.
+ *
+ * nodesize <= PAGE_SIZE and large folio all handled by above
+ * crypto_shash_update() already.
+ */
for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) {
- kaddr = page_address(buf->pages[i]);
+ kaddr = folio_address(buf->folios[i]);
crypto_shash_update(shash, kaddr, PAGE_SIZE);
}
memset(result, 0, BTRFS_CSUM_SIZE);
@@ -166,20 +183,22 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
int mirror_num)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int i, num_pages = num_extent_pages(eb);
+ int num_folios = num_extent_folios(eb);
int ret = 0;
if (sb_rdonly(fs_info->sb))
return -EROFS;
- for (i = 0; i < num_pages; i++) {
- struct page *p = eb->pages[i];
- u64 start = max_t(u64, eb->start, page_offset(p));
- u64 end = min_t(u64, eb->start + eb->len, page_offset(p) + PAGE_SIZE);
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+ u64 start = max_t(u64, eb->start, folio_pos(folio));
+ u64 end = min_t(u64, eb->start + eb->len,
+ folio_pos(folio) + folio_size(folio));
u32 len = end - start;
ret = btrfs_repair_io_failure(fs_info, 0, start, len,
- start, p, offset_in_page(start), mirror_num);
+ start, folio, offset_in_folio(folio, start),
+ mirror_num);
if (ret)
break;
}
@@ -254,15 +273,20 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len))
return BLK_STS_IOERR;
- if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
- WARN_ON_ONCE(found_start != 0);
+ /*
+ * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't
+ * checksum it but zero-out its content. This is done to preserve
+ * ordering of I/O without unnecessarily writing out data.
+ */
+ if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) {
+ memzero_extent_buffer(eb, 0, eb->len);
return BLK_STS_OK;
}
if (WARN_ON_ONCE(found_start != eb->start))
return BLK_STS_IOERR;
- if (WARN_ON(!btrfs_page_test_uptodate(fs_info, eb->pages[0], eb->start,
- eb->len)))
+ if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0],
+ eb->start, eb->len)))
return BLK_STS_IOERR;
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
@@ -371,8 +395,8 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
}
csum_tree_block(eb, result);
- header_csum = page_address(eb->pages[0]) +
- get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));
+ header_csum = folio_address(eb->folios[0]) +
+ get_eb_offset_in_folio(eb, offsetof(struct btrfs_header, csum));
if (memcmp(result, header_csum, csum_size) != 0) {
btrfs_warn_rl(fs_info,
@@ -639,7 +663,8 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
root->inode_tree = RB_ROOT;
- INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
+ /* GFP flags are compatible with XA_FLAGS_*. */
+ xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);
btrfs_init_root_block_rsv(root);
@@ -650,14 +675,10 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&root->ordered_extents);
INIT_LIST_HEAD(&root->ordered_root);
INIT_LIST_HEAD(&root->reloc_dirty_list);
- INIT_LIST_HEAD(&root->logged_list[0]);
- INIT_LIST_HEAD(&root->logged_list[1]);
spin_lock_init(&root->inode_lock);
spin_lock_init(&root->delalloc_lock);
spin_lock_init(&root->ordered_extent_lock);
spin_lock_init(&root->accounting_lock);
- spin_lock_init(&root->log_extents_lock[0]);
- spin_lock_init(&root->log_extents_lock[1]);
spin_lock_init(&root->qgroup_meta_rsv_lock);
mutex_init(&root->objectid_mutex);
mutex_init(&root->log_mutex);
@@ -1286,12 +1307,12 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
*
* @objectid: root id
* @anon_dev: preallocated anonymous block device number for new roots,
- * pass 0 for new allocation.
+ * pass NULL for a new allocation.
* @check_ref: whether to check root item references, If true, return -ENOENT
* for orphan roots
*/
static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
- u64 objectid, dev_t anon_dev,
+ u64 objectid, dev_t *anon_dev,
bool check_ref)
{
struct btrfs_root *root;
@@ -1315,8 +1336,17 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
again:
root = btrfs_lookup_fs_root(fs_info, objectid);
if (root) {
- /* Shouldn't get preallocated anon_dev for cached roots */
- ASSERT(!anon_dev);
+ /*
+ * Some other caller may have read out the newly inserted
+ * subvolume already (for things like backref walk etc). Not
+ * that common but still possible. In that case, we just need
+ * to free the anon_dev.
+ */
+ if (unlikely(anon_dev && *anon_dev)) {
+ free_anon_bdev(*anon_dev);
+ *anon_dev = 0;
+ }
+
if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
btrfs_put_root(root);
return ERR_PTR(-ENOENT);
@@ -1336,7 +1366,7 @@ again:
goto fail;
}
- ret = btrfs_init_fs_root(root, anon_dev);
+ ret = btrfs_init_fs_root(root, anon_dev ? *anon_dev : 0);
if (ret)
goto fail;
@@ -1372,7 +1402,7 @@ fail:
* root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
* and once again by our caller.
*/
- if (anon_dev)
+ if (anon_dev && *anon_dev)
root->anon_dev = 0;
btrfs_put_root(root);
return ERR_PTR(ret);
@@ -1388,7 +1418,7 @@ fail:
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, bool check_ref)
{
- return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
+ return btrfs_get_root_ref(fs_info, objectid, NULL, check_ref);
}
/*
@@ -1396,11 +1426,11 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
* the anonymous block device id
*
* @objectid: tree objectid
- * @anon_dev: if zero, allocate a new anonymous block device or use the
- * parameter value
+ * @anon_dev: if NULL, allocate a new anonymous block device or use the
+ * parameter value if not NULL
*/
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
- u64 objectid, dev_t anon_dev)
+ u64 objectid, dev_t *anon_dev)
{
return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
}
@@ -2618,9 +2648,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
*/
btrfs_set_super_log_root(sb, 0);
- /* We can't trust the free space cache either */
- btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
-
btrfs_warn(fs_info, "try to load backup roots slot %d", i);
ret = read_backup_root(fs_info, i);
backup_index = ret;
@@ -2724,7 +2751,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->allocated_ebs);
spin_lock_init(&fs_info->eb_leak_lock);
#endif
- extent_map_tree_init(&fs_info->mapping_tree);
+ fs_info->mapping_tree = RB_ROOT_CACHED;
+ rwlock_init(&fs_info->mapping_tree_lock);
btrfs_init_block_rsv(&fs_info->global_block_rsv,
BTRFS_BLOCK_RSV_GLOBAL);
btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
@@ -2794,6 +2822,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->sectorsize_bits = ilog2(4096);
fs_info->stripesize = 4096;
+ /* Default compress algorithm when user does -o compress */
+ fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+
fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;
spin_lock_init(&fs_info->swapfile_pins_lock);
@@ -2931,17 +2962,6 @@ out:
}
/*
- * Some options only have meaning at mount time and shouldn't persist across
- * remounts, or be displayed. Clear these at the end of mount and remount
- * code paths.
- */
-void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
-{
- btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
- btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
-}
-
-/*
* Mounting logic specific to read-write file systems. Shared by open_ctree
* and btrfs_remount when remounting from read-only to read-write.
*/
@@ -2953,7 +2973,11 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- rebuild_free_space_tree = true;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ btrfs_warn(fs_info,
+ "'clear_cache' option is ignored with extent tree v2");
+ else
+ rebuild_free_space_tree = true;
} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
btrfs_warn(fs_info, "free space tree is invalid");
@@ -3276,13 +3300,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
WRITE_ONCE(fs_info->fs_error, -EUCLEAN);
- /*
- * In the long term, we'll store the compression type in the super
- * block, and it'll be used for per file compression control.
- */
- fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
-
-
/* Set up fs_info before parsing mount options */
nodesize = btrfs_super_nodesize(disk_super);
sectorsize = btrfs_super_sectorsize(disk_super);
@@ -3296,28 +3313,30 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
- ret = btrfs_parse_options(fs_info, options, sb->s_flags);
- if (ret)
+ /*
+ * Handle the space caching options appropriately now that we have the
+ * super block loaded and validated.
+ */
+ btrfs_set_free_space_cache_settings(fs_info);
+
+ if (!btrfs_check_options(fs_info, &fs_info->mount_opt, sb->s_flags)) {
+ ret = -EINVAL;
goto fail_alloc;
+ }
ret = btrfs_check_features(fs_info, !sb_rdonly(sb));
if (ret < 0)
goto fail_alloc;
+ /*
+ * At this point our mount options are validated, if we set ->max_inline
+ * to something non-standard make sure we truncate it to sectorsize.
+ */
+ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
+
if (sectorsize < PAGE_SIZE) {
struct btrfs_subpage_info *subpage_info;
- /*
- * V1 space cache has some hardcoded PAGE_SIZE usage, and is
- * going to be deprecated.
- *
- * Force to use v2 cache for subpage case.
- */
- btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
- btrfs_set_and_info(fs_info, FREE_SPACE_TREE,
- "forcing free space tree for sector size %u with page size %lu",
- sectorsize, PAGE_SIZE);
-
btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
@@ -3494,29 +3513,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_cleaner;
}
- if (!btrfs_test_opt(fs_info, NOSSD) &&
- !fs_info->fs_devices->rotating) {
- btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
- }
-
- /*
- * For devices supporting discard turn on discard=async automatically,
- * unless it's already set or disabled. This could be turned off by
- * nodiscard for the same mount.
- *
- * The zoned mode piggy backs on the discard functionality for
- * resetting a zone. There is no reason to delay the zone reset as it is
- * fast enough. So, do not enable async discard for zoned mode.
- */
- if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) ||
- btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
- btrfs_test_opt(fs_info, NODISCARD)) &&
- fs_info->fs_devices->discardable &&
- !btrfs_is_zoned(fs_info)) {
- btrfs_set_and_info(fs_info, DISCARD_ASYNC,
- "auto enabling async discard");
- }
-
ret = btrfs_read_qgroup_config(fs_info);
if (ret)
goto fail_trans_kthread;
@@ -3542,7 +3538,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
if (sb_rdonly(sb))
- goto clear_oneshot;
+ return 0;
ret = btrfs_start_pre_rw_mount(fs_info);
if (ret) {
@@ -3570,8 +3566,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
wake_up_process(fs_info->cleaner_kthread);
-clear_oneshot:
- btrfs_clear_oneshot_options(fs_info);
return 0;
fail_qgroup:
@@ -3608,7 +3602,7 @@ fail_sb_buffer:
btrfs_stop_all_workers(fs_info);
btrfs_free_block_groups(fs_info);
fail_alloc:
- btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ btrfs_mapping_tree_free(fs_info);
iput(fs_info->btree_inode);
fail:
@@ -4391,7 +4385,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
iput(fs_info->btree_inode);
- btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ btrfs_mapping_tree_free(fs_info);
btrfs_close_devices(fs_info->fs_devices);
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 50dab8f639dc..eb3473d1c1ac 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -37,9 +37,6 @@ struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root,
int level);
-void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
- struct extent_buffer *buf);
-void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info);
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *disk_sb);
@@ -64,7 +61,7 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, bool check_ref);
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
- u64 objectid, dev_t anon_dev);
+ u64 objectid, dev_t *anon_dev);
struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
u64 objectid);
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index ea149be28dff..e3ee5449cc4a 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -58,12 +58,13 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
struct extent_io_tree *tree,
u64 start, u64 end)
{
- struct btrfs_inode *inode = tree->inode;
+ const struct btrfs_inode *inode;
u64 isize;
- if (!inode)
+ if (tree->owner != IO_TREE_INODE_IO)
return;
+ inode = extent_io_tree_to_inode_const(tree);
isize = i_size_read(&inode->vfs_inode);
if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
btrfs_debug_rl(inode->root->fs_info,
@@ -78,31 +79,46 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
+
/*
- * For the file_extent_tree, we want to hold the inode lock when we lookup and
- * update the disk_i_size, but lockdep will complain because our io_tree we hold
- * the tree lock and get the inode lock when setting delalloc. These two things
- * are unrelated, so make a class for the file_extent_tree so we don't get the
- * two locking patterns mixed up.
+ * The only tree allowed to set the inode is IO_TREE_INODE_IO.
*/
-static struct lock_class_key file_extent_tree_class;
+static bool is_inode_io_tree(const struct extent_io_tree *tree)
+{
+ return tree->owner == IO_TREE_INODE_IO;
+}
+
+/* Return the inode if it's valid for the given tree, otherwise NULL. */
+struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree)
+{
+ if (tree->owner == IO_TREE_INODE_IO)
+ return tree->inode;
+ return NULL;
+}
+
+/* Read-only access to the inode. */
+const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree)
+{
+ if (tree->owner == IO_TREE_INODE_IO)
+ return tree->inode;
+ return NULL;
+}
-struct tree_entry {
- u64 start;
- u64 end;
- struct rb_node rb_node;
-};
+/* For read-only access to fs_info. */
+const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree)
+{
+ if (tree->owner == IO_TREE_INODE_IO)
+ return tree->inode->root->fs_info;
+ return tree->fs_info;
+}
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner)
{
- tree->fs_info = fs_info;
tree->state = RB_ROOT;
spin_lock_init(&tree->lock);
- tree->inode = NULL;
+ tree->fs_info = fs_info;
tree->owner = owner;
- if (owner == IO_TREE_INODE_FILE_EXTENT)
- lockdep_set_class(&tree->lock, &file_extent_tree_class);
}
/*
@@ -329,10 +345,14 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64
return tree_search_for_insert(tree, offset, NULL, NULL);
}
-static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+static void extent_io_tree_panic(const struct extent_io_tree *tree,
+ const struct extent_state *state,
+ const char *opname,
+ int err)
{
- btrfs_panic(tree->fs_info, err,
- "locking error: extent tree was modified by another thread while locked");
+ btrfs_panic(extent_io_tree_to_fs_info(tree), err,
+ "extent io tree error on %s state start %llu end %llu",
+ opname, state->start, state->end);
}
static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state)
@@ -341,8 +361,9 @@ static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *s
prev = prev_state(state);
if (prev && prev->end == state->start - 1 && prev->state == state->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode, state, prev);
+ if (is_inode_io_tree(tree))
+ btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
+ state, prev);
state->start = prev->start;
rb_erase(&prev->rb_node, &tree->state);
RB_CLEAR_NODE(&prev->rb_node);
@@ -356,8 +377,9 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s
next = next_state(state);
if (next && next->start == state->end + 1 && next->state == state->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode, state, next);
+ if (is_inode_io_tree(tree))
+ btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
+ state, next);
state->end = next->end;
rb_erase(&next->rb_node, &tree->state);
RB_CLEAR_NODE(&next->rb_node);
@@ -390,8 +412,8 @@ static void set_state_bits(struct extent_io_tree *tree,
u32 bits_to_set = bits & ~EXTENT_CTLBITS;
int ret;
- if (tree->inode)
- btrfs_set_delalloc_extent(tree->inode, state, bits);
+ if (is_inode_io_tree(tree))
+ btrfs_set_delalloc_extent(extent_io_tree_to_inode(tree), state, bits);
ret = add_extent_changeset(state, bits_to_set, changeset, 1);
BUG_ON(ret < 0);
@@ -436,9 +458,10 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
if (state->end < entry->start) {
if (try_merge && end == entry->start &&
state->state == entry->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode,
- state, entry);
+ if (is_inode_io_tree(tree))
+ btrfs_merge_delalloc_extent(
+ extent_io_tree_to_inode(tree),
+ state, entry);
entry->start = state->start;
merge_prev_state(tree, entry);
state->state = 0;
@@ -448,9 +471,10 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
} else if (state->end > entry->end) {
if (try_merge && entry->end == start &&
state->state == entry->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode,
- state, entry);
+ if (is_inode_io_tree(tree))
+ btrfs_merge_delalloc_extent(
+ extent_io_tree_to_inode(tree),
+ state, entry);
entry->end = state->end;
merge_next_state(tree, entry);
state->state = 0;
@@ -458,9 +482,6 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
}
node = &(*node)->rb_right;
} else {
- btrfs_err(tree->fs_info,
- "found node %llu %llu on insert of %llu %llu",
- entry->start, entry->end, state->start, state->end);
return ERR_PTR(-EEXIST);
}
}
@@ -505,8 +526,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
struct rb_node *parent = NULL;
struct rb_node **node;
- if (tree->inode)
- btrfs_split_delalloc_extent(tree->inode, orig, split);
+ if (is_inode_io_tree(tree))
+ btrfs_split_delalloc_extent(extent_io_tree_to_inode(tree), orig,
+ split);
prealloc->start = orig->start;
prealloc->end = split - 1;
@@ -553,8 +575,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
int ret;
- if (tree->inode)
- btrfs_clear_delalloc_extent(tree->inode, state, bits);
+ if (is_inode_io_tree(tree))
+ btrfs_clear_delalloc_extent(extent_io_tree_to_inode(tree), state,
+ bits);
ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
BUG_ON(ret < 0);
@@ -695,7 +718,7 @@ hit_next:
goto search_again;
err = split_state(tree, state, prealloc, start);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
prealloc = NULL;
if (err)
@@ -717,7 +740,7 @@ hit_next:
goto search_again;
err = split_state(tree, state, prealloc, end + 1);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
if (wake)
wake_up(&state->wq);
@@ -939,6 +962,8 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
struct extent_state *state;
int ret = 1;
+ ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES));
+
spin_lock(&tree->lock);
state = find_first_extent_bit_state(tree, start, bits);
if (state) {
@@ -1152,7 +1177,7 @@ hit_next:
goto search_again;
err = split_state(tree, state, prealloc, start);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
prealloc = NULL;
if (err)
@@ -1200,7 +1225,7 @@ hit_next:
inserted_state = insert_state(tree, prealloc, bits, changeset);
if (IS_ERR(inserted_state)) {
err = PTR_ERR(inserted_state);
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, prealloc, "insert", err);
}
cache_state(inserted_state, cached_state);
@@ -1228,7 +1253,7 @@ hit_next:
goto search_again;
err = split_state(tree, state, prealloc, end + 1);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
set_state_bits(tree, prealloc, bits, changeset);
cache_state(prealloc, cached_state);
@@ -1382,7 +1407,7 @@ hit_next:
}
err = split_state(tree, state, prealloc, start);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
prealloc = NULL;
if (err)
goto out;
@@ -1430,7 +1455,7 @@ hit_next:
inserted_state = insert_state(tree, prealloc, bits, NULL);
if (IS_ERR(inserted_state)) {
err = PTR_ERR(inserted_state);
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, prealloc, "insert", err);
}
cache_state(inserted_state, cached_state);
if (inserted_state == prealloc)
@@ -1453,7 +1478,7 @@ hit_next:
err = split_state(tree, state, prealloc, end + 1);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
set_state_bits(tree, prealloc, bits, NULL);
cache_state(prealloc, cached_state);
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 5602b0137fcd..ebe6390d65e9 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -87,9 +87,17 @@ enum {
struct extent_io_tree {
struct rb_root state;
- struct btrfs_fs_info *fs_info;
- /* Inode associated with this tree, or NULL. */
- struct btrfs_inode *inode;
+ /*
+ * The fs_info is needed for trace points, a tree attached to an inode
+ * needs the inode.
+ *
+ * owner == IO_TREE_INODE_IO - then inode is valid and fs_info can be
+ * accessed as inode->root->fs_info
+ */
+ union {
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_inode *inode;
+ };
/* Who owns this io tree, should be one of IO_TREE_* */
u8 owner;
@@ -112,6 +120,10 @@ struct extent_state {
#endif
};
+struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree);
+const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree);
+const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree);
+
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner);
void extent_io_tree_release(struct extent_io_tree *tree);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 01423670bc8a..8e8cc1111277 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1260,7 +1260,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
u64 bytes_left, end;
u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);
- if (WARN_ON(start != aligned_start)) {
+ /* Adjust the range to be aligned to 512B sectors if necessary. */
+ if (start != aligned_start) {
len -= aligned_start - start;
len = round_down(len, 1 << SECTOR_SHIFT);
start = aligned_start;
@@ -3447,6 +3448,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_ref generic_ref = { 0 };
+ struct btrfs_block_group *bg;
int ret;
btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
@@ -3460,67 +3462,64 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
BUG_ON(ret); /* -ENOMEM */
}
- if (last_ref && btrfs_header_generation(buf) == trans->transid) {
- struct btrfs_block_group *cache;
- bool must_pin = false;
-
- if (root_id != BTRFS_TREE_LOG_OBJECTID) {
- ret = check_ref_cleanup(trans, buf->start);
- if (!ret) {
- btrfs_redirty_list_add(trans->transaction, buf);
- goto out;
- }
- }
+ if (!last_ref)
+ return;
- cache = btrfs_lookup_block_group(fs_info, buf->start);
+ if (btrfs_header_generation(buf) != trans->transid)
+ goto out;
- if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
- pin_down_extent(trans, cache, buf->start, buf->len, 1);
- btrfs_put_block_group(cache);
+ if (root_id != BTRFS_TREE_LOG_OBJECTID) {
+ ret = check_ref_cleanup(trans, buf->start);
+ if (!ret)
goto out;
- }
+ }
- /*
- * If there are tree mod log users we may have recorded mod log
- * operations for this node. If we re-allocate this node we
- * could replay operations on this node that happened when it
- * existed in a completely different root. For example if it
- * was part of root A, then was reallocated to root B, and we
- * are doing a btrfs_old_search_slot(root b), we could replay
- * operations that happened when the block was part of root A,
- * giving us an inconsistent view of the btree.
- *
- * We are safe from races here because at this point no other
- * node or root points to this extent buffer, so if after this
- * check a new tree mod log user joins we will not have an
- * existing log of operations on this node that we have to
- * contend with.
- */
- if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
- must_pin = true;
+ bg = btrfs_lookup_block_group(fs_info, buf->start);
- if (must_pin || btrfs_is_zoned(fs_info)) {
- btrfs_redirty_list_add(trans->transaction, buf);
- pin_down_extent(trans, cache, buf->start, buf->len, 1);
- btrfs_put_block_group(cache);
- goto out;
- }
+ if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+ pin_down_extent(trans, bg, buf->start, buf->len, 1);
+ btrfs_put_block_group(bg);
+ goto out;
+ }
- WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+ /*
+ * If there are tree mod log users we may have recorded mod log
+ * operations for this node. If we re-allocate this node we
+ * could replay operations on this node that happened when it
+ * existed in a completely different root. For example if it
+ * was part of root A, then was reallocated to root B, and we
+ * are doing a btrfs_old_search_slot(root b), we could replay
+ * operations that happened when the block was part of root A,
+ * giving us an inconsistent view of the btree.
+ *
+ * We are safe from races here because at this point no other
+ * node or root points to this extent buffer, so if after this
+ * check a new tree mod log user joins we will not have an
+ * existing log of operations on this node that we have to
+ * contend with.
+ */
- btrfs_add_free_space(cache, buf->start, buf->len);
- btrfs_free_reserved_bytes(cache, buf->len, 0);
- btrfs_put_block_group(cache);
- trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
+ if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)
+ || btrfs_is_zoned(fs_info)) {
+ pin_down_extent(trans, bg, buf->start, buf->len, 1);
+ btrfs_put_block_group(bg);
+ goto out;
}
+
+ WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+
+ btrfs_add_free_space(bg, buf->start, buf->len);
+ btrfs_free_reserved_bytes(bg, buf->len, 0);
+ btrfs_put_block_group(bg);
+ trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
+
out:
- if (last_ref) {
- /*
- * Deleting the buffer, clear the corrupt flag since it doesn't
- * matter anymore.
- */
- clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
- }
+
+ /*
+ * Deleting the buffer, clear the corrupt flag since it doesn't
+ * matter anymore.
+ */
+ clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
}
/* Can return -ENOMEM */
@@ -4300,6 +4299,42 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
return 0;
}
+static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl)
+{
+ if (ffe_ctl->for_treelog) {
+ spin_lock(&fs_info->treelog_bg_lock);
+ if (fs_info->treelog_bg)
+ ffe_ctl->hint_byte = fs_info->treelog_bg;
+ spin_unlock(&fs_info->treelog_bg_lock);
+ } else if (ffe_ctl->for_data_reloc) {
+ spin_lock(&fs_info->relocation_bg_lock);
+ if (fs_info->data_reloc_bg)
+ ffe_ctl->hint_byte = fs_info->data_reloc_bg;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
+ struct btrfs_block_group *block_group;
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
+ /*
+ * No lock is OK here because avail is monotinically
+ * decreasing, and this is just a hint.
+ */
+ u64 avail = block_group->zone_capacity - block_group->alloc_offset;
+
+ if (block_group_bits(block_group, ffe_ctl->flags) &&
+ avail >= ffe_ctl->num_bytes) {
+ ffe_ctl->hint_byte = block_group->start;
+ break;
+ }
+ }
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ }
+
+ return 0;
+}
+
static int prepare_allocation(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_space_info *space_info,
@@ -4310,19 +4345,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
return prepare_allocation_clustered(fs_info, ffe_ctl,
space_info, ins);
case BTRFS_EXTENT_ALLOC_ZONED:
- if (ffe_ctl->for_treelog) {
- spin_lock(&fs_info->treelog_bg_lock);
- if (fs_info->treelog_bg)
- ffe_ctl->hint_byte = fs_info->treelog_bg;
- spin_unlock(&fs_info->treelog_bg_lock);
- }
- if (ffe_ctl->for_data_reloc) {
- spin_lock(&fs_info->relocation_bg_lock);
- if (fs_info->data_reloc_bg)
- ffe_ctl->hint_byte = fs_info->data_reloc_bg;
- spin_unlock(&fs_info->relocation_bg_lock);
- }
- return 0;
+ return prepare_allocation_zoned(fs_info, ffe_ctl);
default:
BUG();
}
@@ -5061,7 +5084,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
__btrfs_tree_lock(buf, nest);
btrfs_clear_buffer_dirty(trans, buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
- clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
+ clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags);
set_extent_buffer_uptodate(buf);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8f724c54fc8e..8b4bef05e222 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -184,22 +184,23 @@ static void process_one_page(struct btrfs_fs_info *fs_info,
struct page *page, struct page *locked_page,
unsigned long page_ops, u64 start, u64 end)
{
+ struct folio *folio = page_folio(page);
u32 len;
ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
len = end + 1 - start;
if (page_ops & PAGE_SET_ORDERED)
- btrfs_page_clamp_set_ordered(fs_info, page, start, len);
+ btrfs_folio_clamp_set_ordered(fs_info, folio, start, len);
if (page_ops & PAGE_START_WRITEBACK) {
- btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
- btrfs_page_clamp_set_writeback(fs_info, page, start, len);
+ btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len);
+ btrfs_folio_clamp_set_writeback(fs_info, folio, start, len);
}
if (page_ops & PAGE_END_WRITEBACK)
- btrfs_page_clamp_clear_writeback(fs_info, page, start, len);
+ btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len);
if (page != locked_page && (page_ops & PAGE_UNLOCK))
- btrfs_page_end_writer_lock(fs_info, page, start, len);
+ btrfs_folio_end_writer_lock(fs_info, folio, start, len);
}
static void __process_pages_contig(struct address_space *mapping,
@@ -271,19 +272,20 @@ static noinline int lock_delalloc_pages(struct inode *inode,
goto out;
for (i = 0; i < found_folios; i++) {
- struct page *page = &fbatch.folios[i]->page;
+ struct folio *folio = fbatch.folios[i];
+ struct page *page = folio_page(folio, 0);
u32 len = end + 1 - start;
if (page == locked_page)
continue;
- if (btrfs_page_start_writer_lock(fs_info, page, start,
- len))
+ if (btrfs_folio_start_writer_lock(fs_info, folio, start,
+ len))
goto out;
if (!PageDirty(page) || page->mapping != mapping) {
- btrfs_page_end_writer_lock(fs_info, page, start,
- len);
+ btrfs_folio_end_writer_lock(fs_info, folio, start,
+ len);
goto out;
}
@@ -432,60 +434,65 @@ static bool btrfs_verify_page(struct page *page, u64 start)
static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct folio *folio = page_folio(page);
ASSERT(page_offset(page) <= start &&
start + len <= page_offset(page) + PAGE_SIZE);
if (uptodate && btrfs_verify_page(page, start))
- btrfs_page_set_uptodate(fs_info, page, start, len);
+ btrfs_folio_set_uptodate(fs_info, folio, start, len);
else
- btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_folio_clear_uptodate(fs_info, folio, start, len);
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, page->mapping))
unlock_page(page);
else
- btrfs_subpage_end_reader(fs_info, page, start, len);
+ btrfs_subpage_end_reader(fs_info, folio, start, len);
}
/*
- * after a writepage IO is done, we need to:
- * clear the uptodate bits on error
- * clear the writeback bits in the extent tree for this IO
- * end_page_writeback if the page has no more pending IO
+ * After a write IO is done, we need to:
+ *
+ * - clear the uptodate bits on error
+ * - clear the writeback bits in the extent tree for the range
+ * - filio_end_writeback() if there is no more pending io for the folio
*
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_writepage(struct btrfs_bio *bbio)
+static void end_bbio_data_write(struct btrfs_bio *bbio)
{
struct bio *bio = &bbio->bio;
int error = blk_status_to_errno(bio->bi_status);
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *page = bvec->bv_page;
- struct inode *inode = page->mapping->host;
+ bio_for_each_folio_all(fi, bio) {
+ struct folio *folio = fi.folio;
+ struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
- u64 start = page_offset(page) + bvec->bv_offset;
- u32 len = bvec->bv_len;
+ u64 start = folio_pos(folio) + fi.offset;
+ u32 len = fi.length;
+
+ /* Only order 0 (single page) folios are allowed for data. */
+ ASSERT(folio_order(folio) == 0);
/* Our read/write should always be sector aligned. */
- if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+ if (!IS_ALIGNED(fi.offset, sectorsize))
btrfs_err(fs_info,
- "partial page write in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
+ "partial page write in btrfs with offset %zu and length %zu",
+ fi.offset, fi.length);
+ else if (!IS_ALIGNED(fi.length, sectorsize))
btrfs_info(fs_info,
- "incomplete page write with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
+ "incomplete page write with offset %zu and length %zu",
+ fi.offset, fi.length);
- btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error);
+ btrfs_finish_ordered_extent(bbio->ordered,
+ folio_page(folio, 0), start, len, !error);
if (error)
- mapping_set_error(page->mapping, error);
- btrfs_page_clear_writeback(fs_info, page, start, len);
+ mapping_set_error(folio->mapping, error);
+ btrfs_folio_clear_writeback(fs_info, folio, start, len);
}
bio_put(bio);
@@ -562,98 +569,102 @@ update:
static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
{
- ASSERT(PageLocked(page));
- if (!btrfs_is_subpage(fs_info, page))
+ struct folio *folio = page_folio(page);
+
+ ASSERT(folio_test_locked(folio));
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
- ASSERT(PagePrivate(page));
- btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
+ ASSERT(folio_test_private(folio));
+ btrfs_subpage_start_reader(fs_info, folio, page_offset(page), PAGE_SIZE);
}
/*
- * after a readpage IO is done, we need to:
- * clear the uptodate bits on error
- * set the uptodate bits if things worked
- * set the page up to date if all extents in the tree are uptodate
- * clear the lock bit in the extent tree
- * unlock the page if there are no other extents locked for it
+ * After a data read IO is done, we need to:
+ *
+ * - clear the uptodate bits on error
+ * - set the uptodate bits if things worked
+ * - set the folio up to date if all extents in the tree are uptodate
+ * - clear the lock bit in the extent tree
+ * - unlock the folio if there are no other extents locked for it
*
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_readpage(struct btrfs_bio *bbio)
+static void end_bbio_data_read(struct btrfs_bio *bbio)
{
struct bio *bio = &bbio->bio;
- struct bio_vec *bvec;
struct processed_extent processed = { 0 };
+ struct folio_iter fi;
/*
* The offset to the beginning of a bio, since one bio can never be
* larger than UINT_MAX, u32 here is enough.
*/
u32 bio_offset = 0;
- struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all) {
+ bio_for_each_folio_all(fi, &bbio->bio) {
bool uptodate = !bio->bi_status;
- struct page *page = bvec->bv_page;
- struct inode *inode = page->mapping->host;
+ struct folio *folio = fi.folio;
+ struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
u64 start;
u64 end;
u32 len;
+ /* For now only order 0 folios are supported for data. */
+ ASSERT(folio_order(folio) == 0);
btrfs_debug(fs_info,
- "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
- bio->bi_iter.bi_sector, bio->bi_status,
+ "%s: bi_sector=%llu, err=%d, mirror=%u",
+ __func__, bio->bi_iter.bi_sector, bio->bi_status,
bbio->mirror_num);
/*
* We always issue full-sector reads, but if some block in a
- * page fails to read, blk_update_request() will advance
+ * folio fails to read, blk_update_request() will advance
* bv_offset and adjust bv_len to compensate. Print a warning
* for unaligned offsets, and an error if they don't add up to
* a full sector.
*/
- if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+ if (!IS_ALIGNED(fi.offset, sectorsize))
btrfs_err(fs_info,
- "partial page read in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
- sectorsize))
+ "partial page read in btrfs with offset %zu and length %zu",
+ fi.offset, fi.length);
+ else if (!IS_ALIGNED(fi.offset + fi.length, sectorsize))
btrfs_info(fs_info,
- "incomplete page read with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
+ "incomplete page read with offset %zu and length %zu",
+ fi.offset, fi.length);
- start = page_offset(page) + bvec->bv_offset;
- end = start + bvec->bv_len - 1;
- len = bvec->bv_len;
+ start = folio_pos(folio) + fi.offset;
+ end = start + fi.length - 1;
+ len = fi.length;
if (likely(uptodate)) {
loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_SHIFT;
+ pgoff_t end_index = i_size >> folio_shift(folio);
/*
* Zero out the remaining part if this range straddles
* i_size.
*
- * Here we should only zero the range inside the bvec,
+ * Here we should only zero the range inside the folio,
* not touch anything else.
*
* NOTE: i_size is exclusive while end is inclusive.
*/
- if (page->index == end_index && i_size <= end) {
- u32 zero_start = max(offset_in_page(i_size),
- offset_in_page(start));
+ if (folio_index(folio) == end_index && i_size <= end) {
+ u32 zero_start = max(offset_in_folio(folio, i_size),
+ offset_in_folio(folio, start));
+ u32 zero_len = offset_in_folio(folio, end) + 1 -
+ zero_start;
- zero_user_segment(page, zero_start,
- offset_in_page(end) + 1);
+ folio_zero_range(folio, zero_start, zero_len);
}
}
/* Update page status and unlock. */
- end_page_read(page, uptodate, start, len);
+ end_page_read(folio_page(folio, 0), uptodate, start, len);
endio_readpage_release_extent(&processed, BTRFS_I(inode),
start, end, uptodate);
@@ -672,19 +683,22 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio)
* @nr_pages: number of pages to allocate
* @page_array: the array to fill with pages; any existing non-null entries in
* the array will be skipped
+ * @extra_gfp: the extra GFP flags for the allocation.
*
* Return: 0 if all pages were able to be allocated;
* -ENOMEM otherwise, the partially allocated pages would be freed and
* the array slots zeroed
*/
-int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
+ gfp_t extra_gfp)
{
unsigned int allocated;
for (allocated = 0; allocated < nr_pages;) {
unsigned int last = allocated;
- allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
+ allocated = alloc_pages_bulk_array(GFP_NOFS | extra_gfp,
+ nr_pages, page_array);
if (allocated == nr_pages)
return 0;
@@ -707,6 +721,26 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
return 0;
}
+/*
+ * Populate needed folios for the extent buffer.
+ *
+ * For now, the folios populated are always in order 0 (aka, single page).
+ */
+static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp)
+{
+ struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 };
+ int num_pages = num_extent_pages(eb);
+ int ret;
+
+ ret = btrfs_alloc_page_array(num_pages, page_array, extra_gfp);
+ if (ret < 0)
+ return ret;
+
+ for (int i = 0; i < num_pages; i++)
+ eb->folios[i] = page_folio(page_array[i]);
+ return 0;
+}
+
static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
struct page *page, u64 disk_bytenr,
unsigned int pg_offset)
@@ -861,9 +895,9 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
} while (size);
}
-static int attach_extent_buffer_page(struct extent_buffer *eb,
- struct page *page,
- struct btrfs_subpage *prealloc)
+static int attach_extent_buffer_folio(struct extent_buffer *eb,
+ struct folio *folio,
+ struct btrfs_subpage *prealloc)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
int ret = 0;
@@ -874,65 +908,66 @@ static int attach_extent_buffer_page(struct extent_buffer *eb,
* For cloned or dummy extent buffers, their pages are not mapped and
* will not race with any other ebs.
*/
- if (page->mapping)
- lockdep_assert_held(&page->mapping->private_lock);
+ if (folio->mapping)
+ lockdep_assert_held(&folio->mapping->i_private_lock);
if (fs_info->nodesize >= PAGE_SIZE) {
- if (!PagePrivate(page))
- attach_page_private(page, eb);
+ if (!folio_test_private(folio))
+ folio_attach_private(folio, eb);
else
- WARN_ON(page->private != (unsigned long)eb);
+ WARN_ON(folio_get_private(folio) != eb);
return 0;
}
/* Already mapped, just free prealloc */
- if (PagePrivate(page)) {
+ if (folio_test_private(folio)) {
btrfs_free_subpage(prealloc);
return 0;
}
if (prealloc)
/* Has preallocated memory for subpage */
- attach_page_private(page, prealloc);
+ folio_attach_private(folio, prealloc);
else
/* Do new allocation to attach subpage */
- ret = btrfs_attach_subpage(fs_info, page,
- BTRFS_SUBPAGE_METADATA);
+ ret = btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
return ret;
}
int set_page_extent_mapped(struct page *page)
{
+ struct folio *folio = page_folio(page);
struct btrfs_fs_info *fs_info;
ASSERT(page->mapping);
- if (PagePrivate(page))
+ if (folio_test_private(folio))
return 0;
fs_info = btrfs_sb(page->mapping->host->i_sb);
- if (btrfs_is_subpage(fs_info, page))
- return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
+ if (btrfs_is_subpage(fs_info, page->mapping))
+ return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA);
- attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
+ folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE);
return 0;
}
void clear_page_extent_mapped(struct page *page)
{
+ struct folio *folio = page_folio(page);
struct btrfs_fs_info *fs_info;
ASSERT(page->mapping);
- if (!PagePrivate(page))
+ if (!folio_test_private(folio))
return;
fs_info = btrfs_sb(page->mapping->host->i_sb);
- if (btrfs_is_subpage(fs_info, page))
- return btrfs_detach_subpage(fs_info, page);
+ if (btrfs_is_subpage(fs_info, page->mapping))
+ return btrfs_detach_subpage(fs_info, folio);
- detach_page_private(page);
+ folio_detach_private(folio);
}
static struct extent_map *
@@ -1001,7 +1036,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
memzero_page(page, zero_offset, iosize);
}
}
- bio_ctrl->end_io_func = end_bio_extent_readpage;
+ bio_ctrl->end_io_func = end_bbio_data_read;
begin_page_read(fs_info, page);
while (cur <= end) {
enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE;
@@ -1027,8 +1062,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
BUG_ON(extent_map_end(em) <= cur);
BUG_ON(end < cur);
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
- compress_type = em->compress_type;
+ compress_type = extent_map_compression(em);
iosize = min(extent_map_end(em) - cur, end - cur + 1);
iosize = ALIGN(iosize, blocksize);
@@ -1037,7 +1071,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
else
disk_bytenr = em->block_start + extent_offset;
block_start = em->block_start;
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ if (em->flags & EXTENT_FLAG_PREALLOC)
block_start = EXTENT_MAP_HOLE;
/*
@@ -1074,7 +1108,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
* is a corner case so we prioritize correctness over
* non-optimal behavior (submitting 2 bios for the same extent).
*/
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
+ if (compress_type != BTRFS_COMPRESS_NONE &&
prev_em_start && *prev_em_start != (u64)-1 &&
*prev_em_start != em->start)
force_bio_submit = true;
@@ -1240,7 +1274,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
struct page *page, u64 *start, u64 *end)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct folio *folio = page_folio(page);
+ struct btrfs_subpage *subpage = folio_get_private(folio);
struct btrfs_subpage_info *spi = fs_info->subpage_info;
u64 orig_start = *start;
/* Declare as unsigned long so we can use bitmap ops */
@@ -1252,7 +1287,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
* For regular sector size == page size case, since one page only
* contains one sector, we return the page offset directly.
*/
- if (!btrfs_is_subpage(fs_info, page)) {
+ if (!btrfs_is_subpage(fs_info, page->mapping)) {
*start = page_offset(page);
*end = page_offset(page) + PAGE_SIZE;
return;
@@ -1305,7 +1340,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
return 1;
}
- bio_ctrl->end_io_func = end_bio_extent_writepage;
+ bio_ctrl->end_io_func = end_bbio_data_write;
while (cur <= end) {
u32 len = end - cur + 1;
u64 disk_bytenr;
@@ -1325,7 +1360,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* writeback the sectors with subpage dirty bits,
* causing writeback without ordered extent.
*/
- btrfs_page_clear_dirty(fs_info, page, cur, len);
+ btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, len);
break;
}
@@ -1352,7 +1387,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
block_start = em->block_start;
disk_bytenr = em->block_start + extent_offset;
- ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+ ASSERT(!extent_map_is_compressed(em));
ASSERT(block_start != EXTENT_MAP_HOLE);
ASSERT(block_start != EXTENT_MAP_INLINE);
@@ -1377,7 +1412,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* So clear subpage dirty bit here so next time we won't submit
* page for range already written to disk.
*/
- btrfs_page_clear_dirty(fs_info, page, cur, iosize);
+ btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize);
submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
cur - page_offset(page));
@@ -1385,7 +1420,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
nr++;
}
- btrfs_page_assert_not_dirty(fs_info, page);
+ btrfs_folio_assert_not_dirty(fs_info, page_folio(page));
*nr_ret = nr;
return 0;
@@ -1607,24 +1642,23 @@ static struct extent_buffer *find_extent_buffer_nolock(
return NULL;
}
-static void extent_buffer_write_end_io(struct btrfs_bio *bbio)
+static void end_bbio_meta_write(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
struct btrfs_fs_info *fs_info = eb->fs_info;
bool uptodate = !bbio->bio.bi_status;
- struct bvec_iter_all iter_all;
- struct bio_vec *bvec;
+ struct folio_iter fi;
u32 bio_offset = 0;
if (!uptodate)
set_btree_ioerr(eb);
- bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+ bio_for_each_folio_all(fi, &bbio->bio) {
u64 start = eb->start + bio_offset;
- struct page *page = bvec->bv_page;
- u32 len = bvec->bv_len;
+ struct folio *folio = fi.folio;
+ u32 len = fi.length;
- btrfs_page_clear_writeback(fs_info, page, start, len);
+ btrfs_folio_clear_writeback(fs_info, folio, start, len);
bio_offset += len;
}
@@ -1673,36 +1707,44 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc),
- eb->fs_info, extent_buffer_write_end_io, eb);
+ eb->fs_info, end_bbio_meta_write, eb);
bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
wbc_init_bio(wbc, &bbio->bio);
bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
bbio->file_offset = eb->start;
if (fs_info->nodesize < PAGE_SIZE) {
- struct page *p = eb->pages[0];
+ struct folio *folio = eb->folios[0];
+ bool ret;
- lock_page(p);
- btrfs_subpage_set_writeback(fs_info, p, eb->start, eb->len);
- if (btrfs_subpage_clear_and_test_dirty(fs_info, p, eb->start,
+ folio_lock(folio);
+ btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len);
+ if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start,
eb->len)) {
- clear_page_dirty_for_io(p);
+ folio_clear_dirty_for_io(folio);
wbc->nr_to_write--;
}
- __bio_add_page(&bbio->bio, p, eb->len, eb->start - page_offset(p));
- wbc_account_cgroup_owner(wbc, p, eb->len);
- unlock_page(p);
+ ret = bio_add_folio(&bbio->bio, folio, eb->len,
+ eb->start - folio_pos(folio));
+ ASSERT(ret);
+ wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len);
+ folio_unlock(folio);
} else {
- for (int i = 0; i < num_extent_pages(eb); i++) {
- struct page *p = eb->pages[i];
-
- lock_page(p);
- clear_page_dirty_for_io(p);
- set_page_writeback(p);
- __bio_add_page(&bbio->bio, p, PAGE_SIZE, 0);
- wbc_account_cgroup_owner(wbc, p, PAGE_SIZE);
- wbc->nr_to_write--;
- unlock_page(p);
+ int num_folios = num_extent_folios(eb);
+
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+ bool ret;
+
+ folio_lock(folio);
+ folio_clear_dirty_for_io(folio);
+ folio_start_writeback(folio);
+ ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0);
+ ASSERT(ret);
+ wbc_account_cgroup_owner(wbc, folio_page(folio, 0),
+ folio_size(folio));
+ wbc->nr_to_write -= folio_nr_pages(folio);
+ folio_unlock(folio);
}
}
btrfs_submit_bio(bbio, 0);
@@ -1725,6 +1767,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct folio *folio = page_folio(page);
int submitted = 0;
u64 page_start = page_offset(page);
int bit_start = 0;
@@ -1732,7 +1775,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
/* Lock and write each dirty extent buffers in the range */
while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
struct extent_buffer *eb;
unsigned long flags;
u64 start;
@@ -1741,16 +1784,16 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
* Take private lock to ensure the subpage won't be detached
* in the meantime.
*/
- spin_lock(&page->mapping->private_lock);
- if (!PagePrivate(page)) {
- spin_unlock(&page->mapping->private_lock);
+ spin_lock(&page->mapping->i_private_lock);
+ if (!folio_test_private(folio)) {
+ spin_unlock(&page->mapping->i_private_lock);
break;
}
spin_lock_irqsave(&subpage->lock, flags);
if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
subpage->bitmaps)) {
spin_unlock_irqrestore(&subpage->lock, flags);
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->i_private_lock);
bit_start++;
continue;
}
@@ -1764,7 +1807,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
*/
eb = find_extent_buffer_nolock(fs_info, start);
spin_unlock_irqrestore(&subpage->lock, flags);
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->i_private_lock);
/*
* The eb has already reached 0 refs thus find_extent_buffer()
@@ -1807,38 +1850,39 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx)
{
struct writeback_control *wbc = ctx->wbc;
struct address_space *mapping = page->mapping;
+ struct folio *folio = page_folio(page);
struct extent_buffer *eb;
int ret;
- if (!PagePrivate(page))
+ if (!folio_test_private(folio))
return 0;
if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
return submit_eb_subpage(page, wbc);
- spin_lock(&mapping->private_lock);
- if (!PagePrivate(page)) {
- spin_unlock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
+ if (!folio_test_private(folio)) {
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
- eb = (struct extent_buffer *)page->private;
+ eb = folio_get_private(folio);
/*
* Shouldn't happen and normally this would be a BUG_ON but no point
* crashing the machine for something we can survive anyway.
*/
if (WARN_ON(!eb)) {
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
if (eb == ctx->eb) {
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
ret = atomic_inc_not_zero(&eb->refs);
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
if (!ret)
return 0;
@@ -2201,7 +2245,7 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page,
cur, cur_len, !ret);
mapping_set_error(page->mapping, ret);
}
- btrfs_page_unlock_writer(fs_info, page, cur, cur_len);
+ btrfs_folio_unlock_writer(fs_info, page_folio(page), cur, cur_len);
if (ret < 0)
found_error = true;
next_page:
@@ -2352,7 +2396,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
write_unlock(&map->lock);
break;
}
- if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
+ if ((em->flags & EXTENT_FLAG_PINNED) ||
em->start != start) {
write_unlock(&map->lock);
free_extent_map(em);
@@ -2369,7 +2413,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
* extra reference on the em.
*/
if (list_empty(&em->list) ||
- test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ (em->flags & EXTENT_FLAG_LOGGING))
goto remove_em;
/*
* If it's in the list of modified extents, remove it
@@ -2436,6 +2480,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
struct fiemap_cache *cache,
u64 offset, u64 phys, u64 len, u32 flags)
{
+ u64 cache_end;
int ret = 0;
/* Set at the end of extent_fiemap(). */
@@ -2445,15 +2490,102 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
goto assign;
/*
- * Sanity check, extent_fiemap() should have ensured that new
- * fiemap extent won't overlap with cached one.
- * Not recoverable.
+ * When iterating the extents of the inode, at extent_fiemap(), we may
+ * find an extent that starts at an offset behind the end offset of the
+ * previous extent we processed. This happens if fiemap is called
+ * without FIEMAP_FLAG_SYNC and there are ordered extents completing
+ * while we call btrfs_next_leaf() (through fiemap_next_leaf_item()).
*
- * NOTE: Physical address can overlap, due to compression
+ * For example we are in leaf X processing its last item, which is the
+ * file extent item for file range [512K, 1M[, and after
+ * btrfs_next_leaf() releases the path, there's an ordered extent that
+ * completes for the file range [768K, 2M[, and that results in trimming
+ * the file extent item so that it now corresponds to the file range
+ * [512K, 768K[ and a new file extent item is inserted for the file
+ * range [768K, 2M[, which may end up as the last item of leaf X or as
+ * the first item of the next leaf - in either case btrfs_next_leaf()
+ * will leave us with a path pointing to the new extent item, for the
+ * file range [768K, 2M[, since that's the first key that follows the
+ * last one we processed. So in order not to report overlapping extents
+ * to user space, we trim the length of the previously cached extent and
+ * emit it.
+ *
+ * Upon calling btrfs_next_leaf() we may also find an extent with an
+ * offset smaller than or equals to cache->offset, and this happens
+ * when we had a hole or prealloc extent with several delalloc ranges in
+ * it, but after btrfs_next_leaf() released the path, delalloc was
+ * flushed and the resulting ordered extents were completed, so we can
+ * now have found a file extent item for an offset that is smaller than
+ * or equals to what we have in cache->offset. We deal with this as
+ * described below.
*/
- if (cache->offset + cache->len > offset) {
- WARN_ON(1);
- return -EINVAL;
+ cache_end = cache->offset + cache->len;
+ if (cache_end > offset) {
+ if (offset == cache->offset) {
+ /*
+ * We cached a dealloc range (found in the io tree) for
+ * a hole or prealloc extent and we have now found a
+ * file extent item for the same offset. What we have
+ * now is more recent and up to date, so discard what
+ * we had in the cache and use what we have just found.
+ */
+ goto assign;
+ } else if (offset > cache->offset) {
+ /*
+ * The extent range we previously found ends after the
+ * offset of the file extent item we found and that
+ * offset falls somewhere in the middle of that previous
+ * extent range. So adjust the range we previously found
+ * to end at the offset of the file extent item we have
+ * just found, since this extent is more up to date.
+ * Emit that adjusted range and cache the file extent
+ * item we have just found. This corresponds to the case
+ * where a previously found file extent item was split
+ * due to an ordered extent completing.
+ */
+ cache->len = offset - cache->offset;
+ goto emit;
+ } else {
+ const u64 range_end = offset + len;
+
+ /*
+ * The offset of the file extent item we have just found
+ * is behind the cached offset. This means we were
+ * processing a hole or prealloc extent for which we
+ * have found delalloc ranges (in the io tree), so what
+ * we have in the cache is the last delalloc range we
+ * found while the file extent item we found can be
+ * either for a whole delalloc range we previously
+ * emmitted or only a part of that range.
+ *
+ * We have two cases here:
+ *
+ * 1) The file extent item's range ends at or behind the
+ * cached extent's end. In this case just ignore the
+ * current file extent item because we don't want to
+ * overlap with previous ranges that may have been
+ * emmitted already;
+ *
+ * 2) The file extent item starts behind the currently
+ * cached extent but its end offset goes beyond the
+ * end offset of the cached extent. We don't want to
+ * overlap with a previous range that may have been
+ * emmitted already, so we emit the currently cached
+ * extent and then partially store the current file
+ * extent item's range in the cache, for the subrange
+ * going the cached extent's end to the end of the
+ * file extent item.
+ */
+ if (range_end <= cache_end)
+ return 0;
+
+ if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
+ phys += cache_end - offset;
+
+ offset = cache_end;
+ len = range_end - cache_end;
+ goto emit;
+ }
}
/*
@@ -2473,6 +2605,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
return 0;
}
+emit:
/* Not mergeable, need to submit cached one */
ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
cache->len, cache->flags);
@@ -2645,16 +2778,34 @@ static int fiemap_process_hole(struct btrfs_inode *inode,
* it beyond i_size.
*/
while (cur_offset < end && cur_offset < i_size) {
+ struct extent_state *cached_state = NULL;
u64 delalloc_start;
u64 delalloc_end;
u64 prealloc_start;
+ u64 lockstart;
+ u64 lockend;
u64 prealloc_len = 0;
bool delalloc;
+ lockstart = round_down(cur_offset, inode->root->fs_info->sectorsize);
+ lockend = round_up(end, inode->root->fs_info->sectorsize);
+
+ /*
+ * We are only locking for the delalloc range because that's the
+ * only thing that can change here. With fiemap we have a lock
+ * on the inode, so no buffered or direct writes can happen.
+ *
+ * However mmaps and normal page writeback will cause this to
+ * change arbitrarily. We have to lock the extent lock here to
+ * make sure that nobody messes with the tree while we're doing
+ * btrfs_find_delalloc_in_range.
+ */
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
delalloc_cached_state,
&delalloc_start,
&delalloc_end);
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
if (!delalloc)
break;
@@ -2822,15 +2973,15 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
const u64 ino = btrfs_ino(inode);
- struct extent_state *cached_state = NULL;
struct extent_state *delalloc_cached_state = NULL;
struct btrfs_path *path;
struct fiemap_cache cache = { 0 };
struct btrfs_backref_share_check_ctx *backref_ctx;
u64 last_extent_end;
u64 prev_extent_end;
- u64 lockstart;
- u64 lockend;
+ u64 range_start;
+ u64 range_end;
+ const u64 sectorsize = inode->root->fs_info->sectorsize;
bool stopped = false;
int ret;
@@ -2841,22 +2992,19 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
goto out;
}
- lockstart = round_down(start, inode->root->fs_info->sectorsize);
- lockend = round_up(start + len, inode->root->fs_info->sectorsize);
- prev_extent_end = lockstart;
-
- btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
- lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ range_start = round_down(start, sectorsize);
+ range_end = round_up(start + len, sectorsize);
+ prev_extent_end = range_start;
ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
if (ret < 0)
- goto out_unlock;
+ goto out;
btrfs_release_path(path);
path->reada = READA_FORWARD;
- ret = fiemap_search_slot(inode, path, lockstart);
+ ret = fiemap_search_slot(inode, path, range_start);
if (ret < 0) {
- goto out_unlock;
+ goto out;
} else if (ret > 0) {
/*
* No file extent item found, but we may have delalloc between
@@ -2866,7 +3014,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
goto check_eof_delalloc;
}
- while (prev_extent_end < lockend) {
+ while (prev_extent_end < range_end) {
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_file_extent_item *ei;
struct btrfs_key key;
@@ -2889,21 +3037,21 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
* The first iteration can leave us at an extent item that ends
* before our range's start. Move to the next item.
*/
- if (extent_end <= lockstart)
+ if (extent_end <= range_start)
goto next_item;
backref_ctx->curr_leaf_bytenr = leaf->start;
/* We have in implicit hole (NO_HOLES feature enabled). */
if (prev_extent_end < key.offset) {
- const u64 range_end = min(key.offset, lockend) - 1;
+ const u64 hole_end = min(key.offset, range_end) - 1;
ret = fiemap_process_hole(inode, fieinfo, &cache,
&delalloc_cached_state,
backref_ctx, 0, 0, 0,
- prev_extent_end, range_end);
+ prev_extent_end, hole_end);
if (ret < 0) {
- goto out_unlock;
+ goto out;
} else if (ret > 0) {
/* fiemap_fill_next_extent() told us to stop. */
stopped = true;
@@ -2911,7 +3059,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
}
/* We've reached the end of the fiemap range, stop. */
- if (key.offset >= lockend) {
+ if (key.offset >= range_end) {
stopped = true;
break;
}
@@ -2959,7 +3107,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
extent_gen,
backref_ctx);
if (ret < 0)
- goto out_unlock;
+ goto out;
else if (ret > 0)
flags |= FIEMAP_EXTENT_SHARED;
}
@@ -2970,7 +3118,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
}
if (ret < 0) {
- goto out_unlock;
+ goto out;
} else if (ret > 0) {
/* fiemap_fill_next_extent() told us to stop. */
stopped = true;
@@ -2981,12 +3129,12 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
next_item:
if (fatal_signal_pending(current)) {
ret = -EINTR;
- goto out_unlock;
+ goto out;
}
ret = fiemap_next_leaf_item(inode, path);
if (ret < 0) {
- goto out_unlock;
+ goto out;
} else if (ret > 0) {
/* No more file extent items for this inode. */
break;
@@ -3005,29 +3153,41 @@ check_eof_delalloc:
btrfs_free_path(path);
path = NULL;
- if (!stopped && prev_extent_end < lockend) {
+ if (!stopped && prev_extent_end < range_end) {
ret = fiemap_process_hole(inode, fieinfo, &cache,
&delalloc_cached_state, backref_ctx,
- 0, 0, 0, prev_extent_end, lockend - 1);
+ 0, 0, 0, prev_extent_end, range_end - 1);
if (ret < 0)
- goto out_unlock;
- prev_extent_end = lockend;
+ goto out;
+ prev_extent_end = range_end;
}
if (cache.cached && cache.offset + cache.len >= last_extent_end) {
const u64 i_size = i_size_read(&inode->vfs_inode);
if (prev_extent_end < i_size) {
+ struct extent_state *cached_state = NULL;
u64 delalloc_start;
u64 delalloc_end;
+ u64 lockstart;
+ u64 lockend;
bool delalloc;
+ lockstart = round_down(prev_extent_end, sectorsize);
+ lockend = round_up(i_size, sectorsize);
+
+ /*
+ * See the comment in fiemap_process_hole as to why
+ * we're doing the locking here.
+ */
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
delalloc = btrfs_find_delalloc_in_range(inode,
prev_extent_end,
i_size - 1,
&delalloc_cached_state,
&delalloc_start,
&delalloc_end);
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
if (!delalloc)
cache.flags |= FIEMAP_EXTENT_LAST;
} else {
@@ -3036,10 +3196,6 @@ check_eof_delalloc:
}
ret = emit_last_fiemap_cache(fieinfo, &cache);
-
-out_unlock:
- unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
- btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
out:
free_extent_state(delalloc_cached_state);
btrfs_free_backref_share_ctx(backref_ctx);
@@ -3058,14 +3214,14 @@ static int extent_buffer_under_io(const struct extent_buffer *eb)
test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}
-static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
+static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio)
{
struct btrfs_subpage *subpage;
- lockdep_assert_held(&page->mapping->private_lock);
+ lockdep_assert_held(&folio->mapping->i_private_lock);
- if (PagePrivate(page)) {
- subpage = (struct btrfs_subpage *)page->private;
+ if (folio_test_private(folio)) {
+ subpage = folio_get_private(folio);
if (atomic_read(&subpage->eb_refs))
return true;
/*
@@ -3078,21 +3234,21 @@ static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
return false;
}
-static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
+static void detach_extent_buffer_folio(struct extent_buffer *eb, struct folio *folio)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
/*
- * For mapped eb, we're going to change the page private, which should
- * be done under the private_lock.
+ * For mapped eb, we're going to change the folio private, which should
+ * be done under the i_private_lock.
*/
if (mapped)
- spin_lock(&page->mapping->private_lock);
+ spin_lock(&folio->mapping->i_private_lock);
- if (!PagePrivate(page)) {
+ if (!folio_test_private(folio)) {
if (mapped)
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
return;
}
@@ -3101,66 +3257,58 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag
* We do this since we'll remove the pages after we've
* removed the eb from the radix tree, so we could race
* and have this page now attached to the new eb. So
- * only clear page_private if it's still connected to
+ * only clear folio if it's still connected to
* this eb.
*/
- if (PagePrivate(page) &&
- page->private == (unsigned long)eb) {
+ if (folio_test_private(folio) && folio_get_private(folio) == eb) {
BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
- BUG_ON(PageDirty(page));
- BUG_ON(PageWriteback(page));
- /*
- * We need to make sure we haven't be attached
- * to a new eb.
- */
- detach_page_private(page);
+ BUG_ON(folio_test_dirty(folio));
+ BUG_ON(folio_test_writeback(folio));
+ /* We need to make sure we haven't be attached to a new eb. */
+ folio_detach_private(folio);
}
if (mapped)
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
return;
}
/*
- * For subpage, we can have dummy eb with page private. In this case,
- * we can directly detach the private as such page is only attached to
- * one dummy eb, no sharing.
+ * For subpage, we can have dummy eb with folio private attached. In
+ * this case, we can directly detach the private as such folio is only
+ * attached to one dummy eb, no sharing.
*/
if (!mapped) {
- btrfs_detach_subpage(fs_info, page);
+ btrfs_detach_subpage(fs_info, folio);
return;
}
- btrfs_page_dec_eb_refs(fs_info, page);
+ btrfs_folio_dec_eb_refs(fs_info, folio);
/*
- * We can only detach the page private if there are no other ebs in the
+ * We can only detach the folio private if there are no other ebs in the
* page range and no unfinished IO.
*/
- if (!page_range_has_eb(fs_info, page))
- btrfs_detach_subpage(fs_info, page);
+ if (!folio_range_has_eb(fs_info, folio))
+ btrfs_detach_subpage(fs_info, folio);
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
}
/* Release all pages attached to the extent buffer */
static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
{
- int i;
- int num_pages;
-
ASSERT(!extent_buffer_under_io(eb));
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- struct page *page = eb->pages[i];
+ for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) {
+ struct folio *folio = eb->folios[i];
- if (!page)
+ if (!folio)
continue;
- detach_extent_buffer_page(eb, page);
+ detach_extent_buffer_folio(eb, folio);
- /* One for when we allocated the page */
- put_page(page);
+ /* One for when we allocated the folio. */
+ folio_put(folio);
}
}
@@ -3198,9 +3346,8 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
- int i;
struct extent_buffer *new;
- int num_pages = num_extent_pages(src);
+ int num_folios = num_extent_folios(src);
int ret;
new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
@@ -3214,22 +3361,22 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
*/
set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
- ret = btrfs_alloc_page_array(num_pages, new->pages);
+ ret = alloc_eb_folio_array(new, 0);
if (ret) {
btrfs_release_extent_buffer(new);
return NULL;
}
- for (i = 0; i < num_pages; i++) {
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = new->folios[i];
int ret;
- struct page *p = new->pages[i];
- ret = attach_extent_buffer_page(new, p, NULL);
+ ret = attach_extent_buffer_folio(new, folio, NULL);
if (ret < 0) {
btrfs_release_extent_buffer(new);
return NULL;
}
- WARN_ON(PageDirty(p));
+ WARN_ON(folio_test_dirty(folio));
}
copy_extent_buffer_full(new, src);
set_extent_buffer_uptodate(new);
@@ -3241,23 +3388,20 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len)
{
struct extent_buffer *eb;
- int num_pages;
- int i;
+ int num_folios = 0;
int ret;
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return NULL;
- num_pages = num_extent_pages(eb);
- ret = btrfs_alloc_page_array(num_pages, eb->pages);
+ ret = alloc_eb_folio_array(eb, 0);
if (ret)
goto err;
- for (i = 0; i < num_pages; i++) {
- struct page *p = eb->pages[i];
-
- ret = attach_extent_buffer_page(eb, p, NULL);
+ num_folios = num_extent_folios(eb);
+ for (int i = 0; i < num_folios; i++) {
+ ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL);
if (ret < 0)
goto err;
}
@@ -3268,10 +3412,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
err:
- for (i = 0; i < num_pages; i++) {
- if (eb->pages[i]) {
- detach_extent_buffer_page(eb, eb->pages[i]);
- __free_page(eb->pages[i]);
+ for (int i = 0; i < num_folios; i++) {
+ if (eb->folios[i]) {
+ detach_extent_buffer_folio(eb, eb->folios[i]);
+ __folio_put(eb->folios[i]);
}
}
__free_extent_buffer(eb);
@@ -3320,20 +3464,14 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
spin_unlock(&eb->refs_lock);
}
-static void mark_extent_buffer_accessed(struct extent_buffer *eb,
- struct page *accessed)
+static void mark_extent_buffer_accessed(struct extent_buffer *eb)
{
- int num_pages, i;
+ int num_folios= num_extent_folios(eb);
check_buffer_tree_ref(eb);
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- struct page *p = eb->pages[i];
-
- if (p != accessed)
- mark_page_accessed(p);
- }
+ for (int i = 0; i < num_folios; i++)
+ folio_mark_accessed(eb->folios[i]);
}
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
@@ -3361,7 +3499,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
spin_lock(&eb->refs_lock);
spin_unlock(&eb->refs_lock);
}
- mark_extent_buffer_accessed(eb, NULL);
+ mark_extent_buffer_accessed(eb);
return eb;
}
@@ -3410,6 +3548,7 @@ free_eb:
static struct extent_buffer *grab_extent_buffer(
struct btrfs_fs_info *fs_info, struct page *page)
{
+ struct folio *folio = page_folio(page);
struct extent_buffer *exists;
/*
@@ -3421,21 +3560,21 @@ static struct extent_buffer *grab_extent_buffer(
return NULL;
/* Page not yet attached to an extent buffer */
- if (!PagePrivate(page))
+ if (!folio_test_private(folio))
return NULL;
/*
* We could have already allocated an eb for this page and attached one
* so lets see if we can get a ref on the existing eb, and if we can we
* know it's good and we can just return that one, else we know we can
- * just overwrite page->private.
+ * just overwrite folio private.
*/
- exists = (struct extent_buffer *)page->private;
+ exists = folio_get_private(folio);
if (atomic_inc_not_zero(&exists->refs))
return exists;
WARN_ON(PageDirty(page));
- detach_page_private(page);
+ folio_detach_private(folio);
return NULL;
}
@@ -3469,19 +3608,88 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
return 0;
}
+
+/*
+ * Return 0 if eb->folios[i] is attached to btree inode successfully.
+ * Return >0 if there is already another extent buffer for the range,
+ * and @found_eb_ret would be updated.
+ * Return -EAGAIN if the filemap has an existing folio but with different size
+ * than @eb.
+ * The caller needs to free the existing folios and retry using the same order.
+ */
+static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
+ struct extent_buffer **found_eb_ret)
+{
+
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct address_space *mapping = fs_info->btree_inode->i_mapping;
+ const unsigned long index = eb->start >> PAGE_SHIFT;
+ struct folio *existing_folio;
+ int ret;
+
+ ASSERT(found_eb_ret);
+
+ /* Caller should ensure the folio exists. */
+ ASSERT(eb->folios[i]);
+
+retry:
+ ret = filemap_add_folio(mapping, eb->folios[i], index + i,
+ GFP_NOFS | __GFP_NOFAIL);
+ if (!ret)
+ return 0;
+
+ existing_folio = filemap_lock_folio(mapping, index + i);
+ /* The page cache only exists for a very short time, just retry. */
+ if (IS_ERR(existing_folio))
+ goto retry;
+
+ /* For now, we should only have single-page folios for btree inode. */
+ ASSERT(folio_nr_pages(existing_folio) == 1);
+
+ if (folio_size(existing_folio) != folio_size(eb->folios[0])) {
+ folio_unlock(existing_folio);
+ folio_put(existing_folio);
+ return -EAGAIN;
+ }
+
+ if (fs_info->nodesize < PAGE_SIZE) {
+ /*
+ * We're going to reuse the existing page, can drop our page
+ * and subpage structure now.
+ */
+ __free_page(folio_page(eb->folios[i], 0));
+ eb->folios[i] = existing_folio;
+ } else {
+ struct extent_buffer *existing_eb;
+
+ existing_eb = grab_extent_buffer(fs_info,
+ folio_page(existing_folio, 0));
+ if (existing_eb) {
+ /* The extent buffer still exists, we can use it directly. */
+ *found_eb_ret = existing_eb;
+ folio_unlock(existing_folio);
+ folio_put(existing_folio);
+ return 1;
+ }
+ /* The extent buffer no longer exists, we can reuse the folio. */
+ __free_page(folio_page(eb->folios[i], 0));
+ eb->folios[i] = existing_folio;
+ }
+ return 0;
+}
+
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level)
{
unsigned long len = fs_info->nodesize;
- int num_pages;
- int i;
- unsigned long index = start >> PAGE_SHIFT;
+ int num_folios;
+ int attached = 0;
struct extent_buffer *eb;
- struct extent_buffer *exists = NULL;
- struct page *p;
+ struct extent_buffer *existing_eb = NULL;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct btrfs_subpage *prealloc = NULL;
u64 lockdep_owner = owner_root;
+ bool page_contig = true;
int uptodate = 1;
int ret;
@@ -3516,11 +3724,9 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);
- num_pages = num_extent_pages(eb);
-
/*
- * Preallocate page->private for subpage case, so that we won't
- * allocate memory with private_lock nor page lock hold.
+ * Preallocate folio private for subpage case, so that we won't
+ * allocate memory with i_private_lock nor page lock hold.
*
* The memory will be freed by attach_extent_buffer_page() or freed
* manually if we exit earlier.
@@ -3528,47 +3734,89 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
if (fs_info->nodesize < PAGE_SIZE) {
prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
if (IS_ERR(prealloc)) {
- exists = ERR_CAST(prealloc);
- goto free_eb;
+ ret = PTR_ERR(prealloc);
+ goto out;
}
}
- for (i = 0; i < num_pages; i++, index++) {
- p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
- if (!p) {
- exists = ERR_PTR(-ENOMEM);
- btrfs_free_subpage(prealloc);
- goto free_eb;
+reallocate:
+ /* Allocate all pages first. */
+ ret = alloc_eb_folio_array(eb, __GFP_NOFAIL);
+ if (ret < 0) {
+ btrfs_free_subpage(prealloc);
+ goto out;
+ }
+
+ num_folios = num_extent_folios(eb);
+ /* Attach all pages to the filemap. */
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio;
+
+ ret = attach_eb_folio_to_filemap(eb, i, &existing_eb);
+ if (ret > 0) {
+ ASSERT(existing_eb);
+ goto out;
}
- spin_lock(&mapping->private_lock);
- exists = grab_extent_buffer(fs_info, p);
- if (exists) {
- spin_unlock(&mapping->private_lock);
- unlock_page(p);
- put_page(p);
- mark_extent_buffer_accessed(exists, p);
- btrfs_free_subpage(prealloc);
- goto free_eb;
+ /*
+ * TODO: Special handling for a corner case where the order of
+ * folios mismatch between the new eb and filemap.
+ *
+ * This happens when:
+ *
+ * - the new eb is using higher order folio
+ *
+ * - the filemap is still using 0-order folios for the range
+ * This can happen at the previous eb allocation, and we don't
+ * have higher order folio for the call.
+ *
+ * - the existing eb has already been freed
+ *
+ * In this case, we have to free the existing folios first, and
+ * re-allocate using the same order.
+ * Thankfully this is not going to happen yet, as we're still
+ * using 0-order folios.
+ */
+ if (unlikely(ret == -EAGAIN)) {
+ ASSERT(0);
+ goto reallocate;
}
+ attached++;
+
+ /*
+ * Only after attach_eb_folio_to_filemap(), eb->folios[] is
+ * reliable, as we may choose to reuse the existing page cache
+ * and free the allocated page.
+ */
+ folio = eb->folios[i];
+ spin_lock(&mapping->i_private_lock);
/* Should not fail, as we have preallocated the memory */
- ret = attach_extent_buffer_page(eb, p, prealloc);
+ ret = attach_extent_buffer_folio(eb, folio, prealloc);
ASSERT(!ret);
/*
* To inform we have extra eb under allocation, so that
- * detach_extent_buffer_page() won't release the page private
+ * detach_extent_buffer_page() won't release the folio private
* when the eb hasn't yet been inserted into radix tree.
*
* The ref will be decreased when the eb released the page, in
* detach_extent_buffer_page().
* Thus needs no special handling in error path.
*/
- btrfs_page_inc_eb_refs(fs_info, p);
- spin_unlock(&mapping->private_lock);
+ btrfs_folio_inc_eb_refs(fs_info, folio);
+ spin_unlock(&mapping->i_private_lock);
+
+ WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len));
- WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
- eb->pages[i] = p;
- if (!btrfs_page_test_uptodate(fs_info, p, eb->start, eb->len))
+ /*
+ * Check if the current page is physically contiguous with previous eb
+ * page.
+ * At this stage, either we allocated a large folio, thus @i
+ * would only be 0, or we fall back to per-page allocation.
+ */
+ if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0))
+ page_contig = false;
+
+ if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len))
uptodate = 0;
/*
@@ -3581,12 +3829,13 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
}
if (uptodate)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+ /* All pages are physically contiguous, can skip cross page handling. */
+ if (page_contig)
+ eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start);
again:
ret = radix_tree_preload(GFP_NOFS);
- if (ret) {
- exists = ERR_PTR(ret);
- goto free_eb;
- }
+ if (ret)
+ goto out;
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
@@ -3594,9 +3843,10 @@ again:
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
- exists = find_extent_buffer(fs_info, start);
- if (exists)
- goto free_eb;
+ ret = 0;
+ existing_eb = find_extent_buffer(fs_info, start);
+ if (existing_eb)
+ goto out;
else
goto again;
}
@@ -3609,19 +3859,46 @@ again:
* btree_release_folio will correctly detect that a page belongs to a
* live buffer and won't free them prematurely.
*/
- for (i = 0; i < num_pages; i++)
- unlock_page(eb->pages[i]);
+ for (int i = 0; i < num_folios; i++)
+ unlock_page(folio_page(eb->folios[i], 0));
return eb;
-free_eb:
+out:
WARN_ON(!atomic_dec_and_test(&eb->refs));
- for (i = 0; i < num_pages; i++) {
- if (eb->pages[i])
- unlock_page(eb->pages[i]);
+
+ /*
+ * Any attached folios need to be detached before we unlock them. This
+ * is because when we're inserting our new folios into the mapping, and
+ * then attaching our eb to that folio. If we fail to insert our folio
+ * we'll lookup the folio for that index, and grab that EB. We do not
+ * want that to grab this eb, as we're getting ready to free it. So we
+ * have to detach it first and then unlock it.
+ *
+ * We have to drop our reference and NULL it out here because in the
+ * subpage case detaching does a btrfs_folio_dec_eb_refs() for our eb.
+ * Below when we call btrfs_release_extent_buffer() we will call
+ * detach_extent_buffer_folio() on our remaining pages in the !subpage
+ * case. If we left eb->folios[i] populated in the subpage case we'd
+ * double put our reference and be super sad.
+ */
+ for (int i = 0; i < attached; i++) {
+ ASSERT(eb->folios[i]);
+ detach_extent_buffer_folio(eb, eb->folios[i]);
+ unlock_page(folio_page(eb->folios[i], 0));
+ folio_put(eb->folios[i]);
+ eb->folios[i] = NULL;
}
+ /*
+ * Now all pages of that extent buffer is unmapped, set UNMAPPED flag,
+ * so it can be cleaned up without utlizing page->mapping.
+ */
+ set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
btrfs_release_extent_buffer(eb);
- return exists;
+ if (ret < 0)
+ return ERR_PTR(ret);
+ ASSERT(existing_eb);
+ return existing_eb;
}
static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
@@ -3713,31 +3990,30 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
release_extent_buffer(eb);
}
-static void btree_clear_page_dirty(struct page *page)
+static void btree_clear_folio_dirty(struct folio *folio)
{
- ASSERT(PageDirty(page));
- ASSERT(PageLocked(page));
- clear_page_dirty_for_io(page);
- xa_lock_irq(&page->mapping->i_pages);
- if (!PageDirty(page))
- __xa_clear_mark(&page->mapping->i_pages,
- page_index(page), PAGECACHE_TAG_DIRTY);
- xa_unlock_irq(&page->mapping->i_pages);
+ ASSERT(folio_test_dirty(folio));
+ ASSERT(folio_test_locked(folio));
+ folio_clear_dirty_for_io(folio);
+ xa_lock_irq(&folio->mapping->i_pages);
+ if (!folio_test_dirty(folio))
+ __xa_clear_mark(&folio->mapping->i_pages,
+ folio_index(folio), PAGECACHE_TAG_DIRTY);
+ xa_unlock_irq(&folio->mapping->i_pages);
}
static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct page *page = eb->pages[0];
+ struct folio *folio = eb->folios[0];
bool last;
- /* btree_clear_page_dirty() needs page locked */
- lock_page(page);
- last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
- eb->len);
+ /* btree_clear_folio_dirty() needs page locked. */
+ folio_lock(folio);
+ last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len);
if (last)
- btree_clear_page_dirty(page);
- unlock_page(page);
+ btree_clear_folio_dirty(folio);
+ folio_unlock(folio);
WARN_ON(atomic_read(&eb->refs) == 0);
}
@@ -3745,15 +4021,27 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int i;
- int num_pages;
- struct page *page;
+ int num_folios;
btrfs_assert_tree_write_locked(eb);
if (trans && btrfs_header_generation(eb) != trans->transid)
return;
+ /*
+ * Instead of clearing the dirty flag off of the buffer, mark it as
+ * EXTENT_BUFFER_ZONED_ZEROOUT. This allows us to preserve
+ * write-ordering in zoned mode, without the need to later re-dirty
+ * the extent_buffer.
+ *
+ * The actual zeroout of the buffer will happen later in
+ * btree_csum_one_bio.
+ */
+ if (btrfs_is_zoned(fs_info)) {
+ set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags);
+ return;
+ }
+
if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
return;
@@ -3763,30 +4051,29 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
if (eb->fs_info->nodesize < PAGE_SIZE)
return clear_subpage_extent_buffer_dirty(eb);
- num_pages = num_extent_pages(eb);
+ num_folios = num_extent_folios(eb);
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
- for (i = 0; i < num_pages; i++) {
- page = eb->pages[i];
- if (!PageDirty(page))
+ if (!folio_test_dirty(folio))
continue;
- lock_page(page);
- btree_clear_page_dirty(page);
- unlock_page(page);
+ folio_lock(folio);
+ btree_clear_folio_dirty(folio);
+ folio_unlock(folio);
}
WARN_ON(atomic_read(&eb->refs) == 0);
}
void set_extent_buffer_dirty(struct extent_buffer *eb)
{
- int i;
- int num_pages;
+ int num_folios;
bool was_dirty;
check_buffer_tree_ref(eb);
was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
- num_pages = num_extent_pages(eb);
+ num_folios = num_extent_folios(eb);
WARN_ON(atomic_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
@@ -3805,34 +4092,32 @@ void set_extent_buffer_dirty(struct extent_buffer *eb)
* the above race.
*/
if (subpage)
- lock_page(eb->pages[0]);
- for (i = 0; i < num_pages; i++)
- btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
- eb->start, eb->len);
+ lock_page(folio_page(eb->folios[0], 0));
+ for (int i = 0; i < num_folios; i++)
+ btrfs_folio_set_dirty(eb->fs_info, eb->folios[i],
+ eb->start, eb->len);
if (subpage)
- unlock_page(eb->pages[0]);
+ unlock_page(folio_page(eb->folios[0], 0));
percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes,
eb->len,
eb->fs_info->dirty_metadata_batch);
}
#ifdef CONFIG_BTRFS_DEBUG
- for (i = 0; i < num_pages; i++)
- ASSERT(PageDirty(eb->pages[i]));
+ for (int i = 0; i < num_folios; i++)
+ ASSERT(folio_test_dirty(eb->folios[i]));
#endif
}
void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct page *page;
- int num_pages;
- int i;
+ int num_folios = num_extent_folios(eb);
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- page = eb->pages[i];
- if (!page)
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+
+ if (!folio)
continue;
/*
@@ -3840,44 +4125,40 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb)
* btrfs_is_subpage() can not handle cloned/dummy metadata.
*/
if (fs_info->nodesize >= PAGE_SIZE)
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
else
- btrfs_subpage_clear_uptodate(fs_info, page, eb->start,
- eb->len);
+ btrfs_subpage_clear_uptodate(fs_info, folio,
+ eb->start, eb->len);
}
}
void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct page *page;
- int num_pages;
- int i;
+ int num_folios = num_extent_folios(eb);
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- page = eb->pages[i];
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
/*
* This is special handling for metadata subpage, as regular
* btrfs_is_subpage() can not handle cloned/dummy metadata.
*/
if (fs_info->nodesize >= PAGE_SIZE)
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
else
- btrfs_subpage_set_uptodate(fs_info, page, eb->start,
- eb->len);
+ btrfs_subpage_set_uptodate(fs_info, folio,
+ eb->start, eb->len);
}
}
-static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
+static void end_bbio_meta_read(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
struct btrfs_fs_info *fs_info = eb->fs_info;
bool uptodate = !bbio->bio.bi_status;
- struct bvec_iter_all iter_all;
- struct bio_vec *bvec;
+ struct folio_iter fi;
u32 bio_offset = 0;
eb->read_mirror = bbio->mirror_num;
@@ -3893,15 +4174,15 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
}
- bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+ bio_for_each_folio_all(fi, &bbio->bio) {
+ struct folio *folio = fi.folio;
u64 start = eb->start + bio_offset;
- struct page *page = bvec->bv_page;
- u32 len = bvec->bv_len;
+ u32 len = fi.length;
if (uptodate)
- btrfs_page_set_uptodate(fs_info, page, start, len);
+ btrfs_folio_set_uptodate(fs_info, folio, start, len);
else
- btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_folio_clear_uptodate(fs_info, folio, start, len);
bio_offset += len;
}
@@ -3917,8 +4198,8 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
struct btrfs_tree_parent_check *check)
{
- int num_pages = num_extent_pages(eb), i;
struct btrfs_bio *bbio;
+ bool ret;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -3942,17 +4223,24 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
REQ_OP_READ | REQ_META, eb->fs_info,
- extent_buffer_read_end_io, eb);
+ end_bbio_meta_read, eb);
bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
bbio->file_offset = eb->start;
memcpy(&bbio->parent_check, check, sizeof(*check));
if (eb->fs_info->nodesize < PAGE_SIZE) {
- __bio_add_page(&bbio->bio, eb->pages[0], eb->len,
- eb->start - page_offset(eb->pages[0]));
+ ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len,
+ eb->start - folio_pos(eb->folios[0]));
+ ASSERT(ret);
} else {
- for (i = 0; i < num_pages; i++)
- __bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0);
+ int num_folios = num_extent_folios(eb);
+
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+
+ ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0);
+ ASSERT(ret);
+ }
}
btrfs_submit_bio(bbio, mirror_num);
@@ -3999,29 +4287,33 @@ static inline int check_eb_range(const struct extent_buffer *eb,
void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
unsigned long start, unsigned long len)
{
+ const int unit_size = folio_size(eb->folios[0]);
size_t cur;
size_t offset;
- struct page *page;
- char *kaddr;
char *dst = (char *)dstv;
- unsigned long i = get_eb_page_index(start);
+ unsigned long i = get_eb_folio_index(eb, start);
if (check_eb_range(eb, start, len)) {
/*
* Invalid range hit, reset the memory, so callers won't get
- * some random garbage for their uninitialzed memory.
+ * some random garbage for their uninitialized memory.
*/
memset(dstv, 0, len);
return;
}
- offset = get_eb_offset_in_page(eb, start);
+ if (eb->addr) {
+ memcpy(dstv, eb->addr + start, len);
+ return;
+ }
+
+ offset = get_eb_offset_in_folio(eb, start);
while (len > 0) {
- page = eb->pages[i];
+ char *kaddr;
- cur = min(len, (PAGE_SIZE - offset));
- kaddr = page_address(page);
+ cur = min(len, unit_size - offset);
+ kaddr = folio_address(eb->folios[i]);
memcpy(dst, kaddr + offset, cur);
dst += cur;
@@ -4035,24 +4327,29 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
void __user *dstv,
unsigned long start, unsigned long len)
{
+ const int unit_size = folio_size(eb->folios[0]);
size_t cur;
size_t offset;
- struct page *page;
- char *kaddr;
char __user *dst = (char __user *)dstv;
- unsigned long i = get_eb_page_index(start);
+ unsigned long i = get_eb_folio_index(eb, start);
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = get_eb_offset_in_page(eb, start);
+ if (eb->addr) {
+ if (copy_to_user_nofault(dstv, eb->addr + start, len))
+ ret = -EFAULT;
+ return ret;
+ }
+
+ offset = get_eb_offset_in_folio(eb, start);
while (len > 0) {
- page = eb->pages[i];
+ char *kaddr;
- cur = min(len, (PAGE_SIZE - offset));
- kaddr = page_address(page);
+ cur = min(len, unit_size - offset);
+ kaddr = folio_address(eb->folios[i]);
if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
ret = -EFAULT;
break;
@@ -4070,25 +4367,25 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
unsigned long start, unsigned long len)
{
+ const int unit_size = folio_size(eb->folios[0]);
size_t cur;
size_t offset;
- struct page *page;
char *kaddr;
char *ptr = (char *)ptrv;
- unsigned long i = get_eb_page_index(start);
+ unsigned long i = get_eb_folio_index(eb, start);
int ret = 0;
if (check_eb_range(eb, start, len))
return -EINVAL;
- offset = get_eb_offset_in_page(eb, start);
+ if (eb->addr)
+ return memcmp(ptrv, eb->addr + start, len);
- while (len > 0) {
- page = eb->pages[i];
-
- cur = min(len, (PAGE_SIZE - offset));
+ offset = get_eb_offset_in_folio(eb, start);
- kaddr = page_address(page);
+ while (len > 0) {
+ cur = min(len, unit_size - offset);
+ kaddr = folio_address(eb->folios[i]);
ret = memcmp(ptr, kaddr + offset, cur);
if (ret)
break;
@@ -4107,10 +4404,12 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
* For regular sector size == PAGE_SIZE case, check if @page is uptodate.
* For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
*/
-static void assert_eb_page_uptodate(const struct extent_buffer *eb,
- struct page *page)
+static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct folio *folio = eb->folios[i];
+
+ ASSERT(folio);
/*
* If we are using the commit root we could potentially clear a page
@@ -4124,11 +4423,14 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb,
return;
if (fs_info->nodesize < PAGE_SIZE) {
- if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, page,
+ struct folio *folio = eb->folios[0];
+
+ ASSERT(i == 0);
+ if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio,
eb->start, eb->len)))
- btrfs_subpage_dump_bitmap(fs_info, page, eb->start, eb->len);
+ btrfs_subpage_dump_bitmap(fs_info, folio, eb->start, eb->len);
} else {
- WARN_ON(!PageUptodate(page));
+ WARN_ON(!folio_test_uptodate(folio));
}
}
@@ -4136,29 +4438,34 @@ static void __write_extent_buffer(const struct extent_buffer *eb,
const void *srcv, unsigned long start,
unsigned long len, bool use_memmove)
{
+ const int unit_size = folio_size(eb->folios[0]);
size_t cur;
size_t offset;
- struct page *page;
char *kaddr;
char *src = (char *)srcv;
- unsigned long i = get_eb_page_index(start);
+ unsigned long i = get_eb_folio_index(eb, start);
/* For unmapped (dummy) ebs, no need to check their uptodate status. */
const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
- WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
-
if (check_eb_range(eb, start, len))
return;
- offset = get_eb_offset_in_page(eb, start);
+ if (eb->addr) {
+ if (use_memmove)
+ memmove(eb->addr + start, srcv, len);
+ else
+ memcpy(eb->addr + start, srcv, len);
+ return;
+ }
+
+ offset = get_eb_offset_in_folio(eb, start);
while (len > 0) {
- page = eb->pages[i];
if (check_uptodate)
- assert_eb_page_uptodate(eb, page);
+ assert_eb_folio_uptodate(eb, i);
- cur = min(len, PAGE_SIZE - offset);
- kaddr = page_address(page);
+ cur = min(len, unit_size - offset);
+ kaddr = folio_address(eb->folios[i]);
if (use_memmove)
memmove(kaddr + offset, src, cur);
else
@@ -4180,16 +4487,21 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
static void memset_extent_buffer(const struct extent_buffer *eb, int c,
unsigned long start, unsigned long len)
{
+ const int unit_size = folio_size(eb->folios[0]);
unsigned long cur = start;
+ if (eb->addr) {
+ memset(eb->addr + start, c, len);
+ return;
+ }
+
while (cur < start + len) {
- unsigned long index = get_eb_page_index(cur);
- unsigned int offset = get_eb_offset_in_page(eb, cur);
- unsigned int cur_len = min(start + len - cur, PAGE_SIZE - offset);
- struct page *page = eb->pages[index];
+ unsigned long index = get_eb_folio_index(eb, cur);
+ unsigned int offset = get_eb_offset_in_folio(eb, cur);
+ unsigned int cur_len = min(start + len - cur, unit_size - offset);
- assert_eb_page_uptodate(eb, page);
- memset(page_address(page) + offset, c, cur_len);
+ assert_eb_folio_uptodate(eb, index);
+ memset(folio_address(eb->folios[index]) + offset, c, cur_len);
cur += cur_len;
}
@@ -4206,15 +4518,16 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
void copy_extent_buffer_full(const struct extent_buffer *dst,
const struct extent_buffer *src)
{
+ const int unit_size = folio_size(src->folios[0]);
unsigned long cur = 0;
ASSERT(dst->len == src->len);
while (cur < src->len) {
- unsigned long index = get_eb_page_index(cur);
- unsigned long offset = get_eb_offset_in_page(src, cur);
- unsigned long cur_len = min(src->len, PAGE_SIZE - offset);
- void *addr = page_address(src->pages[index]) + offset;
+ unsigned long index = get_eb_folio_index(src, cur);
+ unsigned long offset = get_eb_offset_in_folio(src, cur);
+ unsigned long cur_len = min(src->len, unit_size - offset);
+ void *addr = folio_address(src->folios[index]) + offset;
write_extent_buffer(dst, addr, cur, cur_len);
@@ -4227,12 +4540,12 @@ void copy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
+ const int unit_size = folio_size(dst->folios[0]);
u64 dst_len = dst->len;
size_t cur;
size_t offset;
- struct page *page;
char *kaddr;
- unsigned long i = get_eb_page_index(dst_offset);
+ unsigned long i = get_eb_folio_index(dst, dst_offset);
if (check_eb_range(dst, dst_offset, len) ||
check_eb_range(src, src_offset, len))
@@ -4240,15 +4553,14 @@ void copy_extent_buffer(const struct extent_buffer *dst,
WARN_ON(src->len != dst_len);
- offset = get_eb_offset_in_page(dst, dst_offset);
+ offset = get_eb_offset_in_folio(dst, dst_offset);
while (len > 0) {
- page = dst->pages[i];
- assert_eb_page_uptodate(dst, page);
+ assert_eb_folio_uptodate(dst, i);
- cur = min(len, (unsigned long)(PAGE_SIZE - offset));
+ cur = min(len, (unsigned long)(unit_size - offset));
- kaddr = page_address(page);
+ kaddr = folio_address(dst->folios[i]);
read_extent_buffer(src, kaddr + offset, src_offset, cur);
src_offset += cur;
@@ -4259,22 +4571,22 @@ void copy_extent_buffer(const struct extent_buffer *dst,
}
/*
- * Calculate the page and offset of the byte containing the given bit number.
+ * Calculate the folio and offset of the byte containing the given bit number.
*
* @eb: the extent buffer
* @start: offset of the bitmap item in the extent buffer
* @nr: bit number
- * @page_index: return index of the page in the extent buffer that contains
+ * @folio_index: return index of the folio in the extent buffer that contains
* the given bit number
- * @page_offset: return offset into the page given by page_index
+ * @folio_offset: return offset into the folio given by folio_index
*
* This helper hides the ugliness of finding the byte in an extent buffer which
* contains a given bit.
*/
static inline void eb_bitmap_offset(const struct extent_buffer *eb,
unsigned long start, unsigned long nr,
- unsigned long *page_index,
- size_t *page_offset)
+ unsigned long *folio_index,
+ size_t *folio_offset)
{
size_t byte_offset = BIT_BYTE(nr);
size_t offset;
@@ -4284,10 +4596,10 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
* the bitmap item in the extent buffer + the offset of the byte in the
* bitmap item.
*/
- offset = start + offset_in_page(eb->start) + byte_offset;
+ offset = start + offset_in_folio(eb->folios[0], eb->start) + byte_offset;
- *page_index = offset >> PAGE_SHIFT;
- *page_offset = offset_in_page(offset);
+ *folio_index = offset >> folio_shift(eb->folios[0]);
+ *folio_offset = offset_in_folio(eb->folios[0], offset);
}
/*
@@ -4300,25 +4612,23 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
unsigned long nr)
{
- u8 *kaddr;
- struct page *page;
unsigned long i;
size_t offset;
+ u8 *kaddr;
eb_bitmap_offset(eb, start, nr, &i, &offset);
- page = eb->pages[i];
- assert_eb_page_uptodate(eb, page);
- kaddr = page_address(page);
+ assert_eb_folio_uptodate(eb, i);
+ kaddr = folio_address(eb->folios[i]);
return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
}
static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr)
{
- unsigned long index = get_eb_page_index(bytenr);
+ unsigned long index = get_eb_folio_index(eb, bytenr);
if (check_eb_range(eb, bytenr, 1))
return NULL;
- return page_address(eb->pages[index]) + get_eb_offset_in_page(eb, bytenr);
+ return folio_address(eb->folios[index]) + get_eb_offset_in_folio(eb, bytenr);
}
/*
@@ -4403,19 +4713,30 @@ void memcpy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
+ const int unit_size = folio_size(dst->folios[0]);
unsigned long cur_off = 0;
if (check_eb_range(dst, dst_offset, len) ||
check_eb_range(dst, src_offset, len))
return;
+ if (dst->addr) {
+ const bool use_memmove = areas_overlap(src_offset, dst_offset, len);
+
+ if (use_memmove)
+ memmove(dst->addr + dst_offset, dst->addr + src_offset, len);
+ else
+ memcpy(dst->addr + dst_offset, dst->addr + src_offset, len);
+ return;
+ }
+
while (cur_off < len) {
unsigned long cur_src = cur_off + src_offset;
- unsigned long pg_index = get_eb_page_index(cur_src);
- unsigned long pg_off = get_eb_offset_in_page(dst, cur_src);
+ unsigned long folio_index = get_eb_folio_index(dst, cur_src);
+ unsigned long folio_off = get_eb_offset_in_folio(dst, cur_src);
unsigned long cur_len = min(src_offset + len - cur_src,
- PAGE_SIZE - pg_off);
- void *src_addr = page_address(dst->pages[pg_index]) + pg_off;
+ unit_size - folio_off);
+ void *src_addr = folio_address(dst->folios[folio_index]) + folio_off;
const bool use_memmove = areas_overlap(src_offset + cur_off,
dst_offset + cur_off, cur_len);
@@ -4441,24 +4762,29 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
return;
}
+ if (dst->addr) {
+ memmove(dst->addr + dst_offset, dst->addr + src_offset, len);
+ return;
+ }
+
while (len > 0) {
unsigned long src_i;
size_t cur;
- size_t dst_off_in_page;
- size_t src_off_in_page;
+ size_t dst_off_in_folio;
+ size_t src_off_in_folio;
void *src_addr;
bool use_memmove;
- src_i = get_eb_page_index(src_end);
+ src_i = get_eb_folio_index(dst, src_end);
- dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
- src_off_in_page = get_eb_offset_in_page(dst, src_end);
+ dst_off_in_folio = get_eb_offset_in_folio(dst, dst_end);
+ src_off_in_folio = get_eb_offset_in_folio(dst, src_end);
- cur = min_t(unsigned long, len, src_off_in_page + 1);
- cur = min(cur, dst_off_in_page + 1);
+ cur = min_t(unsigned long, len, src_off_in_folio + 1);
+ cur = min(cur, dst_off_in_folio + 1);
- src_addr = page_address(dst->pages[src_i]) + src_off_in_page -
- cur + 1;
+ src_addr = folio_address(dst->folios[src_i]) + src_off_in_folio -
+ cur + 1;
use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1,
cur);
@@ -4520,7 +4846,7 @@ static int try_release_subpage_extent_buffer(struct page *page)
struct extent_buffer *eb = NULL;
/*
- * Unlike try_release_extent_buffer() which uses page->private
+ * Unlike try_release_extent_buffer() which uses folio private
* to grab buffer, for subpage case we rely on radix tree, thus
* we need to ensure radix tree consistency.
*
@@ -4560,43 +4886,44 @@ static int try_release_subpage_extent_buffer(struct page *page)
/*
* Here we don't care about the return value, we will always
- * check the page private at the end. And
+ * check the folio private at the end. And
* release_extent_buffer() will release the refs_lock.
*/
release_extent_buffer(eb);
}
/*
- * Finally to check if we have cleared page private, as if we have
- * released all ebs in the page, the page private should be cleared now.
+ * Finally to check if we have cleared folio private, as if we have
+ * released all ebs in the page, the folio private should be cleared now.
*/
- spin_lock(&page->mapping->private_lock);
- if (!PagePrivate(page))
+ spin_lock(&page->mapping->i_private_lock);
+ if (!folio_test_private(page_folio(page)))
ret = 1;
else
ret = 0;
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->i_private_lock);
return ret;
}
int try_release_extent_buffer(struct page *page)
{
+ struct folio *folio = page_folio(page);
struct extent_buffer *eb;
if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
return try_release_subpage_extent_buffer(page);
/*
- * We need to make sure nobody is changing page->private, as we rely on
- * page->private as the pointer to extent buffer.
+ * We need to make sure nobody is changing folio private, as we rely on
+ * folio private as the pointer to extent buffer.
*/
- spin_lock(&page->mapping->private_lock);
- if (!PagePrivate(page)) {
- spin_unlock(&page->mapping->private_lock);
+ spin_lock(&page->mapping->i_private_lock);
+ if (!folio_test_private(folio)) {
+ spin_unlock(&page->mapping->i_private_lock);
return 1;
}
- eb = (struct extent_buffer *)page->private;
+ eb = folio_get_private(folio);
BUG_ON(!eb);
/*
@@ -4607,10 +4934,10 @@ int try_release_extent_buffer(struct page *page)
spin_lock(&eb->refs_lock);
if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
spin_unlock(&eb->refs_lock);
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->i_private_lock);
return 0;
}
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->i_private_lock);
/*
* If tree ref isn't set then we know the ref on this eb is a real ref,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 2171057a4477..46050500529b 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -28,7 +28,8 @@ enum {
EXTENT_BUFFER_IN_TREE,
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
- EXTENT_BUFFER_NO_CHECK,
+ /* Indicate the extent buffer is written zeroed out (for zoned) */
+ EXTENT_BUFFER_ZONED_ZEROOUT,
/* Indicate that extent buffer pages a being read */
EXTENT_BUFFER_READING,
};
@@ -43,10 +44,10 @@ enum {
};
/*
- * page->private values. Every page that is controlled by the extent
- * map has page->private set to one.
+ * Folio private values. Every page that is controlled by the extent map has
+ * folio private set to this value.
*/
-#define EXTENT_PAGE_PRIVATE 1
+#define EXTENT_FOLIO_PRIVATE 1
/*
* The extent buffer bitmap operations are done with byte granularity instead of
@@ -77,6 +78,13 @@ struct extent_buffer {
unsigned long len;
unsigned long bflags;
struct btrfs_fs_info *fs_info;
+
+ /*
+ * The address where the eb can be accessed without any cross-page handling.
+ * This can be NULL if not possible.
+ */
+ void *addr;
+
spinlock_t refs_lock;
atomic_t refs;
int read_mirror;
@@ -86,7 +94,12 @@ struct extent_buffer {
struct rw_semaphore lock;
- struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
+ /*
+ * Pointers to all the folios of the extent buffer.
+ *
+ * For now the folio is always order 0 (aka, a single page).
+ */
+ struct folio *folios[INLINE_EXTENT_BUFFER_PAGES];
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
pid_t lock_owner;
@@ -108,29 +121,43 @@ struct btrfs_eb_write_context {
*
* Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases.
*/
-static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb,
- unsigned long offset)
+static inline size_t get_eb_offset_in_folio(const struct extent_buffer *eb,
+ unsigned long offset)
{
/*
- * For sectorsize == PAGE_SIZE case, eb->start will always be aligned
- * to PAGE_SIZE, thus adding it won't cause any difference.
+ * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case
+ * 1.1) One large folio covering the whole eb
+ * The eb->start is aligned to folio size, thus adding it
+ * won't cause any difference.
+ * 1.2) Several page sized folios
+ * The eb->start is aligned to folio (page) size, thus
+ * adding it won't cause any difference.
*
- * For sectorsize < PAGE_SIZE, we must only read the data that belongs
- * to the eb, thus we have to take the eb->start into consideration.
+ * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case
+ * In this case there would only be one page sized folio, and there
+ * may be several different extent buffers in the page/folio.
+ * We need to add eb->start to properly access the offset inside
+ * that eb.
*/
- return offset_in_page(offset + eb->start);
+ return offset_in_folio(eb->folios[0], offset + eb->start);
}
-static inline unsigned long get_eb_page_index(unsigned long offset)
+static inline unsigned long get_eb_folio_index(const struct extent_buffer *eb,
+ unsigned long offset)
{
/*
- * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough.
+ * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case
+ * 1.1) One large folio covering the whole eb.
+ * the folio_shift would be large enough to always make us
+ * return 0 as index.
+ * 1.2) Several page sized folios
+ * The folio_shift() would be PAGE_SHIFT, giving us the correct
+ * index.
*
- * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE,
- * and have ensured that all tree blocks are contained in one page,
- * thus we always get index == 0.
+ * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case
+ * The folio would only be page sized, and always give us 0 as index.
*/
- return offset >> PAGE_SHIFT;
+ return offset >> folio_shift(eb->folios[0]);
}
/*
@@ -230,6 +257,20 @@ static inline int num_extent_pages(const struct extent_buffer *eb)
return (eb->len >> PAGE_SHIFT) ?: 1;
}
+/*
+ * This can only be determined at runtime by checking eb::folios[0].
+ *
+ * As we can have either one large folio covering the whole eb
+ * (either nodesize <= PAGE_SIZE, or high order folio), or multiple
+ * single-paged folios.
+ */
+static inline int num_extent_folios(const struct extent_buffer *eb)
+{
+ if (folio_order(eb->folios[0]))
+ return 1;
+ return num_extent_pages(eb);
+}
+
static inline int extent_buffer_uptodate(const struct extent_buffer *eb)
{
return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -294,7 +335,8 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *buf);
-int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
+ gfp_t extra_gfp);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index a6d8368ed0ed..b61099bf97a8 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -50,7 +50,6 @@ struct extent_map *alloc_extent_map(void)
if (!em)
return NULL;
RB_CLEAR_NODE(&em->rb_node);
- em->compress_type = BTRFS_COMPRESS_NONE;
refcount_set(&em->refs, 1);
INIT_LIST_HEAD(&em->list);
return em;
@@ -67,8 +66,6 @@ void free_extent_map(struct extent_map *em)
if (refcount_dec_and_test(&em->refs)) {
WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
- if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
- kfree(em->map_lookup);
kmem_cache_free(extent_map_cache, em);
}
}
@@ -182,50 +179,50 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
return NULL;
}
-/* Check to see if two extent_map structs are adjacent and safe to merge. */
-static int mergable_maps(struct extent_map *prev, struct extent_map *next)
+static inline u64 extent_map_block_end(const struct extent_map *em)
{
- if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
- return 0;
+ if (em->block_start + em->block_len < em->block_start)
+ return (u64)-1;
+ return em->block_start + em->block_len;
+}
- /*
- * don't merge compressed extents, we need to know their
- * actual size
- */
- if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
- return 0;
+static bool can_merge_extent_map(const struct extent_map *em)
+{
+ if (em->flags & EXTENT_FLAG_PINNED)
+ return false;
+
+ /* Don't merge compressed extents, we need to know their actual size. */
+ if (extent_map_is_compressed(em))
+ return false;
- if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
- test_bit(EXTENT_FLAG_LOGGING, &next->flags))
- return 0;
+ if (em->flags & EXTENT_FLAG_LOGGING)
+ return false;
/*
* We don't want to merge stuff that hasn't been written to the log yet
* since it may not reflect exactly what is on disk, and that would be
* bad.
*/
- if (!list_empty(&prev->list) || !list_empty(&next->list))
- return 0;
-
- ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
- prev->block_start != EXTENT_MAP_DELALLOC);
-
- if (prev->map_lookup || next->map_lookup)
- ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) &&
- test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags));
-
- if (extent_map_end(prev) == next->start &&
- prev->flags == next->flags &&
- prev->map_lookup == next->map_lookup &&
- ((next->block_start == EXTENT_MAP_HOLE &&
- prev->block_start == EXTENT_MAP_HOLE) ||
- (next->block_start == EXTENT_MAP_INLINE &&
- prev->block_start == EXTENT_MAP_INLINE) ||
- (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
- next->block_start == extent_map_block_end(prev)))) {
- return 1;
- }
- return 0;
+ if (!list_empty(&em->list))
+ return false;
+
+ return true;
+}
+
+/* Check to see if two extent_map structs are adjacent and safe to merge. */
+static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next)
+{
+ if (extent_map_end(prev) != next->start)
+ return false;
+
+ if (prev->flags != next->flags)
+ return false;
+
+ if (next->block_start < EXTENT_MAP_LAST_BYTE - 1)
+ return next->block_start == extent_map_block_end(prev);
+
+ /* HOLES and INLINE extents. */
+ return next->block_start == prev->block_start;
}
static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
@@ -244,11 +241,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
if (refcount_read(&em->refs) > 2)
return;
+ if (!can_merge_extent_map(em))
+ return;
+
if (em->start != 0) {
rb = rb_prev(&em->rb_node);
if (rb)
merge = rb_entry(rb, struct extent_map, rb_node);
- if (rb && mergable_maps(merge, em)) {
+ if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) {
em->start = merge->start;
em->orig_start = merge->orig_start;
em->len += merge->len;
@@ -257,7 +257,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
- set_bit(EXTENT_FLAG_MERGED, &em->flags);
+ em->flags |= EXTENT_FLAG_MERGED;
rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
@@ -268,14 +268,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
rb = rb_next(&em->rb_node);
if (rb)
merge = rb_entry(rb, struct extent_map, rb_node);
- if (rb && mergable_maps(em, merge)) {
+ if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
em->len += merge->len;
em->block_len += merge->block_len;
rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
- set_bit(EXTENT_FLAG_MERGED, &em->flags);
+ em->flags |= EXTENT_FLAG_MERGED;
free_extent_map(merge);
}
}
@@ -283,7 +283,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
/*
* Unpin an extent from the cache.
*
- * @tree: tree to unpin the extent in
+ * @inode: the inode from which we are unpinning an extent range
* @start: logical offset in the file
* @len: length of the extent
* @gen: generation that this extent has been modified in
@@ -292,9 +292,10 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
* to the generation that actually added the file item to the inode so we know
* we need to sync this extent when we call fsync().
*/
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
- u64 gen)
+int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_map_tree *tree = &inode->extent_tree;
int ret = 0;
struct extent_map *em;
bool prealloc = false;
@@ -302,19 +303,28 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
write_lock(&tree->lock);
em = lookup_extent_mapping(tree, start, len);
- WARN_ON(!em || em->start != start);
-
- if (!em)
+ if (WARN_ON(!em)) {
+ btrfs_warn(fs_info,
+"no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu",
+ btrfs_ino(inode), btrfs_root_id(inode->root),
+ start, len, gen);
goto out;
+ }
+
+ if (WARN_ON(em->start != start))
+ btrfs_warn(fs_info,
+"found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu",
+ btrfs_ino(inode), btrfs_root_id(inode->root),
+ em->start, start, len, gen);
em->generation = gen;
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->flags &= ~EXTENT_FLAG_PINNED;
em->mod_start = em->start;
em->mod_len = em->len;
- if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
+ if (em->flags & EXTENT_FLAG_FILLING) {
prealloc = true;
- clear_bit(EXTENT_FLAG_FILLING, &em->flags);
+ em->flags &= ~EXTENT_FLAG_FILLING;
}
try_merge_map(tree, em);
@@ -335,7 +345,7 @@ void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
{
lockdep_assert_held_write(&tree->lock);
- clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ em->flags &= ~EXTENT_FLAG_LOGGING;
if (extent_map_in_tree(em))
try_merge_map(tree, em);
}
@@ -348,45 +358,14 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree,
em->mod_start = em->start;
em->mod_len = em->len;
+ ASSERT(list_empty(&em->list));
+
if (modified)
- list_move(&em->list, &tree->modified_extents);
+ list_add(&em->list, &tree->modified_extents);
else
try_merge_map(tree, em);
}
-static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
-{
- struct map_lookup *map = em->map_lookup;
- u64 stripe_size = em->orig_block_len;
- int i;
-
- for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_io_stripe *stripe = &map->stripes[i];
- struct btrfs_device *device = stripe->dev;
-
- set_extent_bit(&device->alloc_state, stripe->physical,
- stripe->physical + stripe_size - 1,
- bits | EXTENT_NOWAIT, NULL);
- }
-}
-
-static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
-{
- struct map_lookup *map = em->map_lookup;
- u64 stripe_size = em->orig_block_len;
- int i;
-
- for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_io_stripe *stripe = &map->stripes[i];
- struct btrfs_device *device = stripe->dev;
-
- __clear_extent_bit(&device->alloc_state, stripe->physical,
- stripe->physical + stripe_size - 1,
- bits | EXTENT_NOWAIT,
- NULL, NULL);
- }
-}
-
/*
* Add new extent map to the extent tree
*
@@ -400,8 +379,8 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
* into the tree directly, with an additional reference taken, or a
* reference dropped if the merge attempt was successful.
*/
-int add_extent_mapping(struct extent_map_tree *tree,
- struct extent_map *em, int modified)
+static int add_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *em, int modified)
{
int ret = 0;
@@ -412,10 +391,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
goto out;
setup_extent_mapping(tree, em, modified);
- if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) {
- extent_map_device_set_bits(em, CHUNK_ALLOCATED);
- extent_map_device_clear_bits(em, CHUNK_TRIMMED);
- }
out:
return ret;
}
@@ -495,12 +470,10 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
lockdep_assert_held_write(&tree->lock);
- WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+ WARN_ON(em->flags & EXTENT_FLAG_PINNED);
rb_erase_cached(&em->rb_node, &tree->map);
- if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ if (!(em->flags & EXTENT_FLAG_LOGGING))
list_del_init(&em->list);
- if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
- extent_map_device_clear_bits(em, CHUNK_ALLOCATED);
RB_CLEAR_NODE(&em->rb_node);
}
@@ -511,9 +484,9 @@ static void replace_extent_mapping(struct extent_map_tree *tree,
{
lockdep_assert_held_write(&tree->lock);
- WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
+ WARN_ON(cur->flags & EXTENT_FLAG_PINNED);
ASSERT(extent_map_in_tree(cur));
- if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
+ if (!(cur->flags & EXTENT_FLAG_LOGGING))
list_del_init(&cur->list);
rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
RB_CLEAR_NODE(&cur->rb_node);
@@ -576,7 +549,7 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
em->start = start;
em->len = end - start;
if (em->block_start < EXTENT_MAP_LAST_BYTE &&
- !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ !extent_map_is_compressed(em)) {
em->block_start += start_diff;
em->block_len = em->len;
}
@@ -626,8 +599,6 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
if (ret == -EEXIST) {
struct extent_map *existing;
- ret = 0;
-
existing = search_extent_mapping(em_tree, start, len);
trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
@@ -681,8 +652,7 @@ static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
node = rb_first_cached(&tree->map);
em = rb_entry(node, struct extent_map, rb_node);
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
- clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
remove_extent_mapping(tree, em);
free_extent_map(em);
cond_resched_rwlock_write(&tree->lock);
@@ -758,19 +728,18 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
}
}
- if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+ if (skip_pinned && (em->flags & EXTENT_FLAG_PINNED)) {
start = em_end;
goto next;
}
flags = em->flags;
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
/*
* In case we split the extent map, we want to preserve the
* EXTENT_FLAG_LOGGING flag on our extent map, but we don't want
* it on the new extent maps.
*/
- clear_bit(EXTENT_FLAG_LOGGING, &flags);
+ em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
modified = !list_empty(&em->list);
/*
@@ -781,7 +750,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
goto remove_em;
gen = em->generation;
- compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ compressed = extent_map_is_compressed(em);
if (em->start < start) {
if (!split) {
@@ -814,7 +783,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->generation = gen;
split->flags = flags;
- split->compress_type = em->compress_type;
replace_extent_mapping(em_tree, em, split, modified);
free_extent_map(split);
split = split2;
@@ -831,7 +799,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->len = em_end - end;
split->block_start = em->block_start;
split->flags = flags;
- split->compress_type = em->compress_type;
split->generation = gen;
if (em->block_start < EXTENT_MAP_LAST_BYTE) {
@@ -997,14 +964,14 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
}
ASSERT(em->len == len);
- ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+ ASSERT(!extent_map_is_compressed(em));
ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
- ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
- ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
+ ASSERT(em->flags & EXTENT_FLAG_PINNED);
+ ASSERT(!(em->flags & EXTENT_FLAG_LOGGING));
ASSERT(!list_empty(&em->list));
flags = em->flags;
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->flags &= ~EXTENT_FLAG_PINNED;
/* First, replace the em with a new extent_map starting from * em->start */
split_pre->start = em->start;
@@ -1015,7 +982,6 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
split_pre->orig_block_len = split_pre->block_len;
split_pre->ram_bytes = split_pre->len;
split_pre->flags = flags;
- split_pre->compress_type = em->compress_type;
split_pre->generation = em->generation;
replace_extent_mapping(em_tree, em, split_pre, 1);
@@ -1034,7 +1000,6 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
split_mid->orig_block_len = split_mid->block_len;
split_mid->ram_bytes = split_mid->len;
split_mid->flags = flags;
- split_mid->compress_type = em->compress_type;
split_mid->generation = em->generation;
add_extent_mapping(em_tree, split_mid, 1);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 35d27c756e08..e380fc08bbe4 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -5,30 +5,33 @@
#include <linux/rbtree.h>
#include <linux/refcount.h>
+#include "compression.h"
#define EXTENT_MAP_LAST_BYTE ((u64)-4)
#define EXTENT_MAP_HOLE ((u64)-3)
#define EXTENT_MAP_INLINE ((u64)-2)
-/* used only during fiemap calls */
-#define EXTENT_MAP_DELALLOC ((u64)-1)
/* bits for the extent_map::flags field */
enum {
/* this entry not yet on disk, don't free it */
- EXTENT_FLAG_PINNED,
- EXTENT_FLAG_COMPRESSED,
+ ENUM_BIT(EXTENT_FLAG_PINNED),
+ ENUM_BIT(EXTENT_FLAG_COMPRESS_ZLIB),
+ ENUM_BIT(EXTENT_FLAG_COMPRESS_LZO),
+ ENUM_BIT(EXTENT_FLAG_COMPRESS_ZSTD),
/* pre-allocated extent */
- EXTENT_FLAG_PREALLOC,
+ ENUM_BIT(EXTENT_FLAG_PREALLOC),
/* Logging this extent */
- EXTENT_FLAG_LOGGING,
+ ENUM_BIT(EXTENT_FLAG_LOGGING),
/* Filling in a preallocated extent */
- EXTENT_FLAG_FILLING,
- /* filesystem extent mapping type */
- EXTENT_FLAG_FS_MAPPING,
+ ENUM_BIT(EXTENT_FLAG_FILLING),
/* This em is merged from two or more physically adjacent ems */
- EXTENT_FLAG_MERGED,
+ ENUM_BIT(EXTENT_FLAG_MERGED),
};
+/*
+ * Keep this structure as compact as possible, as we can have really large
+ * amounts of allocated extent maps at any time.
+ */
struct extent_map {
struct rb_node rb_node;
@@ -49,11 +52,8 @@ struct extent_map {
* For non-merged extents, it's from btrfs_file_extent_item::generation.
*/
u64 generation;
- unsigned long flags;
- /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */
- struct map_lookup *map_lookup;
+ u32 flags;
refcount_t refs;
- unsigned int compress_type;
struct list_head list;
};
@@ -65,30 +65,57 @@ struct extent_map_tree {
struct btrfs_inode;
+static inline void extent_map_set_compression(struct extent_map *em,
+ enum btrfs_compression_type type)
+{
+ if (type == BTRFS_COMPRESS_ZLIB)
+ em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
+ else if (type == BTRFS_COMPRESS_LZO)
+ em->flags |= EXTENT_FLAG_COMPRESS_LZO;
+ else if (type == BTRFS_COMPRESS_ZSTD)
+ em->flags |= EXTENT_FLAG_COMPRESS_ZSTD;
+}
+
+static inline enum btrfs_compression_type extent_map_compression(const struct extent_map *em)
+{
+ if (em->flags & EXTENT_FLAG_COMPRESS_ZLIB)
+ return BTRFS_COMPRESS_ZLIB;
+
+ if (em->flags & EXTENT_FLAG_COMPRESS_LZO)
+ return BTRFS_COMPRESS_LZO;
+
+ if (em->flags & EXTENT_FLAG_COMPRESS_ZSTD)
+ return BTRFS_COMPRESS_ZSTD;
+
+ return BTRFS_COMPRESS_NONE;
+}
+
+/*
+ * More efficient way to determine if extent is compressed, instead of using
+ * 'extent_map_compression() != BTRFS_COMPRESS_NONE'.
+ */
+static inline bool extent_map_is_compressed(const struct extent_map *em)
+{
+ return (em->flags & (EXTENT_FLAG_COMPRESS_ZLIB |
+ EXTENT_FLAG_COMPRESS_LZO |
+ EXTENT_FLAG_COMPRESS_ZSTD)) != 0;
+}
+
static inline int extent_map_in_tree(const struct extent_map *em)
{
return !RB_EMPTY_NODE(&em->rb_node);
}
-static inline u64 extent_map_end(struct extent_map *em)
+static inline u64 extent_map_end(const struct extent_map *em)
{
if (em->start + em->len < em->start)
return (u64)-1;
return em->start + em->len;
}
-static inline u64 extent_map_block_end(struct extent_map *em)
-{
- if (em->block_start + em->block_len < em->block_start)
- return (u64)-1;
- return em->block_start + em->block_len;
-}
-
void extent_map_tree_init(struct extent_map_tree *tree);
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
-int add_extent_mapping(struct extent_map_tree *tree,
- struct extent_map *em, int modified);
void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
u64 new_logical);
@@ -97,7 +124,7 @@ struct extent_map *alloc_extent_map(void);
void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
void __cold extent_map_exit(void);
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
+int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen);
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 45cae356e89b..81ac1d474bf1 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -59,7 +59,7 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
goto out_unlock;
}
- ret = find_contiguous_extent_bit(&inode->file_extent_tree, 0, &start,
+ ret = find_contiguous_extent_bit(inode->file_extent_tree, 0, &start,
&end, EXTENT_DIRTY);
if (!ret && start == 0)
i_size = min(i_size, end + 1);
@@ -94,7 +94,7 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
return 0;
- return set_extent_bit(&inode->file_extent_tree, start, start + len - 1,
+ return set_extent_bit(inode->file_extent_tree, start, start + len - 1,
EXTENT_DIRTY, NULL);
}
@@ -123,7 +123,7 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
return 0;
- return clear_extent_bit(&inode->file_extent_tree, start,
+ return clear_extent_bit(inode->file_extent_tree, start,
start + len - 1, EXTENT_DIRTY, NULL);
}
@@ -1294,8 +1294,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
return;
}
if (compress_type != BTRFS_COMPRESS_NONE) {
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
+ extent_map_set_compression(em, compress_type);
em->block_start = bytenr;
em->block_len = em->orig_block_len;
} else {
@@ -1303,7 +1302,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
em->block_start = bytenr;
em->block_len = em->len;
if (type == BTRFS_FILE_EXTENT_PREALLOC)
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ em->flags |= EXTENT_FLAG_PREALLOC;
}
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
em->block_start = EXTENT_MAP_INLINE;
@@ -1315,9 +1314,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
*/
em->orig_start = EXTENT_MAP_HOLE;
em->block_len = (u64)-1;
- em->compress_type = compress_type;
- if (compress_type != BTRFS_COMPRESS_NONE)
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ extent_map_set_compression(em, compress_type);
} else {
btrfs_err(fs_info,
"unknown file extent item type %d, inode %llu, offset %llu, "
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 32611a4edd6b..38dfcac47609 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -111,8 +111,8 @@ static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
* accessed as prepare_pages should have marked them accessed
* in prepare_pages via find_or_create_page()
*/
- btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
- block_len);
+ btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]),
+ block_start, block_len);
unlock_page(pages[i]);
put_page(pages[i]);
}
@@ -168,9 +168,12 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
- btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
- btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
- btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
+ btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p),
+ start_pos, num_bytes);
+ btrfs_folio_clamp_clear_checked(fs_info, page_folio(p),
+ start_pos, num_bytes);
+ btrfs_folio_clamp_set_dirty(fs_info, page_folio(p),
+ start_pos, num_bytes);
}
/*
@@ -869,9 +872,9 @@ static int prepare_uptodate_page(struct inode *inode,
* released.
*
* The private flag check is essential for subpage as we need
- * to store extra bitmap using page->private.
+ * to store extra bitmap using folio private.
*/
- if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
+ if (page->mapping != inode->i_mapping || !folio_test_private(folio)) {
unlock_page(page);
return -EAGAIN;
}
@@ -2150,7 +2153,6 @@ out:
hole_em->block_start = EXTENT_MAP_HOLE;
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
- hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(inode, hole_em, true);
@@ -2839,7 +2841,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
if (em->block_start == EXTENT_MAP_HOLE)
ret = RANGE_BOUNDARY_HOLE;
- else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ else if (em->flags & EXTENT_FLAG_PREALLOC)
ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
else
ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
@@ -2879,8 +2881,7 @@ static int btrfs_zero_range(struct inode *inode,
* extents and holes, we drop all the existing extents and allocate a
* new prealloc extent, so that we get a larger contiguous disk extent.
*/
- if (em->start <= alloc_start &&
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) {
const u64 em_end = em->start + em->len;
if (em_end >= offset + len) {
@@ -2915,7 +2916,7 @@ static int btrfs_zero_range(struct inode *inode,
goto out;
}
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ if (em->flags & EXTENT_FLAG_PREALLOC) {
free_extent_map(em);
ret = btrfs_fallocate_update_isize(inode, offset + len,
mode);
@@ -3136,7 +3137,7 @@ static long btrfs_fallocate(struct file *file, int mode,
last_byte = ALIGN(last_byte, blocksize);
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
- !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ !(em->flags & EXTENT_FLAG_PREALLOC))) {
const u64 range_len = last_byte - cur_offset;
ret = add_falloc_range(&reserve_list, cur_offset, range_len);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6f93c9a2c3e3..d372c7ce0e6b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -439,8 +439,8 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
for (i = 0; i < io_ctl->num_pages; i++) {
if (io_ctl->pages[i]) {
- btrfs_page_clear_checked(io_ctl->fs_info,
- io_ctl->pages[i],
+ btrfs_folio_clear_checked(io_ctl->fs_info,
+ page_folio(io_ctl->pages[i]),
page_offset(io_ctl->pages[i]),
PAGE_SIZE);
unlock_page(io_ctl->pages[i]);
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 318df6f9d9cb..f8bb73d6ab68 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -188,6 +188,7 @@ enum {
BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 27),
BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 28),
BTRFS_MOUNT_NODISCARD = (1UL << 29),
+ BTRFS_MOUNT_NOSPACECACHE = (1UL << 30),
};
/*
@@ -398,7 +399,8 @@ struct btrfs_fs_info {
struct extent_io_tree excluded_extents;
/* logical->physical extent mapping */
- struct extent_map_tree mapping_tree;
+ struct rb_root_cached mapping_tree;
+ rwlock_t mapping_tree_lock;
/*
* Block reservation for extent, checksum, root tree and delayed dir
@@ -960,20 +962,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \
BTRFS_MOUNT_##opt)
-#define btrfs_set_and_info(fs_info, opt, fmt, args...) \
-do { \
- if (!btrfs_test_opt(fs_info, opt)) \
- btrfs_info(fs_info, fmt, ##args); \
- btrfs_set_opt(fs_info->mount_opt, opt); \
-} while (0)
-
-#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \
-do { \
- if (btrfs_test_opt(fs_info, opt)) \
- btrfs_info(fs_info, fmt, ##args); \
- btrfs_clear_opt(fs_info->mount_opt, opt); \
-} while (0)
-
static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
{
/* Do it this way so we only ever do one test_bit in the normal case. */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fb3c3f43c3fa..4795738d5785 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -114,6 +114,15 @@ struct data_reloc_warn {
int mirror_num;
};
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we lookup and
+ * update the disk_i_size, but lockdep will complain because our io_tree we hold
+ * the tree lock and get the inode lock when setting delalloc. These two things
+ * are unrelated, so make a class for the file_extent_tree so we don't get the
+ * two locking patterns mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
@@ -447,8 +456,8 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
* range, then btrfs_mark_ordered_io_finished() will handle
* the ordered extent accounting for the range.
*/
- btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
- offset, bytes);
+ btrfs_folio_clamp_clear_ordered(inode->root->fs_info,
+ page_folio(page), offset, bytes);
put_page(page);
}
@@ -1037,7 +1046,7 @@ free_pages:
if (pages) {
for (i = 0; i < nr_pages; i++) {
WARN_ON(pages[i]->mapping);
- put_page(pages[i]);
+ btrfs_free_compr_page(pages[i]);
}
kfree(pages);
}
@@ -1052,7 +1061,7 @@ static void free_async_extent_pages(struct async_extent *async_extent)
for (i = 0; i < async_extent->nr_pages; i++) {
WARN_ON(async_extent->pages[i]->mapping);
- put_page(async_extent->pages[i]);
+ btrfs_free_compr_page(async_extent->pages[i]);
}
kfree(async_extent->pages);
async_extent->nr_pages = 0;
@@ -2793,7 +2802,7 @@ out_page:
PAGE_SIZE, !ret);
clear_page_dirty_for_io(page);
}
- btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
+ btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE);
unlock_page(page);
put_page(page);
kfree(fixup);
@@ -2848,7 +2857,7 @@ int btrfs_writepage_cow_fixup(struct page *page)
* page->mapping outside of the page lock.
*/
ihold(inode);
- btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
+ btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE);
get_page(page);
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
fixup->page = page;
@@ -3118,7 +3127,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->disk_num_bytes);
}
}
- unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
+ unpin_extent_cache(inode, ordered_extent->file_offset,
ordered_extent->num_bytes, trans->transid);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
@@ -3175,8 +3184,23 @@ out:
unwritten_start += logical_len;
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
- /* Drop extent maps for the part of the extent we didn't write. */
- btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
+ /*
+ * Drop extent maps for the part of the extent we didn't write.
+ *
+ * We have an exception here for the free_space_inode, this is
+ * because when we do btrfs_get_extent() on the free space inode
+ * we will search the commit root. If this is a new block group
+ * we won't find anything, and we will trip over the assert in
+ * writepage where we do ASSERT(em->block_start !=
+ * EXTENT_MAP_HOLE).
+ *
+ * Theoretically we could also skip this for any NOCOW extent as
+ * we don't mess with the extent map tree in the NOCOW case, but
+ * for now simply skip this if we are the free space inode.
+ */
+ if (!btrfs_is_free_space_inode(inode))
+ btrfs_drop_extent_map_range(inode, unwritten_start,
+ end, false);
/*
* If the ordered extent had an IOERR or something else went
@@ -3796,7 +3820,7 @@ cache_index:
* cache.
*
* This is required for both inode re-read from disk and delayed inode
- * in delayed_nodes_tree.
+ * in the delayed_nodes xarray.
*/
if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
@@ -4449,6 +4473,8 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
u64 root_flags;
int ret;
+ down_write(&fs_info->subvol_sem);
+
/*
* Don't allow to delete a subvolume with send in progress. This is
* inside the inode lock so the error handling that has to drop the bit
@@ -4460,25 +4486,25 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
btrfs_warn(fs_info,
"attempt to delete subvolume %llu during send",
dest->root_key.objectid);
- return -EPERM;
+ ret = -EPERM;
+ goto out_up_write;
}
if (atomic_read(&dest->nr_swapfiles)) {
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu with active swapfile",
root->root_key.objectid);
- return -EPERM;
+ ret = -EPERM;
+ goto out_up_write;
}
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags | BTRFS_ROOT_SUBVOL_DEAD);
spin_unlock(&dest->root_item_lock);
- down_write(&fs_info->subvol_sem);
-
ret = may_destroy_subvol(dest);
if (ret)
- goto out_up_write;
+ goto out_undead;
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
/*
@@ -4488,7 +4514,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
*/
ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
if (ret)
- goto out_up_write;
+ goto out_undead;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
@@ -4554,15 +4580,17 @@ out_end_trans:
inode->i_flags |= S_DEAD;
out_release:
btrfs_subvolume_release_metadata(root, &block_rsv);
-out_up_write:
- up_write(&fs_info->subvol_sem);
+out_undead:
if (ret) {
spin_lock(&dest->root_item_lock);
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
spin_unlock(&dest->root_item_lock);
- } else {
+ }
+out_up_write:
+ up_write(&fs_info->subvol_sem);
+ if (!ret) {
d_invalidate(dentry);
btrfs_prune_dentries(dest);
ASSERT(dest->send_in_progress == 0);
@@ -4725,7 +4753,7 @@ again:
/*
* We unlock the page after the io is completed and then re-lock it
* above. release_folio() could have come in between that and cleared
- * PagePrivate(), but left the page in the mapping. Set the page mapped
+ * folio private, but left the page in the mapping. Set the page mapped
* here to make sure it's properly set for the subpage stuff.
*/
ret = set_page_extent_mapped(page);
@@ -4767,9 +4795,10 @@ again:
memzero_page(page, (block_start - page_offset(page)) + offset,
len);
}
- btrfs_page_clear_checked(fs_info, page, block_start,
- block_end + 1 - block_start);
- btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
+ btrfs_folio_clear_checked(fs_info, page_folio(page), block_start,
+ block_end + 1 - block_start);
+ btrfs_folio_set_dirty(fs_info, page_folio(page), block_start,
+ block_end + 1 - block_start);
unlock_extent(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
@@ -4889,7 +4918,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
last_byte = ALIGN(last_byte, fs_info->sectorsize);
hole_size = last_byte - cur_offset;
- if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ if (!(em->flags & EXTENT_FLAG_PREALLOC)) {
struct extent_map *hole_em;
err = maybe_insert_hole(inode, cur_offset, hole_size);
@@ -4917,7 +4946,6 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
hole_em->ram_bytes = hole_size;
- hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = btrfs_get_fs_generation(fs_info);
err = btrfs_replace_extent_map_range(inode, hole_em, true);
@@ -6217,6 +6245,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
inode->i_generation = BTRFS_I(inode)->generation;
/*
+ * We don't have any capability xattrs set here yet, shortcut any
+ * queries for the xattrs here. If we add them later via the inode
+ * security init path or any other path this flag will be cleared.
+ */
+ set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
+ /*
* Subvolumes don't inherit flags from their parent directory.
* Originally this was probably by accident, but we probably can't
* change it now without compatibility issues.
@@ -7258,13 +7293,11 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
em->orig_block_len = orig_block_len;
em->ram_bytes = ram_bytes;
em->generation = -1;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
- if (type == BTRFS_ORDERED_PREALLOC) {
- set_bit(EXTENT_FLAG_FILLING, &em->flags);
- } else if (type == BTRFS_ORDERED_COMPRESSED) {
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
- }
+ em->flags |= EXTENT_FLAG_PINNED;
+ if (type == BTRFS_ORDERED_PREALLOC)
+ em->flags |= EXTENT_FLAG_FILLING;
+ else if (type == BTRFS_ORDERED_COMPRESSED)
+ extent_map_set_compression(em, compress_type);
ret = btrfs_replace_extent_map_range(inode, em, true);
if (ret) {
@@ -7304,10 +7337,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
* just use the extent.
*
*/
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ if ((em->flags & EXTENT_FLAG_PREALLOC) ||
((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
em->block_start != EXTENT_MAP_HOLE)) {
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ if (em->flags & EXTENT_FLAG_PREALLOC)
type = BTRFS_ORDERED_PREALLOC;
else
type = BTRFS_ORDERED_NOCOW;
@@ -7542,7 +7575,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
* to buffered IO. Don't blame me, this is the price we pay for using
* the generic code.
*/
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+ if (extent_map_is_compressed(em) ||
em->block_start == EXTENT_MAP_INLINE) {
free_extent_map(em);
/*
@@ -7638,7 +7671,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
* that, since we have locked only the parts we are performing I/O in.
*/
if ((em->block_start == EXTENT_MAP_HOLE) ||
- (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
+ ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
} else {
@@ -7802,6 +7835,7 @@ struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
+ struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
int ret;
ret = fiemap_prep(inode, fieinfo, start, &len, 0);
@@ -7827,7 +7861,26 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return ret;
}
- return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
+ btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
+
+ /*
+ * We did an initial flush to avoid holding the inode's lock while
+ * triggering writeback and waiting for the completion of IO and ordered
+ * extents. Now after we locked the inode we do it again, because it's
+ * possible a new write may have happened in between those two steps.
+ */
+ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
+ ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
+ if (ret) {
+ btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
+ return ret;
+ }
+ }
+
+ ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
+ btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
+
+ return ret;
}
static int btrfs_writepages(struct address_space *mapping,
@@ -7851,13 +7904,14 @@ static void btrfs_readahead(struct readahead_control *rac)
static void wait_subpage_spinlock(struct page *page)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct folio *folio = page_folio(page);
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, page->mapping))
return;
- ASSERT(PagePrivate(page) && page->private);
- subpage = (struct btrfs_subpage *)page->private;
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
+ subpage = folio_get_private(folio);
/*
* This may look insane as we just acquire the spinlock and release it,
@@ -7995,7 +8049,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
page_end);
ASSERT(range_end + 1 - cur < U32_MAX);
range_len = range_end + 1 - cur;
- if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
+ if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
/*
* If Ordered (Private2) is cleared, it means endio has
* already been executed for the range.
@@ -8004,7 +8058,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
*/
goto next;
}
- btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
+ btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);
/*
* IO on this page will never be started, so we need to account
@@ -8074,7 +8128,7 @@ next:
* did something wrong.
*/
ASSERT(!folio_test_ordered(folio));
- btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
+ btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
if (!inode_evicting)
__btrfs_release_folio(folio, GFP_NOFS);
clear_page_extent_mapped(&folio->page);
@@ -8098,6 +8152,7 @@ next:
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
+ struct folio *folio = page_folio(page);
struct inode *inode = file_inode(vmf->vma->vm_file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -8114,6 +8169,8 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
u64 page_end;
u64 end;
+ ASSERT(folio_order(folio) == 0);
+
reserved_space = PAGE_SIZE;
sb_start_pagefault(inode->i_sb);
@@ -8217,9 +8274,9 @@ again:
if (zero_start != PAGE_SIZE)
memzero_page(page, zero_start, PAGE_SIZE - zero_start);
- btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
- btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
- btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
+ btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
+ btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
+ btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
@@ -8462,10 +8519,20 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_inode *ei;
struct inode *inode;
+ struct extent_io_tree *file_extent_tree = NULL;
+
+ /* Self tests may pass a NULL fs_info. */
+ if (fs_info && !btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
+ if (!file_extent_tree)
+ return NULL;
+ }
ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
- if (!ei)
+ if (!ei) {
+ kfree(file_extent_tree);
return NULL;
+ }
ei->root = NULL;
ei->generation = 0;
@@ -8501,10 +8568,18 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
+
+ /* This io tree sets the valid inode. */
extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
ei->io_tree.inode = ei;
- extent_io_tree_init(fs_info, &ei->file_extent_tree,
- IO_TREE_INODE_FILE_EXTENT);
+
+ ei->file_extent_tree = file_extent_tree;
+ if (file_extent_tree) {
+ extent_io_tree_init(fs_info, ei->file_extent_tree,
+ IO_TREE_INODE_FILE_EXTENT);
+ /* Lockdep class is set only for the file extent tree. */
+ lockdep_set_class(&ei->file_extent_tree->lock, &file_extent_tree_class);
+ }
mutex_init(&ei->log_mutex);
spin_lock_init(&ei->ordered_tree_lock);
ei->ordered_tree = RB_ROOT;
@@ -8521,12 +8596,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
void btrfs_test_destroy_inode(struct inode *inode)
{
btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
+ kfree(BTRFS_I(inode)->file_extent_tree);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
#endif
void btrfs_free_inode(struct inode *inode)
{
+ kfree(BTRFS_I(inode)->file_extent_tree);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
@@ -9632,7 +9709,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
em->block_len = ins.offset;
em->orig_block_len = ins.offset;
em->ram_bytes = ins.offset;
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ em->flags |= EXTENT_FLAG_PREALLOC;
em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
@@ -9785,7 +9862,9 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
page = find_get_page(inode->vfs_inode.i_mapping, index);
ASSERT(page); /* Pages should be in the extent_io_tree */
- btrfs_page_set_writeback(fs_info, page, start, len);
+ /* This is for data, which doesn't yet support larger folio. */
+ ASSERT(folio_order(page_folio(page)) == 0);
+ btrfs_folio_set_writeback(fs_info, page_folio(page), start, len);
put_page(page);
index++;
}
@@ -9994,7 +10073,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages)
return -ENOMEM;
- ret = btrfs_alloc_page_array(nr_pages, pages);
+ ret = btrfs_alloc_page_array(nr_pages, pages, 0);
if (ret) {
ret = -ENOMEM;
goto out;
@@ -10113,12 +10192,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
encoded->len = min_t(u64, extent_map_end(em),
inode->vfs_inode.i_size) - iocb->ki_pos;
if (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ (em->flags & EXTENT_FLAG_PREALLOC)) {
disk_bytenr = EXTENT_MAP_HOLE;
count = min_t(u64, count, encoded->len);
encoded->len = count;
encoded->unencoded_len = count;
- } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ } else if (extent_map_is_compressed(em)) {
disk_bytenr = em->block_start;
/*
* Bail if the buffer isn't large enough to return the whole
@@ -10133,7 +10212,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
encoded->unencoded_len = em->ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
ret = btrfs_encoded_io_compression_from_extent(fs_info,
- em->compress_type);
+ extent_map_compression(em));
if (ret < 0)
goto out_em;
encoded->compression = ret;
@@ -10229,6 +10308,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
return -EINVAL;
+ /*
+ * Compressed extents should always have checksums, so error out if we
+ * have a NOCOW file or inode was created while mounted with NODATASUM.
+ */
+ if (inode->flags & BTRFS_INODE_NODATASUM)
+ return -EINVAL;
+
orig_count = iov_iter_count(from);
/* The extent size must be sane. */
@@ -10564,6 +10650,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
struct extent_map *em = NULL;
+ struct btrfs_chunk_map *map = NULL;
struct btrfs_device *device = NULL;
struct btrfs_swap_info bsi = {
.lowest_ppage = (sector_t)-1ULL,
@@ -10680,7 +10767,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
ret = -EINVAL;
goto out;
}
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ if (extent_map_is_compressed(em)) {
btrfs_warn(fs_info, "swapfile must not be compressed");
ret = -EINVAL;
goto out;
@@ -10703,13 +10790,13 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
goto out;
}
- em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+ map = btrfs_get_chunk_map(fs_info, logical_block_start, len);
+ if (IS_ERR(map)) {
+ ret = PTR_ERR(map);
goto out;
}
- if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
btrfs_warn(fs_info,
"swapfile must have single data profile");
ret = -EINVAL;
@@ -10717,23 +10804,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
}
if (device == NULL) {
- device = em->map_lookup->stripes[0].dev;
+ device = map->stripes[0].dev;
ret = btrfs_add_swapfile_pin(inode, device, false);
if (ret == 1)
ret = 0;
else if (ret)
goto out;
- } else if (device != em->map_lookup->stripes[0].dev) {
+ } else if (device != map->stripes[0].dev) {
btrfs_warn(fs_info, "swapfile must be on one device");
ret = -EINVAL;
goto out;
}
- physical_block_start = (em->map_lookup->stripes[0].physical +
- (logical_block_start - em->start));
- len = min(len, em->len - (logical_block_start - em->start));
- free_extent_map(em);
- em = NULL;
+ physical_block_start = (map->stripes[0].physical +
+ (logical_block_start - map->start));
+ len = min(len, map->chunk_len - (logical_block_start - map->start));
+ btrfs_free_chunk_map(map);
+ map = NULL;
bg = btrfs_lookup_block_group(fs_info, logical_block_start);
if (!bg) {
@@ -10786,6 +10873,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
out:
if (!IS_ERR_OR_NULL(em))
free_extent_map(em);
+ if (!IS_ERR_OR_NULL(map))
+ btrfs_free_chunk_map(map);
unlock_extent(io_tree, 0, isize - 1, &cached_state);
@@ -10930,7 +11019,7 @@ static const struct address_space_operations btrfs_aops = {
.release_folio = btrfs_release_folio,
.migrate_folio = btrfs_migrate_folio,
.dirty_folio = filemap_dirty_folio,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = btrfs_swap_activate,
.swap_deactivate = btrfs_swap_deactivate,
};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a1743904202b..9d1eac15e09e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -721,7 +721,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
free_extent_buffer(leaf);
leaf = NULL;
- new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
+ new_root = btrfs_get_new_fs_root(fs_info, objectid, &anon_dev);
if (IS_ERR(new_root)) {
ret = PTR_ERR(new_root);
btrfs_abort_transaction(trans, ret);
@@ -790,6 +790,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
return -EOPNOTSUPP;
}
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return -ENOENT;
+
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return -EINVAL;
@@ -2608,6 +2611,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
ret = -EFAULT;
goto out;
}
+ if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
/* compression requires us to start the IO */
if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
@@ -3808,6 +3815,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
goto out;
}
+ if (sa->create && is_fstree(sa->qgroupid)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -4533,29 +4545,29 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
if (ret < 0)
goto out_acct;
- file_start_write(file);
-
if (iov_iter_count(&iter) == 0) {
ret = 0;
- goto out_end_write;
+ goto out_iov;
}
pos = args.offset;
ret = rw_verify_area(WRITE, file, &pos, args.len);
if (ret < 0)
- goto out_end_write;
+ goto out_iov;
init_sync_kiocb(&kiocb, file);
ret = kiocb_set_rw_flags(&kiocb, 0);
if (ret)
- goto out_end_write;
+ goto out_iov;
kiocb.ki_pos = pos;
+ file_start_write(file);
+
ret = btrfs_do_write_iter(&kiocb, &iter, &args);
if (ret > 0)
fsnotify_modify(file);
-out_end_write:
file_end_write(file);
+out_iov:
kfree(iov);
out_acct:
if (ret > 0)
diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c
index 0fe0ae54ac67..fd88af17d8d9 100644
--- a/fs/btrfs/lru_cache.c
+++ b/fs/btrfs/lru_cache.c
@@ -9,7 +9,7 @@
*
* @cache: The cache.
* @max_size: Maximum size (number of entries) for the cache.
- * Use 0 for unlimited size, it's the user's responsability to
+ * Use 0 for unlimited size, it's the user's responsibility to
* trim the cache in that case.
*/
void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size)
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index d3fcfc628a4f..e43bc0fdc74e 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -152,7 +152,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
cur_page = out_pages[*cur_out / PAGE_SIZE];
/* Allocate a new page */
if (!cur_page) {
- cur_page = alloc_page(GFP_NOFS);
+ cur_page = btrfs_alloc_compr_page();
if (!cur_page)
return -ENOMEM;
out_pages[*cur_out / PAGE_SIZE] = cur_page;
@@ -178,7 +178,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
cur_page = out_pages[*cur_out / PAGE_SIZE];
/* Allocate a new page */
if (!cur_page) {
- cur_page = alloc_page(GFP_NOFS);
+ cur_page = btrfs_alloc_compr_page();
if (!cur_page)
return -ENOMEM;
out_pages[*cur_out / PAGE_SIZE] = cur_page;
@@ -425,16 +425,16 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
}
int lzo_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
size_t in_len;
size_t out_len;
size_t max_segment_len = WORKSPACE_BUF_LENGTH;
int ret = 0;
- char *kaddr;
- unsigned long bytes;
if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)
return -EUCLEAN;
@@ -451,7 +451,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
}
data_in += LZO_LEN;
- out_len = PAGE_SIZE;
+ out_len = sectorsize;
ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
if (ret != LZO_E_OK) {
pr_warn("BTRFS: decompress failed!\n");
@@ -459,29 +459,13 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
goto out;
}
- if (out_len < start_byte) {
+ ASSERT(out_len <= sectorsize);
+ memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len);
+ /* Early end, considered as an error. */
+ if (unlikely(out_len < destlen)) {
ret = -EIO;
- goto out;
+ memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len);
}
-
- /*
- * the caller is already checking against PAGE_SIZE, but lets
- * move this check closer to the memcpy/memset
- */
- destlen = min_t(unsigned long, destlen, PAGE_SIZE);
- bytes = min_t(unsigned long, destlen, out_len - start_byte);
-
- kaddr = kmap_local_page(dest_page);
- memcpy(kaddr, workspace->buf + start_byte, bytes);
-
- /*
- * btrfs_getblock is doing a zero on the tail of the page too,
- * but this will cover anything missing from the decompressed
- * data.
- */
- if (bytes < destlen)
- memset(kaddr+bytes, 0, destlen-bytes);
- kunmap_local(kaddr);
out:
return ret;
}
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index b8f9c9e56c8c..cdada4865837 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -287,7 +287,7 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
* panic or BUGs, depending on mount options.
*/
__cold
-void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int error, const char *fmt, ...)
{
char *s_id = "<unknown>";
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index 4d04c1fa5899..08a9272399d2 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -194,7 +194,7 @@ const char * __attribute_const__ btrfs_decode_error(int error);
__printf(5, 6)
__cold
-void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int error, const char *fmt, ...);
/*
* If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a82e1417c4d2..59850dc17b22 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -323,9 +323,10 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
*
* If there's no such bit, we need to skip to next range.
*/
- if (!btrfs_page_test_ordered(fs_info, page, file_offset, len))
+ if (!btrfs_folio_test_ordered(fs_info, page_folio(page),
+ file_offset, len))
return false;
- btrfs_page_clear_ordered(fs_info, page, file_offset, len);
+ btrfs_folio_clear_ordered(fs_info, page_folio(page), file_offset, len);
}
/* Now we're fine to update the accounting. */
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 567a6d3d4712..127ef8bf0ffd 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -97,13 +97,6 @@ struct btrfs_ordered_extent {
u64 bytes_left;
/*
- * the end of the ordered extent which is behind it but
- * didn't update disk_i_size. Please see the comment of
- * btrfs_ordered_update_i_size();
- */
- u64 outstanding_isize;
-
- /*
* If we get truncated we need to adjust the file extent we enter for
* this ordered extent so that we do not expose stale data.
*/
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index e46774e8f49f..5470e1cdf10c 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -194,7 +194,7 @@ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
*
* Must be called with qgroup_lock held and @prealloc preallocated.
*
- * The control on the lifespan of @prealloc would be transfered to this
+ * The control on the lifespan of @prealloc would be transferred to this
* function, thus caller should no longer touch @prealloc.
*/
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
@@ -1736,6 +1736,15 @@ out:
return ret;
}
+static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
+{
+ return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 ||
+ qgroup->excl > 0 || qgroup->excl_cmpr > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0);
+}
+
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1755,6 +1764,11 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
goto out;
}
+ if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
/* Check if there are no children of this qgroup */
if (!list_empty(&qgroup->members)) {
ret = -EBUSY;
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 3e014b9370a3..792c8e17c31d 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -964,7 +964,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
int ret;
- ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
+ ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, 0);
if (ret < 0)
return ret;
/* Mapping all sectors */
@@ -979,7 +979,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
int ret;
ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
- rbio->stripe_pages + data_pages);
+ rbio->stripe_pages + data_pages, 0);
if (ret < 0)
return ret;
@@ -1530,7 +1530,7 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
const int data_pages = rbio->nr_data * rbio->stripe_npages;
int ret;
- ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages);
+ ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, 0);
if (ret < 0)
return ret;
@@ -1549,7 +1549,6 @@ struct btrfs_plug_cb {
struct blk_plug_cb cb;
struct btrfs_fs_info *info;
struct list_head rbio_list;
- struct work_struct work;
};
/*
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 45e6ff78316f..470213688872 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -164,7 +164,7 @@ struct raid56_bio_trace_info {
u8 stripe_nr;
};
-static inline int nr_data_stripes(const struct map_lookup *map)
+static inline int nr_data_stripes(const struct btrfs_chunk_map *map)
{
return map->num_stripes - btrfs_nr_parity_stripes(map->type);
}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 6486f0d7e993..8c4fc98ca9ce 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -889,8 +889,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
out_unlock:
spin_unlock(&fs_info->ref_verify_lock);
out:
- if (ret)
+ if (ret) {
+ btrfs_free_ref_cache(fs_info);
btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+ }
return ret;
}
@@ -1021,8 +1023,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
}
}
if (ret) {
- btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
btrfs_free_ref_cache(fs_info);
+ btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
}
btrfs_free_path(path);
return ret;
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index f88b0c2ac3fe..ae90894dc7dc 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -141,9 +141,9 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
if (datal < block_size)
memzero_page(page, datal, block_size - datal);
- btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
- btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
- btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
+ btrfs_folio_set_uptodate(fs_info, page_folio(page), file_offset, block_size);
+ btrfs_folio_clear_checked(fs_info, page_folio(page), file_offset, block_size);
+ btrfs_folio_set_dirty(fs_info, page_folio(page), file_offset, block_size);
out_unlock:
if (page) {
unlock_page(page);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f5d9e5f74a52..abe594f77f99 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2895,7 +2895,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
* will re-read the whole page anyway.
*/
if (page) {
- btrfs_subpage_clear_uptodate(fs_info, page, i_size,
+ btrfs_subpage_clear_uptodate(fs_info, page_folio(page), i_size,
round_up(i_size, PAGE_SIZE) - i_size);
unlock_page(page);
put_page(page);
@@ -2951,7 +2951,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod
em->len = end + 1 - start;
em->block_len = em->len;
em->block_start = block_start;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->flags |= EXTENT_FLAG_PINNED;
lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false);
@@ -3070,7 +3070,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
clamped_len);
goto release_page;
}
- btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len);
+ btrfs_folio_set_dirty(fs_info, page_folio(page),
+ clamped_start, clamped_len);
/*
* Set the boundary if it's inside the page.
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f62a408671cb..0123d2728923 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -43,7 +43,7 @@ struct scrub_ctx;
/*
* The following value only influences the performance.
*
- * This detemines how many stripes would be submitted in one go,
+ * This determines how many stripes would be submitted in one go,
* which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP).
*/
#define SCRUB_STRIPES_PER_GROUP 8
@@ -192,7 +192,6 @@ struct scrub_ctx {
int cur_stripe;
atomic_t cancel_req;
int readonly;
- int sectors_per_bio;
/* State of IO submission throttling affecting the associated device */
ktime_t throttle_deadline;
@@ -262,7 +261,7 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
atomic_set(&stripe->pending_io, 0);
spin_lock_init(&stripe->write_error_lock);
- ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages);
+ ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, 0);
if (ret < 0)
goto error;
@@ -710,7 +709,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
/* Metadata, verify the full tree block. */
if (sector->is_metadata) {
/*
- * Check if the tree block crosses the stripe boudary. If
+ * Check if the tree block crosses the stripe boundary. If
* crossed the boundary, we cannot verify it but only give a
* warning.
*
@@ -884,7 +883,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
/*
* Init needed infos for error reporting.
*
- * Although our scrub_stripe infrastucture is mostly based on btrfs_submit_bio()
+ * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
* thus no need for dev/physical, error reporting still needs dev and physical.
*/
if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
@@ -1099,12 +1098,22 @@ out:
static void scrub_read_endio(struct btrfs_bio *bbio)
{
struct scrub_stripe *stripe = bbio->private;
+ struct bio_vec *bvec;
+ int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
+ int num_sectors;
+ u32 bio_size = 0;
+ int i;
+
+ ASSERT(sector_nr < stripe->nr_sectors);
+ bio_for_each_bvec_all(bvec, &bbio->bio, i)
+ bio_size += bvec->bv_len;
+ num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;
if (bbio->bio.bi_status) {
- bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
- bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors);
+ bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors);
+ bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors);
} else {
- bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
+ bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors);
}
bio_put(&bbio->bio);
if (atomic_dec_and_test(&stripe->pending_io)) {
@@ -1280,7 +1289,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d
* return 0 if it is a data stripe, 1 means parity stripe.
*/
static int get_raid56_logic_offset(u64 physical, int num,
- struct map_lookup *map, u64 *offset,
+ struct btrfs_chunk_map *map, u64 *offset,
u64 *stripe_start)
{
int i;
@@ -1409,14 +1418,11 @@ search_forward:
if (ret > 0)
break;
next:
- path->slots[0]++;
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(extent_root, path);
- if (ret) {
- /* Either no more item or fatal error */
- btrfs_release_path(path);
- return ret;
- }
+ ret = btrfs_next_item(extent_root, path);
+ if (ret) {
+ /* Either no more items or a fatal error. */
+ btrfs_release_path(path);
+ return ret;
}
}
btrfs_release_path(path);
@@ -1640,6 +1646,9 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
{
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct btrfs_bio *bbio = NULL;
+ unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
+ stripe->bg->length - stripe->logical) >>
+ fs_info->sectorsize_bits;
u64 stripe_len = BTRFS_STRIPE_LEN;
int mirror = stripe->mirror_num;
int i;
@@ -1650,6 +1659,10 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
struct page *page = scrub_stripe_get_page(stripe, i);
unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i);
+ /* We're beyond the chunk boundary, no need to read anymore. */
+ if (i >= nr_sectors)
+ break;
+
/* The current sector cannot be merged, submit the bio. */
if (bbio &&
((i > 0 &&
@@ -1705,6 +1718,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_bio *bbio;
+ unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
+ stripe->bg->length - stripe->logical) >>
+ fs_info->sectorsize_bits;
int mirror = stripe->mirror_num;
ASSERT(stripe->bg);
@@ -1719,14 +1735,16 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
scrub_read_endio, stripe);
- /* Read the whole stripe. */
bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
- for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) {
+ /* Read the whole range inside the chunk boundary. */
+ for (unsigned int cur = 0; cur < nr_sectors; cur++) {
+ struct page *page = scrub_stripe_get_page(stripe, cur);
+ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur);
int ret;
- ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0);
+ ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
/* We should have allocated enough bio vectors. */
- ASSERT(ret == PAGE_SIZE);
+ ASSERT(ret == fs_info->sectorsize);
}
atomic_inc(&stripe->pending_io);
@@ -1816,7 +1834,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
if (sctx->is_dev_replace) {
/*
* For dev-replace, if we know there is something wrong with
- * metadata, we should immedately abort.
+ * metadata, we should immediately abort.
*/
for (int i = 0; i < nr_stripes; i++) {
if (stripe_has_metadata_error(&sctx->stripes[i])) {
@@ -1898,7 +1916,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
u64 full_stripe_start)
{
DECLARE_COMPLETION_ONSTACK(io_done);
@@ -2067,7 +2085,7 @@ out:
*/
static int scrub_simple_mirror(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
u64 logical_start, u64 logical_length,
struct btrfs_device *device,
u64 physical, int mirror_num)
@@ -2128,7 +2146,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
}
/* Calculate the full stripe length for simple stripe based profiles */
-static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
+static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map)
{
ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10));
@@ -2137,7 +2155,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
}
/* Get the logical bytenr for the stripe */
-static u64 simple_stripe_get_logical(struct map_lookup *map,
+static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map,
struct btrfs_block_group *bg,
int stripe_index)
{
@@ -2154,7 +2172,7 @@ static u64 simple_stripe_get_logical(struct map_lookup *map,
}
/* Get the mirror number for the stripe */
-static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
+static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index)
{
ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10));
@@ -2166,7 +2184,7 @@ static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
static int scrub_simple_stripe(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct btrfs_device *device,
int stripe_index)
{
@@ -2199,18 +2217,17 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx,
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct extent_map *em,
+ struct btrfs_chunk_map *map,
struct btrfs_device *scrub_dev,
int stripe_index)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct map_lookup *map = em->map_lookup;
const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
const u64 chunk_logical = bg->start;
int ret;
int ret2;
u64 physical = map->stripes[stripe_index].physical;
- const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
+ const u64 dev_stripe_len = btrfs_calc_stripe_length(map);
const u64 physical_end = physical + dev_stripe_len;
u64 logical;
u64 logic_end;
@@ -2373,17 +2390,12 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
u64 dev_extent_len)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct map_lookup *map;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
int i;
int ret = 0;
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, bg->start, bg->length);
- read_unlock(&map_tree->lock);
-
- if (!em) {
+ map = btrfs_find_chunk_map(fs_info, bg->start, bg->length);
+ if (!map) {
/*
* Might have been an unused block group deleted by the cleaner
* kthread or relocation.
@@ -2395,22 +2407,21 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
return ret;
}
- if (em->start != bg->start)
+ if (map->start != bg->start)
goto out;
- if (em->len < dev_extent_len)
+ if (map->chunk_len < dev_extent_len)
goto out;
- map = em->map_lookup;
for (i = 0; i < map->num_stripes; ++i) {
if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
map->stripes[i].physical == dev_offset) {
- ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
+ ret = scrub_stripe(sctx, bg, map, scrub_dev, i);
if (ret)
goto out;
}
}
out:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 4e36550618e5..e48a063ef085 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -6705,11 +6705,20 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
if (ret)
goto out;
}
- if (sctx->cur_inode_last_extent <
- sctx->cur_inode_size) {
- ret = send_hole(sctx, sctx->cur_inode_size);
- if (ret)
+ if (sctx->cur_inode_last_extent < sctx->cur_inode_size) {
+ ret = range_is_hole_in_parent(sctx,
+ sctx->cur_inode_last_extent,
+ sctx->cur_inode_size);
+ if (ret < 0) {
goto out;
+ } else if (ret == 0) {
+ ret = send_hole(sctx, sctx->cur_inode_size);
+ if (ret < 0)
+ goto out;
+ } else {
+ /* Range is already a hole, skip. */
+ ret = 0;
+ }
}
}
if (need_truncate) {
@@ -8111,7 +8120,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
}
if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
- ret = -EINVAL;
+ ret = -EOPNOTSUPP;
goto out;
}
@@ -8205,8 +8214,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
goto out;
}
- sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
- arg->clone_sources_count + 1,
+ sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1,
+ sizeof(*sctx->clone_roots),
GFP_KERNEL);
if (!sctx->clone_roots) {
ret = -ENOMEM;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 571bb13587d5..3b54eb583474 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -856,7 +856,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info)
{
- u64 global_rsv_size = fs_info->global_block_rsv.reserved;
+ const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv);
u64 ordered, delalloc;
u64 thresh;
u64 used;
@@ -956,8 +956,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
if (ordered >= delalloc)
- used += fs_info->delayed_refs_rsv.reserved +
- fs_info->delayed_block_rsv.reserved;
+ used += btrfs_block_rsv_reserved(&fs_info->delayed_refs_rsv) +
+ btrfs_block_rsv_reserved(&fs_info->delayed_block_rsv);
else
used += space_info->bytes_may_use - global_rsv_size;
@@ -1173,7 +1173,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
enum btrfs_flush_state flush;
u64 delalloc_size = 0;
u64 to_reclaim, block_rsv_size;
- u64 global_rsv_size = global_rsv->reserved;
+ const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv);
loops++;
@@ -1185,9 +1185,9 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* assume it's tied up in delalloc reservations.
*/
block_rsv_size = global_rsv_size +
- delayed_block_rsv->reserved +
- delayed_refs_rsv->reserved +
- trans_rsv->reserved;
+ btrfs_block_rsv_reserved(delayed_block_rsv) +
+ btrfs_block_rsv_reserved(delayed_refs_rsv) +
+ btrfs_block_rsv_reserved(trans_rsv);
if (block_rsv_size < space_info->bytes_may_use)
delalloc_size = space_info->bytes_may_use - block_rsv_size;
@@ -1207,16 +1207,16 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
to_reclaim = delalloc_size;
flush = FLUSH_DELALLOC;
} else if (space_info->bytes_pinned >
- (delayed_block_rsv->reserved +
- delayed_refs_rsv->reserved)) {
+ (btrfs_block_rsv_reserved(delayed_block_rsv) +
+ btrfs_block_rsv_reserved(delayed_refs_rsv))) {
to_reclaim = space_info->bytes_pinned;
flush = COMMIT_TRANS;
- } else if (delayed_block_rsv->reserved >
- delayed_refs_rsv->reserved) {
- to_reclaim = delayed_block_rsv->reserved;
+ } else if (btrfs_block_rsv_reserved(delayed_block_rsv) >
+ btrfs_block_rsv_reserved(delayed_refs_rsv)) {
+ to_reclaim = btrfs_block_rsv_reserved(delayed_block_rsv);
flush = FLUSH_DELAYED_ITEMS_NR;
} else {
- to_reclaim = delayed_refs_rsv->reserved;
+ to_reclaim = btrfs_block_rsv_reserved(delayed_refs_rsv);
flush = FLUSH_DELAYED_REFS_NR;
}
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 1b999c6e4193..0e49dab8dad2 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -64,7 +64,7 @@
* This means a slightly higher tree locking latency.
*/
-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page)
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping)
{
if (fs_info->sectorsize >= PAGE_SIZE)
return false;
@@ -74,8 +74,7 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page)
* mapping. And if page->mapping->host is data inode, it's subpage.
* As we have ruled our sectorsize >= PAGE_SIZE case already.
*/
- if (!page->mapping || !page->mapping->host ||
- is_data_inode(page->mapping->host))
+ if (!mapping || !mapping->host || is_data_inode(mapping->host))
return true;
/*
@@ -116,7 +115,7 @@ void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sector
}
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
- struct page *page, enum btrfs_subpage_type type)
+ struct folio *folio, enum btrfs_subpage_type type)
{
struct btrfs_subpage *subpage;
@@ -124,31 +123,30 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
* We have cases like a dummy extent buffer page, which is not mapped
* and doesn't need to be locked.
*/
- if (page->mapping)
- ASSERT(PageLocked(page));
+ if (folio->mapping)
+ ASSERT(folio_test_locked(folio));
- /* Either not subpage, or the page already has private attached */
- if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page))
+ /* Either not subpage, or the folio already has private attached. */
+ if (!btrfs_is_subpage(fs_info, folio->mapping) || folio_test_private(folio))
return 0;
subpage = btrfs_alloc_subpage(fs_info, type);
if (IS_ERR(subpage))
return PTR_ERR(subpage);
- attach_page_private(page, subpage);
+ folio_attach_private(folio, subpage);
return 0;
}
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
- struct page *page)
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
struct btrfs_subpage *subpage;
- /* Either not subpage, or already detached */
- if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page))
+ /* Either not subpage, or the folio already has private attached. */
+ if (!btrfs_is_subpage(fs_info, folio->mapping) || !folio_test_private(folio))
return;
- subpage = detach_page_private(page);
+ subpage = folio_detach_private(folio);
ASSERT(subpage);
btrfs_free_subpage(subpage);
}
@@ -188,77 +186,78 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage)
* This is important for eb allocation, to prevent race with last eb freeing
* of the same page.
* With the eb_refs increased before the eb inserted into radix tree,
- * detach_extent_buffer_page() won't detach the page private while we're still
+ * detach_extent_buffer_page() won't detach the folio private while we're still
* allocating the extent buffer.
*/
-void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
- struct page *page)
+void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
- ASSERT(PagePrivate(page) && page->mapping);
- lockdep_assert_held(&page->mapping->private_lock);
+ ASSERT(folio_test_private(folio) && folio->mapping);
+ lockdep_assert_held(&folio->mapping->i_private_lock);
- subpage = (struct btrfs_subpage *)page->private;
+ subpage = folio_get_private(folio);
atomic_inc(&subpage->eb_refs);
}
-void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
- struct page *page)
+void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
- ASSERT(PagePrivate(page) && page->mapping);
- lockdep_assert_held(&page->mapping->private_lock);
+ ASSERT(folio_test_private(folio) && folio->mapping);
+ lockdep_assert_held(&folio->mapping->i_private_lock);
- subpage = (struct btrfs_subpage *)page->private;
+ subpage = folio_get_private(folio);
ASSERT(atomic_read(&subpage->eb_refs));
atomic_dec(&subpage->eb_refs);
}
static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
+ /* For subpage support, the folio must be single page. */
+ ASSERT(folio_order(folio) == 0);
+
/* Basic checks */
- ASSERT(PagePrivate(page) && page->private);
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(len, fs_info->sectorsize));
/*
* The range check only works for mapped page, we can still have
* unmapped page like dummy extent buffer pages.
*/
- if (page->mapping)
- ASSERT(page_offset(page) <= start &&
- start + len <= page_offset(page) + PAGE_SIZE);
+ if (folio->mapping)
+ ASSERT(folio_pos(folio) <= start &&
+ start + len <= folio_pos(folio) + PAGE_SIZE);
}
void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
const int nbits = len >> fs_info->sectorsize_bits;
- btrfs_subpage_assert(fs_info, page, start, len);
+ btrfs_subpage_assert(fs_info, folio, start, len);
atomic_add(nbits, &subpage->readers);
}
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
const int nbits = len >> fs_info->sectorsize_bits;
bool is_data;
bool last;
- btrfs_subpage_assert(fs_info, page, start, len);
- is_data = is_data_inode(page->mapping->host);
+ btrfs_subpage_assert(fs_info, folio, start, len);
+ is_data = is_data_inode(folio->mapping->host);
ASSERT(atomic_read(&subpage->readers) >= nbits);
last = atomic_sub_and_test(nbits, &subpage->readers);
@@ -270,35 +269,35 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
* As we want the atomic_sub_and_test() to be always executed.
*/
if (is_data && last)
- unlock_page(page);
+ folio_unlock(folio);
}
-static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
+static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
{
u64 orig_start = *start;
u32 orig_len = *len;
- *start = max_t(u64, page_offset(page), orig_start);
+ *start = max_t(u64, folio_pos(folio), orig_start);
/*
* For certain call sites like btrfs_drop_pages(), we may have pages
* beyond the target range. In that case, just set @len to 0, subpage
* helpers can handle @len == 0 without any problem.
*/
- if (page_offset(page) >= orig_start + orig_len)
+ if (folio_pos(folio) >= orig_start + orig_len)
*len = 0;
else
- *len = min_t(u64, page_offset(page) + PAGE_SIZE,
+ *len = min_t(u64, folio_pos(folio) + PAGE_SIZE,
orig_start + orig_len) - *start;
}
void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
const int nbits = (len >> fs_info->sectorsize_bits);
int ret;
- btrfs_subpage_assert(fs_info, page, start, len);
+ btrfs_subpage_assert(fs_info, folio, start, len);
ASSERT(atomic_read(&subpage->readers) == 0);
ret = atomic_add_return(nbits, &subpage->writers);
@@ -306,12 +305,12 @@ void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
}
bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
const int nbits = (len >> fs_info->sectorsize_bits);
- btrfs_subpage_assert(fs_info, page, start, len);
+ btrfs_subpage_assert(fs_info, folio, start, len);
/*
* We have call sites passing @lock_page into
@@ -328,7 +327,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
}
/*
- * Lock a page for delalloc page writeback.
+ * Lock a folio for delalloc page writeback.
*
* Return -EAGAIN if the page is not properly initialized.
* Return 0 with the page locked, and writer counter updated.
@@ -337,38 +336,40 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
* it's really the correct page, as the caller is using
* filemap_get_folios_contig(), which can race with page invalidating.
*/
-int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) {
- lock_page(page);
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+ folio_lock(folio);
return 0;
}
- lock_page(page);
- if (!PagePrivate(page) || !page->private) {
- unlock_page(page);
+ folio_lock(folio);
+ if (!folio_test_private(folio) || !folio_get_private(folio)) {
+ folio_unlock(folio);
return -EAGAIN;
}
- btrfs_subpage_clamp_range(page, &start, &len);
- btrfs_subpage_start_writer(fs_info, page, start, len);
+ btrfs_subpage_clamp_range(folio, &start, &len);
+ btrfs_subpage_start_writer(fs_info, folio, start, len);
return 0;
}
-void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page))
- return unlock_page(page);
- btrfs_subpage_clamp_range(page, &start, &len);
- if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
- unlock_page(page);
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+ folio_unlock(folio);
+ return;
+ }
+ btrfs_subpage_clamp_range(folio, &start, &len);
+ if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len))
+ folio_unlock(folio);
}
-#define subpage_calc_start_bit(fs_info, page, name, start, len) \
+#define subpage_calc_start_bit(fs_info, folio, name, start, len) \
({ \
unsigned int start_bit; \
\
- btrfs_subpage_assert(fs_info, page, start, len); \
+ btrfs_subpage_assert(fs_info, folio, start, len); \
start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
start_bit += fs_info->subpage_info->name##_offset; \
start_bit; \
@@ -385,46 +386,46 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
fs_info->subpage_info->bitmap_nr_bits)
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
uptodate, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate))
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
uptodate, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
dirty, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
spin_unlock_irqrestore(&subpage->lock, flags);
- set_page_dirty(page);
+ folio_mark_dirty(folio);
}
/*
@@ -438,10 +439,10 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
* extra handling for tree blocks.
*/
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
dirty, start, len);
unsigned long flags;
bool last = false;
@@ -455,101 +456,102 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
}
void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
bool last;
- last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
+ last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, start, len);
if (last)
- clear_page_dirty_for_io(page);
+ folio_clear_dirty_for_io(folio);
}
void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
writeback, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- set_page_writeback(page);
+ if (!folio_test_writeback(folio))
+ folio_start_writeback(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
writeback, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) {
- ASSERT(PageWriteback(page));
- end_page_writeback(page);
+ ASSERT(folio_test_writeback(folio));
+ folio_end_writeback(folio);
}
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
ordered, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- SetPageOrdered(page);
+ folio_set_ordered(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
ordered, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered))
- ClearPageOrdered(page);
+ folio_clear_ordered(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
checked, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_set(fs_info, subpage, checked))
- SetPageChecked(page);
+ folio_set_checked(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
checked, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- ClearPageChecked(page);
+ folio_clear_checked(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -559,10 +561,10 @@ void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
*/
#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+ struct folio *folio, u64 start, u32 len) \
{ \
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \
+ struct btrfs_subpage *subpage = folio_get_private(folio); \
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, \
name, start, len); \
unsigned long flags; \
bool ret; \
@@ -584,88 +586,94 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);
* in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall
* back to regular sectorsize branch.
*/
-#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func, \
- test_page_func) \
-void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+#define IMPLEMENT_BTRFS_PAGE_OPS(name, folio_set_func, \
+ folio_clear_func, folio_test_func) \
+void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
- set_page_func(page); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ folio_set_func(folio); \
return; \
} \
- btrfs_subpage_set_##name(fs_info, page, start, len); \
+ btrfs_subpage_set_##name(fs_info, folio, start, len); \
} \
-void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
- clear_page_func(page); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ folio_clear_func(folio); \
return; \
} \
- btrfs_subpage_clear_##name(fs_info, page, start, len); \
+ btrfs_subpage_clear_##name(fs_info, folio, start, len); \
} \
-bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
- return test_page_func(page); \
- return btrfs_subpage_test_##name(fs_info, page, start, len); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) \
+ return folio_test_func(folio); \
+ return btrfs_subpage_test_##name(fs_info, folio, start, len); \
} \
-void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
- set_page_func(page); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ folio_set_func(folio); \
return; \
} \
- btrfs_subpage_clamp_range(page, &start, &len); \
- btrfs_subpage_set_##name(fs_info, page, start, len); \
+ btrfs_subpage_clamp_range(folio, &start, &len); \
+ btrfs_subpage_set_##name(fs_info, folio, start, len); \
} \
-void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
- clear_page_func(page); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ folio_clear_func(folio); \
return; \
} \
- btrfs_subpage_clamp_range(page, &start, &len); \
- btrfs_subpage_clear_##name(fs_info, page, start, len); \
+ btrfs_subpage_clamp_range(folio, &start, &len); \
+ btrfs_subpage_clear_##name(fs_info, folio, start, len); \
} \
-bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
- return test_page_func(page); \
- btrfs_subpage_clamp_range(page, &start, &len); \
- return btrfs_subpage_test_##name(fs_info, page, start, len); \
-}
-IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
- PageUptodate);
-IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
- PageDirty);
-IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
- PageWriteback);
-IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
- PageOrdered);
-IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked);
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) \
+ return folio_test_func(folio); \
+ btrfs_subpage_clamp_range(folio, &start, &len); \
+ return btrfs_subpage_test_##name(fs_info, folio, start, len); \
+}
+IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate,
+ folio_test_uptodate);
+IMPLEMENT_BTRFS_PAGE_OPS(dirty, folio_mark_dirty, folio_clear_dirty_for_io,
+ folio_test_dirty);
+IMPLEMENT_BTRFS_PAGE_OPS(writeback, folio_start_writeback, folio_end_writeback,
+ folio_test_writeback);
+IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered,
+ folio_test_ordered);
+IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
+ folio_test_checked);
/*
* Make sure not only the page dirty bit is cleared, but also subpage dirty bit
* is cleared.
*/
-void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page)
+void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
return;
- ASSERT(!PageDirty(page));
- if (!btrfs_is_subpage(fs_info, page))
+ ASSERT(!folio_test_dirty(folio));
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
- ASSERT(PagePrivate(page) && page->private);
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty));
}
@@ -684,18 +692,20 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
* extent_write_locked_range().
* In this case, we have to call subpage helper to handle the case.
*/
-void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
- u64 start, u32 len)
+void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage;
- ASSERT(PageLocked(page));
+ ASSERT(folio_test_locked(folio));
/* For non-subpage case, we just unlock the page */
- if (!btrfs_is_subpage(fs_info, page))
- return unlock_page(page);
+ if (!btrfs_is_subpage(fs_info, folio->mapping)) {
+ folio_unlock(folio);
+ return;
+ }
- ASSERT(PagePrivate(page) && page->private);
- subpage = (struct btrfs_subpage *)page->private;
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
+ subpage = folio_get_private(folio);
/*
* For subpage case, there are two types of locked page. With or
@@ -704,12 +714,14 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
* Since we own the page lock, no one else could touch subpage::writers
* and we are safe to do several atomic operations without spinlock.
*/
- if (atomic_read(&subpage->writers) == 0)
+ if (atomic_read(&subpage->writers) == 0) {
/* No writers, locked by plain lock_page() */
- return unlock_page(page);
+ folio_unlock(folio);
+ return;
+ }
/* Have writers, use proper subpage helper to end it */
- btrfs_page_end_writer_lock(fs_info, page, start, len);
+ btrfs_folio_end_writer_lock(fs_info, folio, start, len);
}
#define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \
@@ -717,7 +729,7 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
subpage_info->name##_offset, subpage_info->bitmap_nr_bits)
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage_info *subpage_info = fs_info->subpage_info;
struct btrfs_subpage *subpage;
@@ -729,9 +741,9 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
unsigned long checked_bitmap;
unsigned long flags;
- ASSERT(PagePrivate(page) && page->private);
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(subpage_info);
- subpage = (struct btrfs_subpage *)page->private;
+ subpage = folio_get_private(folio);
spin_lock_irqsave(&subpage->lock, flags);
GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap);
@@ -741,10 +753,10 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap);
spin_unlock_irqrestore(&subpage->lock, flags);
- dump_page(page, "btrfs subpage dump");
+ dump_page(folio_page(folio, 0), "btrfs subpage dump");
btrfs_warn(fs_info,
"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl error=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
- start, len, page_offset(page),
+ start, len, folio_pos(folio),
subpage_info->bitmap_nr_bits, &uptodate_bitmap,
subpage_info->bitmap_nr_bits, &error_bitmap,
subpage_info->bitmap_nr_bits, &dirty_bitmap,
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 5cbf67ccbdeb..793c2b314a58 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -73,71 +73,68 @@ enum btrfs_subpage_type {
BTRFS_SUBPAGE_DATA,
};
-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page);
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping);
void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
- struct page *page, enum btrfs_subpage_type type);
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
- struct page *page);
+ struct folio *folio, enum btrfs_subpage_type type);
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio);
/* Allocate additional data where page represents more than one sector */
struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
enum btrfs_subpage_type type);
void btrfs_free_subpage(struct btrfs_subpage *subpage);
-void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
- struct page *page);
-void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
- struct page *page);
+void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
+void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
-int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
-void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
+int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
+void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
/*
* Template for subpage related operations.
*
- * btrfs_subpage_*() are for call sites where the page has subpage attached and
- * the range is ensured to be inside the page.
+ * btrfs_subpage_*() are for call sites where the folio has subpage attached and
+ * the range is ensured to be inside the folio's single page.
*
- * btrfs_page_*() are for call sites where the page can either be subpage
- * specific or regular page. The function will handle both cases.
- * But the range still needs to be inside the page.
+ * btrfs_folio_*() are for call sites where the page can either be subpage
+ * specific or regular folios. The function will handle both cases.
+ * But the range still needs to be inside one single page.
*
- * btrfs_page_clamp_*() are similar to btrfs_page_*(), except the range doesn't
+ * btrfs_folio_clamp_*() are similar to btrfs_folio_*(), except the range doesn't
* need to be inside the page. Those functions will truncate the range
* automatically.
*/
#define DECLARE_BTRFS_SUBPAGE_OPS(name) \
void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
+ struct folio *folio, u64 start, u32 len); \
void btrfs_subpage_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
+ struct folio *folio, u64 start, u32 len); \
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len);
DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
DECLARE_BTRFS_SUBPAGE_OPS(dirty);
@@ -146,13 +143,12 @@ DECLARE_BTRFS_SUBPAGE_OPS(ordered);
DECLARE_BTRFS_SUBPAGE_OPS(checked);
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
-void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page);
-void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
- u64 start, u32 len);
+void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio);
+void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ef256b944c72..101f786963d4 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -27,6 +27,7 @@
#include <linux/crc32c.h>
#include <linux/btrfs.h>
#include <linux/security.h>
+#include <linux/fs_parser.h>
#include "messages.h"
#include "delayed-inode.h"
#include "ctree.h"
@@ -64,19 +65,7 @@
#include <trace/events/btrfs.h>
static const struct super_operations btrfs_super_ops;
-
-/*
- * Types for mounting the default subvolume and a subvolume explicitly
- * requested by subvol=/path. That way the callchain is straightforward and we
- * don't have to play tricks with the mount options and recursive calls to
- * btrfs_mount.
- *
- * The new btrfs_root_fs_type also servers as a tag for the bdev_holder.
- */
static struct file_system_type btrfs_fs_type;
-static struct file_system_type btrfs_root_fs_type;
-
-static int btrfs_remount(struct super_block *sb, int *flags, char *data);
static void btrfs_put_super(struct super_block *sb)
{
@@ -86,8 +75,22 @@ static void btrfs_put_super(struct super_block *sb)
close_ctree(fs_info);
}
+/* Store the mount options related information. */
+struct btrfs_fs_context {
+ char *subvol_name;
+ u64 subvol_objectid;
+ u64 max_inline;
+ u32 commit_interval;
+ u32 metadata_ratio;
+ u32 thread_pool_size;
+ unsigned long mount_opt;
+ unsigned long compress_type:4;
+ unsigned int compress_level;
+ refcount_t refs;
+};
+
enum {
- Opt_acl, Opt_noacl,
+ Opt_acl,
Opt_clear_cache,
Opt_commit_interval,
Opt_compress,
@@ -97,27 +100,26 @@ enum {
Opt_degraded,
Opt_device,
Opt_fatal_errors,
- Opt_flushoncommit, Opt_noflushoncommit,
+ Opt_flushoncommit,
Opt_max_inline,
- Opt_barrier, Opt_nobarrier,
- Opt_datacow, Opt_nodatacow,
- Opt_datasum, Opt_nodatasum,
- Opt_defrag, Opt_nodefrag,
- Opt_discard, Opt_nodiscard,
+ Opt_barrier,
+ Opt_datacow,
+ Opt_datasum,
+ Opt_defrag,
+ Opt_discard,
Opt_discard_mode,
- Opt_norecovery,
Opt_ratio,
Opt_rescan_uuid_tree,
Opt_skip_balance,
- Opt_space_cache, Opt_no_space_cache,
+ Opt_space_cache,
Opt_space_cache_version,
- Opt_ssd, Opt_nossd,
- Opt_ssd_spread, Opt_nossd_spread,
+ Opt_ssd,
+ Opt_ssd_spread,
Opt_subvol,
Opt_subvol_empty,
Opt_subvolid,
Opt_thread_pool,
- Opt_treelog, Opt_notreelog,
+ Opt_treelog,
Opt_user_subvol_rm_allowed,
/* Rescue options */
@@ -128,14 +130,10 @@ enum {
Opt_ignoredatacsums,
Opt_rescue_all,
- /* Deprecated options */
- Opt_recovery,
- Opt_inode_cache, Opt_noinode_cache,
-
/* Debugging options */
- Opt_enospc_debug, Opt_noenospc_debug,
+ Opt_enospc_debug,
#ifdef CONFIG_BTRFS_DEBUG
- Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
+ Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
Opt_ref_verify,
@@ -143,785 +141,611 @@ enum {
Opt_err,
};
-static const match_table_t tokens = {
- {Opt_acl, "acl"},
- {Opt_noacl, "noacl"},
- {Opt_clear_cache, "clear_cache"},
- {Opt_commit_interval, "commit=%u"},
- {Opt_compress, "compress"},
- {Opt_compress_type, "compress=%s"},
- {Opt_compress_force, "compress-force"},
- {Opt_compress_force_type, "compress-force=%s"},
- {Opt_degraded, "degraded"},
- {Opt_device, "device=%s"},
- {Opt_fatal_errors, "fatal_errors=%s"},
- {Opt_flushoncommit, "flushoncommit"},
- {Opt_noflushoncommit, "noflushoncommit"},
- {Opt_inode_cache, "inode_cache"},
- {Opt_noinode_cache, "noinode_cache"},
- {Opt_max_inline, "max_inline=%s"},
- {Opt_barrier, "barrier"},
- {Opt_nobarrier, "nobarrier"},
- {Opt_datacow, "datacow"},
- {Opt_nodatacow, "nodatacow"},
- {Opt_datasum, "datasum"},
- {Opt_nodatasum, "nodatasum"},
- {Opt_defrag, "autodefrag"},
- {Opt_nodefrag, "noautodefrag"},
- {Opt_discard, "discard"},
- {Opt_discard_mode, "discard=%s"},
- {Opt_nodiscard, "nodiscard"},
- {Opt_norecovery, "norecovery"},
- {Opt_ratio, "metadata_ratio=%u"},
- {Opt_rescan_uuid_tree, "rescan_uuid_tree"},
- {Opt_skip_balance, "skip_balance"},
- {Opt_space_cache, "space_cache"},
- {Opt_no_space_cache, "nospace_cache"},
- {Opt_space_cache_version, "space_cache=%s"},
- {Opt_ssd, "ssd"},
- {Opt_nossd, "nossd"},
- {Opt_ssd_spread, "ssd_spread"},
- {Opt_nossd_spread, "nossd_spread"},
- {Opt_subvol, "subvol=%s"},
- {Opt_subvol_empty, "subvol="},
- {Opt_subvolid, "subvolid=%s"},
- {Opt_thread_pool, "thread_pool=%u"},
- {Opt_treelog, "treelog"},
- {Opt_notreelog, "notreelog"},
- {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
+enum {
+ Opt_fatal_errors_panic,
+ Opt_fatal_errors_bug,
+};
- /* Rescue options */
- {Opt_rescue, "rescue=%s"},
+static const struct constant_table btrfs_parameter_fatal_errors[] = {
+ { "panic", Opt_fatal_errors_panic },
+ { "bug", Opt_fatal_errors_bug },
+ {}
+};
+
+enum {
+ Opt_discard_sync,
+ Opt_discard_async,
+};
+
+static const struct constant_table btrfs_parameter_discard[] = {
+ { "sync", Opt_discard_sync },
+ { "async", Opt_discard_async },
+ {}
+};
+
+enum {
+ Opt_space_cache_v1,
+ Opt_space_cache_v2,
+};
+
+static const struct constant_table btrfs_parameter_space_cache[] = {
+ { "v1", Opt_space_cache_v1 },
+ { "v2", Opt_space_cache_v2 },
+ {}
+};
+
+enum {
+ Opt_rescue_usebackuproot,
+ Opt_rescue_nologreplay,
+ Opt_rescue_ignorebadroots,
+ Opt_rescue_ignoredatacsums,
+ Opt_rescue_parameter_all,
+};
+
+static const struct constant_table btrfs_parameter_rescue[] = {
+ { "usebackuproot", Opt_rescue_usebackuproot },
+ { "nologreplay", Opt_rescue_nologreplay },
+ { "ignorebadroots", Opt_rescue_ignorebadroots },
+ { "ibadroots", Opt_rescue_ignorebadroots },
+ { "ignoredatacsums", Opt_rescue_ignoredatacsums },
+ { "idatacsums", Opt_rescue_ignoredatacsums },
+ { "all", Opt_rescue_parameter_all },
+ {}
+};
+
+#ifdef CONFIG_BTRFS_DEBUG
+enum {
+ Opt_fragment_parameter_data,
+ Opt_fragment_parameter_metadata,
+ Opt_fragment_parameter_all,
+};
+
+static const struct constant_table btrfs_parameter_fragment[] = {
+ { "data", Opt_fragment_parameter_data },
+ { "metadata", Opt_fragment_parameter_metadata },
+ { "all", Opt_fragment_parameter_all },
+ {}
+};
+#endif
+
+static const struct fs_parameter_spec btrfs_fs_parameters[] = {
+ fsparam_flag_no("acl", Opt_acl),
+ fsparam_flag_no("autodefrag", Opt_defrag),
+ fsparam_flag_no("barrier", Opt_barrier),
+ fsparam_flag("clear_cache", Opt_clear_cache),
+ fsparam_u32("commit", Opt_commit_interval),
+ fsparam_flag("compress", Opt_compress),
+ fsparam_string("compress", Opt_compress_type),
+ fsparam_flag("compress-force", Opt_compress_force),
+ fsparam_string("compress-force", Opt_compress_force_type),
+ fsparam_flag_no("datacow", Opt_datacow),
+ fsparam_flag_no("datasum", Opt_datasum),
+ fsparam_flag("degraded", Opt_degraded),
+ fsparam_string("device", Opt_device),
+ fsparam_flag_no("discard", Opt_discard),
+ fsparam_enum("discard", Opt_discard_mode, btrfs_parameter_discard),
+ fsparam_enum("fatal_errors", Opt_fatal_errors, btrfs_parameter_fatal_errors),
+ fsparam_flag_no("flushoncommit", Opt_flushoncommit),
+ fsparam_string("max_inline", Opt_max_inline),
+ fsparam_u32("metadata_ratio", Opt_ratio),
+ fsparam_flag("rescan_uuid_tree", Opt_rescan_uuid_tree),
+ fsparam_flag("skip_balance", Opt_skip_balance),
+ fsparam_flag_no("space_cache", Opt_space_cache),
+ fsparam_enum("space_cache", Opt_space_cache_version, btrfs_parameter_space_cache),
+ fsparam_flag_no("ssd", Opt_ssd),
+ fsparam_flag_no("ssd_spread", Opt_ssd_spread),
+ fsparam_string("subvol", Opt_subvol),
+ fsparam_flag("subvol=", Opt_subvol_empty),
+ fsparam_u64("subvolid", Opt_subvolid),
+ fsparam_u32("thread_pool", Opt_thread_pool),
+ fsparam_flag_no("treelog", Opt_treelog),
+ fsparam_flag("user_subvol_rm_allowed", Opt_user_subvol_rm_allowed),
+
+ /* Rescue options. */
+ fsparam_enum("rescue", Opt_rescue, btrfs_parameter_rescue),
/* Deprecated, with alias rescue=nologreplay */
- {Opt_nologreplay, "nologreplay"},
+ __fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL),
/* Deprecated, with alias rescue=usebackuproot */
- {Opt_usebackuproot, "usebackuproot"},
+ __fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL),
- /* Deprecated options */
- {Opt_recovery, "recovery"},
-
- /* Debugging options */
- {Opt_enospc_debug, "enospc_debug"},
- {Opt_noenospc_debug, "noenospc_debug"},
+ /* Debugging options. */
+ fsparam_flag_no("enospc_debug", Opt_enospc_debug),
#ifdef CONFIG_BTRFS_DEBUG
- {Opt_fragment_data, "fragment=data"},
- {Opt_fragment_metadata, "fragment=metadata"},
- {Opt_fragment_all, "fragment=all"},
+ fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment),
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- {Opt_ref_verify, "ref_verify"},
+ fsparam_flag("ref_verify", Opt_ref_verify),
#endif
- {Opt_err, NULL},
+ {}
};
-static const match_table_t rescue_tokens = {
- {Opt_usebackuproot, "usebackuproot"},
- {Opt_nologreplay, "nologreplay"},
- {Opt_ignorebadroots, "ignorebadroots"},
- {Opt_ignorebadroots, "ibadroots"},
- {Opt_ignoredatacsums, "ignoredatacsums"},
- {Opt_ignoredatacsums, "idatacsums"},
- {Opt_rescue_all, "all"},
- {Opt_err, NULL},
-};
-
-static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt,
- const char *opt_name)
+/* No support for restricting writes to btrfs devices yet... */
+static inline blk_mode_t btrfs_open_mode(struct fs_context *fc)
{
- if (fs_info->mount_opt & opt) {
- btrfs_err(fs_info, "%s must be used with ro mount option",
- opt_name);
- return true;
- }
- return false;
+ return sb_open_mode(fc->sb_flags) & ~BLK_OPEN_RESTRICT_WRITES;
}
-static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
+static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *opts;
- char *orig;
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int ret = 0;
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
- opts = kstrdup(options, GFP_KERNEL);
- if (!opts)
- return -ENOMEM;
- orig = opts;
+ opt = fs_parse(fc, btrfs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
- while ((p = strsep(&opts, ":")) != NULL) {
- int token;
+ switch (opt) {
+ case Opt_degraded:
+ btrfs_set_opt(ctx->mount_opt, DEGRADED);
+ break;
+ case Opt_subvol_empty:
+ /*
+ * This exists because we used to allow it on accident, so we're
+ * keeping it to maintain ABI. See 37becec95ac3 ("Btrfs: allow
+ * empty subvol= again").
+ */
+ break;
+ case Opt_subvol:
+ kfree(ctx->subvol_name);
+ ctx->subvol_name = kstrdup(param->string, GFP_KERNEL);
+ if (!ctx->subvol_name)
+ return -ENOMEM;
+ break;
+ case Opt_subvolid:
+ ctx->subvol_objectid = result.uint_64;
- if (!*p)
- continue;
- token = match_token(p, rescue_tokens, args);
- switch (token){
- case Opt_usebackuproot:
- btrfs_info(info,
- "trying to use backup root at mount time");
- btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
- break;
- case Opt_nologreplay:
- btrfs_set_and_info(info, NOLOGREPLAY,
- "disabling log replay at mount time");
- break;
- case Opt_ignorebadroots:
- btrfs_set_and_info(info, IGNOREBADROOTS,
- "ignoring bad roots");
- break;
- case Opt_ignoredatacsums:
- btrfs_set_and_info(info, IGNOREDATACSUMS,
- "ignoring data csums");
- break;
- case Opt_rescue_all:
- btrfs_info(info, "enabling all of the rescue options");
- btrfs_set_and_info(info, IGNOREDATACSUMS,
- "ignoring data csums");
- btrfs_set_and_info(info, IGNOREBADROOTS,
- "ignoring bad roots");
- btrfs_set_and_info(info, NOLOGREPLAY,
- "disabling log replay at mount time");
- break;
- case Opt_err:
- btrfs_info(info, "unrecognized rescue option '%s'", p);
- ret = -EINVAL;
- goto out;
- default:
- break;
- }
+ /* subvolid=0 means give me the original fs_tree. */
+ if (!ctx->subvol_objectid)
+ ctx->subvol_objectid = BTRFS_FS_TREE_OBJECTID;
+ break;
+ case Opt_device: {
+ struct btrfs_device *device;
+ blk_mode_t mode = btrfs_open_mode(fc);
+ mutex_lock(&uuid_mutex);
+ device = btrfs_scan_one_device(param->string, mode, false);
+ mutex_unlock(&uuid_mutex);
+ if (IS_ERR(device))
+ return PTR_ERR(device);
+ break;
}
-out:
- kfree(orig);
- return ret;
-}
-
-/*
- * Regular mount options parser. Everything that is needed only when
- * reading in a new superblock is parsed here.
- * XXX JDM: This needs to be cleaned up for remount.
- */
-int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
- unsigned long new_flags)
-{
- substring_t args[MAX_OPT_ARGS];
- char *p, *num;
- int intarg;
- int ret = 0;
- char *compress_type;
- bool compress_force = false;
- enum btrfs_compression_type saved_compress_type;
- int saved_compress_level;
- bool saved_compress_force;
- int no_compress = 0;
- const bool remounting = test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state);
-
- if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
- btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
- else if (btrfs_free_space_cache_v1_active(info)) {
- if (btrfs_is_zoned(info)) {
- btrfs_info(info,
- "zoned: clearing existing space cache");
- btrfs_set_super_cache_generation(info->super_copy, 0);
+ case Opt_datasum:
+ if (result.negated) {
+ btrfs_set_opt(ctx->mount_opt, NODATASUM);
} else {
- btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
}
- }
-
- /*
- * Even the options are empty, we still need to do extra check
- * against new flags
- */
- if (!options)
- goto check;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_degraded:
- btrfs_info(info, "allowing degraded mounts");
- btrfs_set_opt(info->mount_opt, DEGRADED);
- break;
- case Opt_subvol:
- case Opt_subvol_empty:
- case Opt_subvolid:
- case Opt_device:
- /*
- * These are parsed by btrfs_parse_subvol_options or
- * btrfs_parse_device_options and can be ignored here.
- */
- break;
- case Opt_nodatasum:
- btrfs_set_and_info(info, NODATASUM,
- "setting nodatasum");
- break;
- case Opt_datasum:
- if (btrfs_test_opt(info, NODATASUM)) {
- if (btrfs_test_opt(info, NODATACOW))
- btrfs_info(info,
- "setting datasum, datacow enabled");
- else
- btrfs_info(info, "setting datasum");
- }
- btrfs_clear_opt(info->mount_opt, NODATACOW);
- btrfs_clear_opt(info->mount_opt, NODATASUM);
- break;
- case Opt_nodatacow:
- if (!btrfs_test_opt(info, NODATACOW)) {
- if (!btrfs_test_opt(info, COMPRESS) ||
- !btrfs_test_opt(info, FORCE_COMPRESS)) {
- btrfs_info(info,
- "setting nodatacow, compression disabled");
- } else {
- btrfs_info(info, "setting nodatacow");
- }
- }
- btrfs_clear_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
- btrfs_set_opt(info->mount_opt, NODATACOW);
- btrfs_set_opt(info->mount_opt, NODATASUM);
- break;
- case Opt_datacow:
- btrfs_clear_and_info(info, NODATACOW,
- "setting datacow");
- break;
- case Opt_compress_force:
- case Opt_compress_force_type:
- compress_force = true;
- fallthrough;
- case Opt_compress:
- case Opt_compress_type:
- saved_compress_type = btrfs_test_opt(info,
- COMPRESS) ?
- info->compress_type : BTRFS_COMPRESS_NONE;
- saved_compress_force =
- btrfs_test_opt(info, FORCE_COMPRESS);
- saved_compress_level = info->compress_level;
- if (token == Opt_compress ||
- token == Opt_compress_force ||
- strncmp(args[0].from, "zlib", 4) == 0) {
- compress_type = "zlib";
-
- info->compress_type = BTRFS_COMPRESS_ZLIB;
- info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
- /*
- * args[0] contains uninitialized data since
- * for these tokens we don't expect any
- * parameter.
- */
- if (token != Opt_compress &&
- token != Opt_compress_force)
- info->compress_level =
- btrfs_compress_str2level(
- BTRFS_COMPRESS_ZLIB,
- args[0].from + 4);
- btrfs_set_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, NODATACOW);
- btrfs_clear_opt(info->mount_opt, NODATASUM);
- no_compress = 0;
- } else if (strncmp(args[0].from, "lzo", 3) == 0) {
- compress_type = "lzo";
- info->compress_type = BTRFS_COMPRESS_LZO;
- info->compress_level = 0;
- btrfs_set_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, NODATACOW);
- btrfs_clear_opt(info->mount_opt, NODATASUM);
- btrfs_set_fs_incompat(info, COMPRESS_LZO);
- no_compress = 0;
- } else if (strncmp(args[0].from, "zstd", 4) == 0) {
- compress_type = "zstd";
- info->compress_type = BTRFS_COMPRESS_ZSTD;
- info->compress_level =
- btrfs_compress_str2level(
- BTRFS_COMPRESS_ZSTD,
- args[0].from + 4);
- btrfs_set_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, NODATACOW);
- btrfs_clear_opt(info->mount_opt, NODATASUM);
- btrfs_set_fs_incompat(info, COMPRESS_ZSTD);
- no_compress = 0;
- } else if (strncmp(args[0].from, "no", 2) == 0) {
- compress_type = "no";
- info->compress_level = 0;
- info->compress_type = 0;
- btrfs_clear_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
- compress_force = false;
- no_compress++;
- } else {
- btrfs_err(info, "unrecognized compression value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
-
- if (compress_force) {
- btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
- } else {
- /*
- * If we remount from compress-force=xxx to
- * compress=xxx, we need clear FORCE_COMPRESS
- * flag, otherwise, there is no way for users
- * to disable forcible compression separately.
- */
- btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
- }
- if (no_compress == 1) {
- btrfs_info(info, "use no compression");
- } else if ((info->compress_type != saved_compress_type) ||
- (compress_force != saved_compress_force) ||
- (info->compress_level != saved_compress_level)) {
- btrfs_info(info, "%s %s compression, level %d",
- (compress_force) ? "force" : "use",
- compress_type, info->compress_level);
- }
- compress_force = false;
- break;
- case Opt_ssd:
- btrfs_set_and_info(info, SSD,
- "enabling ssd optimizations");
- btrfs_clear_opt(info->mount_opt, NOSSD);
- break;
- case Opt_ssd_spread:
- btrfs_set_and_info(info, SSD,
- "enabling ssd optimizations");
- btrfs_set_and_info(info, SSD_SPREAD,
- "using spread ssd allocation scheme");
- btrfs_clear_opt(info->mount_opt, NOSSD);
- break;
- case Opt_nossd:
- btrfs_set_opt(info->mount_opt, NOSSD);
- btrfs_clear_and_info(info, SSD,
- "not using ssd optimizations");
- fallthrough;
- case Opt_nossd_spread:
- btrfs_clear_and_info(info, SSD_SPREAD,
- "not using spread ssd allocation scheme");
- break;
- case Opt_barrier:
- btrfs_clear_and_info(info, NOBARRIER,
- "turning on barriers");
- break;
- case Opt_nobarrier:
- btrfs_set_and_info(info, NOBARRIER,
- "turning off barriers");
- break;
- case Opt_thread_pool:
- ret = match_int(&args[0], &intarg);
- if (ret) {
- btrfs_err(info, "unrecognized thread_pool value %s",
- args[0].from);
- goto out;
- } else if (intarg == 0) {
- btrfs_err(info, "invalid value 0 for thread_pool");
- ret = -EINVAL;
- goto out;
- }
- info->thread_pool_size = intarg;
- break;
- case Opt_max_inline:
- num = match_strdup(&args[0]);
- if (num) {
- info->max_inline = memparse(num, NULL);
- kfree(num);
-
- if (info->max_inline) {
- info->max_inline = min_t(u64,
- info->max_inline,
- info->sectorsize);
- }
- btrfs_info(info, "max_inline at %llu",
- info->max_inline);
- } else {
- ret = -ENOMEM;
- goto out;
- }
- break;
- case Opt_acl:
+ break;
+ case Opt_datacow:
+ if (result.negated) {
+ btrfs_clear_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+ btrfs_set_opt(ctx->mount_opt, NODATACOW);
+ btrfs_set_opt(ctx->mount_opt, NODATASUM);
+ } else {
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ }
+ break;
+ case Opt_compress_force:
+ case Opt_compress_force_type:
+ btrfs_set_opt(ctx->mount_opt, FORCE_COMPRESS);
+ fallthrough;
+ case Opt_compress:
+ case Opt_compress_type:
+ if (opt == Opt_compress || opt == Opt_compress_force) {
+ ctx->compress_type = BTRFS_COMPRESS_ZLIB;
+ ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (strncmp(param->string, "zlib", 4) == 0) {
+ ctx->compress_type = BTRFS_COMPRESS_ZLIB;
+ ctx->compress_level =
+ btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB,
+ param->string + 4);
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (strncmp(param->string, "lzo", 3) == 0) {
+ ctx->compress_type = BTRFS_COMPRESS_LZO;
+ ctx->compress_level = 0;
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (strncmp(param->string, "zstd", 4) == 0) {
+ ctx->compress_type = BTRFS_COMPRESS_ZSTD;
+ ctx->compress_level =
+ btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD,
+ param->string + 4);
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (strncmp(param->string, "no", 2) == 0) {
+ ctx->compress_level = 0;
+ ctx->compress_type = 0;
+ btrfs_clear_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+ } else {
+ btrfs_err(NULL, "unrecognized compression value %s",
+ param->string);
+ return -EINVAL;
+ }
+ break;
+ case Opt_ssd:
+ if (result.negated) {
+ btrfs_set_opt(ctx->mount_opt, NOSSD);
+ btrfs_clear_opt(ctx->mount_opt, SSD);
+ btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD);
+ } else {
+ btrfs_set_opt(ctx->mount_opt, SSD);
+ btrfs_clear_opt(ctx->mount_opt, NOSSD);
+ }
+ break;
+ case Opt_ssd_spread:
+ if (result.negated) {
+ btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD);
+ } else {
+ btrfs_set_opt(ctx->mount_opt, SSD);
+ btrfs_set_opt(ctx->mount_opt, SSD_SPREAD);
+ btrfs_clear_opt(ctx->mount_opt, NOSSD);
+ }
+ break;
+ case Opt_barrier:
+ if (result.negated)
+ btrfs_set_opt(ctx->mount_opt, NOBARRIER);
+ else
+ btrfs_clear_opt(ctx->mount_opt, NOBARRIER);
+ break;
+ case Opt_thread_pool:
+ if (result.uint_32 == 0) {
+ btrfs_err(NULL, "invalid value 0 for thread_pool");
+ return -EINVAL;
+ }
+ ctx->thread_pool_size = result.uint_32;
+ break;
+ case Opt_max_inline:
+ ctx->max_inline = memparse(param->string, NULL);
+ break;
+ case Opt_acl:
+ if (result.negated) {
+ fc->sb_flags &= ~SB_POSIXACL;
+ } else {
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
- info->sb->s_flags |= SB_POSIXACL;
- break;
+ fc->sb_flags |= SB_POSIXACL;
#else
- btrfs_err(info, "support for ACL not compiled in!");
- ret = -EINVAL;
- goto out;
+ btrfs_err(NULL, "support for ACL not compiled in");
+ return -EINVAL;
#endif
- case Opt_noacl:
- info->sb->s_flags &= ~SB_POSIXACL;
- break;
- case Opt_notreelog:
- btrfs_set_and_info(info, NOTREELOG,
- "disabling tree log");
- break;
- case Opt_treelog:
- btrfs_clear_and_info(info, NOTREELOG,
- "enabling tree log");
- break;
- case Opt_norecovery:
- case Opt_nologreplay:
- btrfs_warn(info,
+ }
+ /*
+ * VFS limits the ability to toggle ACL on and off via remount,
+ * despite every file system allowing this. This seems to be
+ * an oversight since we all do, but it'll fail if we're
+ * remounting. So don't set the mask here, we'll check it in
+ * btrfs_reconfigure and do the toggling ourselves.
+ */
+ if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE)
+ fc->sb_flags_mask |= SB_POSIXACL;
+ break;
+ case Opt_treelog:
+ if (result.negated)
+ btrfs_set_opt(ctx->mount_opt, NOTREELOG);
+ else
+ btrfs_clear_opt(ctx->mount_opt, NOTREELOG);
+ break;
+ case Opt_nologreplay:
+ btrfs_warn(NULL,
"'nologreplay' is deprecated, use 'rescue=nologreplay' instead");
- btrfs_set_and_info(info, NOLOGREPLAY,
- "disabling log replay at mount time");
- break;
- case Opt_flushoncommit:
- btrfs_set_and_info(info, FLUSHONCOMMIT,
- "turning on flush-on-commit");
- break;
- case Opt_noflushoncommit:
- btrfs_clear_and_info(info, FLUSHONCOMMIT,
- "turning off flush-on-commit");
- break;
- case Opt_ratio:
- ret = match_int(&args[0], &intarg);
- if (ret) {
- btrfs_err(info, "unrecognized metadata_ratio value %s",
- args[0].from);
- goto out;
- }
- info->metadata_ratio = intarg;
- btrfs_info(info, "metadata ratio %u",
- info->metadata_ratio);
- break;
- case Opt_discard:
- case Opt_discard_mode:
- if (token == Opt_discard ||
- strcmp(args[0].from, "sync") == 0) {
- btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC);
- btrfs_set_and_info(info, DISCARD_SYNC,
- "turning on sync discard");
- } else if (strcmp(args[0].from, "async") == 0) {
- btrfs_clear_opt(info->mount_opt, DISCARD_SYNC);
- btrfs_set_and_info(info, DISCARD_ASYNC,
- "turning on async discard");
- } else {
- btrfs_err(info, "unrecognized discard mode value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
- btrfs_clear_opt(info->mount_opt, NODISCARD);
- break;
- case Opt_nodiscard:
- btrfs_clear_and_info(info, DISCARD_SYNC,
- "turning off discard");
- btrfs_clear_and_info(info, DISCARD_ASYNC,
- "turning off async discard");
- btrfs_set_opt(info->mount_opt, NODISCARD);
- break;
- case Opt_space_cache:
- case Opt_space_cache_version:
- /*
- * We already set FREE_SPACE_TREE above because we have
- * compat_ro(FREE_SPACE_TREE) set, and we aren't going
- * to allow v1 to be set for extent tree v2, simply
- * ignore this setting if we're extent tree v2.
- */
- if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
- break;
- if (token == Opt_space_cache ||
- strcmp(args[0].from, "v1") == 0) {
- btrfs_clear_opt(info->mount_opt,
- FREE_SPACE_TREE);
- btrfs_set_and_info(info, SPACE_CACHE,
- "enabling disk space caching");
- } else if (strcmp(args[0].from, "v2") == 0) {
- btrfs_clear_opt(info->mount_opt,
- SPACE_CACHE);
- btrfs_set_and_info(info, FREE_SPACE_TREE,
- "enabling free space tree");
- } else {
- btrfs_err(info, "unrecognized space_cache value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
- break;
- case Opt_rescan_uuid_tree:
- btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
- break;
- case Opt_no_space_cache:
- /*
- * We cannot operate without the free space tree with
- * extent tree v2, ignore this option.
- */
- if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
- break;
- if (btrfs_test_opt(info, SPACE_CACHE)) {
- btrfs_clear_and_info(info, SPACE_CACHE,
- "disabling disk space caching");
- }
- if (btrfs_test_opt(info, FREE_SPACE_TREE)) {
- btrfs_clear_and_info(info, FREE_SPACE_TREE,
- "disabling free space tree");
- }
- break;
- case Opt_inode_cache:
- case Opt_noinode_cache:
- btrfs_warn(info,
- "the 'inode_cache' option is deprecated and has no effect since 5.11");
- break;
- case Opt_clear_cache:
- /*
- * We cannot clear the free space tree with extent tree
- * v2, ignore this option.
- */
- if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
- break;
- btrfs_set_and_info(info, CLEAR_CACHE,
- "force clearing of disk cache");
+ btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
+ break;
+ case Opt_flushoncommit:
+ if (result.negated)
+ btrfs_clear_opt(ctx->mount_opt, FLUSHONCOMMIT);
+ else
+ btrfs_set_opt(ctx->mount_opt, FLUSHONCOMMIT);
+ break;
+ case Opt_ratio:
+ ctx->metadata_ratio = result.uint_32;
+ break;
+ case Opt_discard:
+ if (result.negated) {
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC);
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
+ btrfs_set_opt(ctx->mount_opt, NODISCARD);
+ } else {
+ btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC);
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
+ }
+ break;
+ case Opt_discard_mode:
+ switch (result.uint_32) {
+ case Opt_discard_sync:
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
+ btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC);
break;
- case Opt_user_subvol_rm_allowed:
- btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
+ case Opt_discard_async:
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC);
+ btrfs_set_opt(ctx->mount_opt, DISCARD_ASYNC);
break;
- case Opt_enospc_debug:
- btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
+ default:
+ btrfs_err(NULL, "unrecognized discard mode value %s",
+ param->key);
+ return -EINVAL;
+ }
+ btrfs_clear_opt(ctx->mount_opt, NODISCARD);
+ break;
+ case Opt_space_cache:
+ if (result.negated) {
+ btrfs_set_opt(ctx->mount_opt, NOSPACECACHE);
+ btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE);
+ btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
+ } else {
+ btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
+ btrfs_set_opt(ctx->mount_opt, SPACE_CACHE);
+ }
+ break;
+ case Opt_space_cache_version:
+ switch (result.uint_32) {
+ case Opt_space_cache_v1:
+ btrfs_set_opt(ctx->mount_opt, SPACE_CACHE);
+ btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
break;
- case Opt_noenospc_debug:
- btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
+ case Opt_space_cache_v2:
+ btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE);
+ btrfs_set_opt(ctx->mount_opt, FREE_SPACE_TREE);
break;
- case Opt_defrag:
- btrfs_set_and_info(info, AUTO_DEFRAG,
- "enabling auto defrag");
+ default:
+ btrfs_err(NULL, "unrecognized space_cache value %s",
+ param->key);
+ return -EINVAL;
+ }
+ break;
+ case Opt_rescan_uuid_tree:
+ btrfs_set_opt(ctx->mount_opt, RESCAN_UUID_TREE);
+ break;
+ case Opt_clear_cache:
+ btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE);
+ break;
+ case Opt_user_subvol_rm_allowed:
+ btrfs_set_opt(ctx->mount_opt, USER_SUBVOL_RM_ALLOWED);
+ break;
+ case Opt_enospc_debug:
+ if (result.negated)
+ btrfs_clear_opt(ctx->mount_opt, ENOSPC_DEBUG);
+ else
+ btrfs_set_opt(ctx->mount_opt, ENOSPC_DEBUG);
+ break;
+ case Opt_defrag:
+ if (result.negated)
+ btrfs_clear_opt(ctx->mount_opt, AUTO_DEFRAG);
+ else
+ btrfs_set_opt(ctx->mount_opt, AUTO_DEFRAG);
+ break;
+ case Opt_usebackuproot:
+ btrfs_warn(NULL,
+ "'usebackuproot' is deprecated, use 'rescue=usebackuproot' instead");
+ btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT);
+
+ /* If we're loading the backup roots we can't trust the space cache. */
+ btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE);
+ break;
+ case Opt_skip_balance:
+ btrfs_set_opt(ctx->mount_opt, SKIP_BALANCE);
+ break;
+ case Opt_fatal_errors:
+ switch (result.uint_32) {
+ case Opt_fatal_errors_panic:
+ btrfs_set_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR);
break;
- case Opt_nodefrag:
- btrfs_clear_and_info(info, AUTO_DEFRAG,
- "disabling auto defrag");
+ case Opt_fatal_errors_bug:
+ btrfs_clear_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR);
break;
- case Opt_recovery:
- case Opt_usebackuproot:
- btrfs_warn(info,
- "'%s' is deprecated, use 'rescue=usebackuproot' instead",
- token == Opt_recovery ? "recovery" :
- "usebackuproot");
- btrfs_info(info,
- "trying to use backup root at mount time");
- btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
+ default:
+ btrfs_err(NULL, "unrecognized fatal_errors value %s",
+ param->key);
+ return -EINVAL;
+ }
+ break;
+ case Opt_commit_interval:
+ ctx->commit_interval = result.uint_32;
+ if (ctx->commit_interval == 0)
+ ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+ break;
+ case Opt_rescue:
+ switch (result.uint_32) {
+ case Opt_rescue_usebackuproot:
+ btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT);
break;
- case Opt_skip_balance:
- btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+ case Opt_rescue_nologreplay:
+ btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
break;
- case Opt_fatal_errors:
- if (strcmp(args[0].from, "panic") == 0) {
- btrfs_set_opt(info->mount_opt,
- PANIC_ON_FATAL_ERROR);
- } else if (strcmp(args[0].from, "bug") == 0) {
- btrfs_clear_opt(info->mount_opt,
- PANIC_ON_FATAL_ERROR);
- } else {
- btrfs_err(info, "unrecognized fatal_errors value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
+ case Opt_rescue_ignorebadroots:
+ btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS);
break;
- case Opt_commit_interval:
- intarg = 0;
- ret = match_int(&args[0], &intarg);
- if (ret) {
- btrfs_err(info, "unrecognized commit_interval value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
- if (intarg == 0) {
- btrfs_info(info,
- "using default commit interval %us",
- BTRFS_DEFAULT_COMMIT_INTERVAL);
- intarg = BTRFS_DEFAULT_COMMIT_INTERVAL;
- } else if (intarg > 300) {
- btrfs_warn(info, "excessive commit interval %d",
- intarg);
- }
- info->commit_interval = intarg;
+ case Opt_rescue_ignoredatacsums:
+ btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS);
break;
- case Opt_rescue:
- ret = parse_rescue_options(info, args[0].from);
- if (ret < 0) {
- btrfs_err(info, "unrecognized rescue value %s",
- args[0].from);
- goto out;
- }
+ case Opt_rescue_parameter_all:
+ btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS);
+ btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS);
+ btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
break;
+ default:
+ btrfs_info(NULL, "unrecognized rescue option '%s'",
+ param->key);
+ return -EINVAL;
+ }
+ break;
#ifdef CONFIG_BTRFS_DEBUG
- case Opt_fragment_all:
- btrfs_info(info, "fragmenting all space");
- btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
- btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
+ case Opt_fragment:
+ switch (result.uint_32) {
+ case Opt_fragment_parameter_all:
+ btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA);
+ btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA);
break;
- case Opt_fragment_metadata:
- btrfs_info(info, "fragmenting metadata");
- btrfs_set_opt(info->mount_opt,
- FRAGMENT_METADATA);
+ case Opt_fragment_parameter_metadata:
+ btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA);
break;
- case Opt_fragment_data:
- btrfs_info(info, "fragmenting data");
- btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
+ case Opt_fragment_parameter_data:
+ btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA);
break;
+ default:
+ btrfs_info(NULL, "unrecognized fragment option '%s'",
+ param->key);
+ return -EINVAL;
+ }
+ break;
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- case Opt_ref_verify:
- btrfs_info(info, "doing ref verification");
- btrfs_set_opt(info->mount_opt, REF_VERIFY);
- break;
+ case Opt_ref_verify:
+ btrfs_set_opt(ctx->mount_opt, REF_VERIFY);
+ break;
#endif
- case Opt_err:
- btrfs_err(info, "unrecognized mount option '%s'", p);
- ret = -EINVAL;
- goto out;
- default:
- break;
- }
+ default:
+ btrfs_err(NULL, "unrecognized mount option '%s'", param->key);
+ return -EINVAL;
}
-check:
- /* We're read-only, don't have to check. */
- if (new_flags & SB_RDONLY)
- goto out;
- if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
- check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
- check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums"))
- ret = -EINVAL;
-out:
- if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
- !btrfs_test_opt(info, FREE_SPACE_TREE) &&
- !btrfs_test_opt(info, CLEAR_CACHE)) {
- btrfs_err(info, "cannot disable free space tree");
- ret = -EINVAL;
- }
- if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) &&
- !btrfs_test_opt(info, FREE_SPACE_TREE)) {
- btrfs_err(info, "cannot disable free space tree with block-group-tree feature");
- ret = -EINVAL;
- }
- if (!ret)
- ret = btrfs_check_mountopts_zoned(info);
- if (!ret && !remounting) {
- if (btrfs_test_opt(info, SPACE_CACHE))
- btrfs_info(info, "disk space caching is enabled");
- if (btrfs_test_opt(info, FREE_SPACE_TREE))
- btrfs_info(info, "using free space tree");
- }
- return ret;
+ return 0;
}
/*
- * Parse mount options that are required early in the mount process.
- *
- * All other options will be parsed on much later in the mount process and
- * only when we need to allocate a new super block.
+ * Some options only have meaning at mount time and shouldn't persist across
+ * remounts, or be displayed. Clear these at the end of mount and remount code
+ * paths.
*/
-static int btrfs_parse_device_options(const char *options, blk_mode_t flags)
+static void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
{
- substring_t args[MAX_OPT_ARGS];
- char *device_name, *opts, *orig, *p;
- struct btrfs_device *device = NULL;
- int error = 0;
+ btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
+ btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
+ btrfs_clear_opt(fs_info->mount_opt, NOSPACECACHE);
+}
- lockdep_assert_held(&uuid_mutex);
+static bool check_ro_option(struct btrfs_fs_info *fs_info,
+ unsigned long mount_opt, unsigned long opt,
+ const char *opt_name)
+{
+ if (mount_opt & opt) {
+ btrfs_err(fs_info, "%s must be used with ro mount option",
+ opt_name);
+ return true;
+ }
+ return false;
+}
- if (!options)
- return 0;
+bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt,
+ unsigned long flags)
+{
+ bool ret = true;
- /*
- * strsep changes the string, duplicate it because btrfs_parse_options
- * gets called later
- */
- opts = kstrdup(options, GFP_KERNEL);
- if (!opts)
- return -ENOMEM;
- orig = opts;
+ if (!(flags & SB_RDONLY) &&
+ (check_ro_option(info, *mount_opt, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
+ check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
+ check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums")))
+ ret = false;
- while ((p = strsep(&opts, ",")) != NULL) {
- int token;
+ if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
+ !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE) &&
+ !btrfs_raw_test_opt(*mount_opt, CLEAR_CACHE)) {
+ btrfs_err(info, "cannot disable free-space-tree");
+ ret = false;
+ }
+ if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) &&
+ !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) {
+ btrfs_err(info, "cannot disable free-space-tree with block-group-tree feature");
+ ret = false;
+ }
- if (!*p)
- continue;
+ if (btrfs_check_mountopts_zoned(info, mount_opt))
+ ret = false;
- token = match_token(p, tokens, args);
- if (token == Opt_device) {
- device_name = match_strdup(&args[0]);
- if (!device_name) {
- error = -ENOMEM;
- goto out;
- }
- device = btrfs_scan_one_device(device_name, flags, false);
- kfree(device_name);
- if (IS_ERR(device)) {
- error = PTR_ERR(device);
- goto out;
- }
- }
+ if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) {
+ if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE))
+ btrfs_info(info, "disk space caching is enabled");
+ if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE))
+ btrfs_info(info, "using free-space-tree");
}
-out:
- kfree(orig);
- return error;
+ return ret;
}
/*
- * Parse mount options that are related to subvolume id
+ * This is subtle, we only call this during open_ctree(). We need to pre-load
+ * the mount options with the on-disk settings. Before the new mount API took
+ * effect we would do this on mount and remount. With the new mount API we'll
+ * only do this on the initial mount.
*
- * The value is later passed to mount_subvol()
+ * This isn't a change in behavior, because we're using the current state of the
+ * file system to set the current mount options. If you mounted with special
+ * options to disable these features and then remounted we wouldn't revert the
+ * settings, because mounting without these features cleared the on-disk
+ * settings, so this being called on re-mount is not needed.
*/
-static int btrfs_parse_subvol_options(const char *options, char **subvol_name,
- u64 *subvol_objectid)
+void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info)
{
- substring_t args[MAX_OPT_ARGS];
- char *opts, *orig, *p;
- int error = 0;
- u64 subvolid;
-
- if (!options)
- return 0;
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ if (!btrfs_test_opt(fs_info, FREE_SPACE_TREE)) {
+ btrfs_info(fs_info,
+ "forcing free space tree for sector size %u with page size %lu",
+ fs_info->sectorsize, PAGE_SIZE);
+ btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ }
+ }
/*
- * strsep changes the string, duplicate it because
- * btrfs_parse_device_options gets called later
+ * At this point our mount options are populated, so we only mess with
+ * these settings if we don't have any settings already.
*/
- opts = kstrdup(options, GFP_KERNEL);
- if (!opts)
- return -ENOMEM;
- orig = opts;
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+ return;
- while ((p = strsep(&opts, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
+ if (btrfs_is_zoned(fs_info) &&
+ btrfs_free_space_cache_v1_active(fs_info)) {
+ btrfs_info(fs_info, "zoned: clearing existing space cache");
+ btrfs_set_super_cache_generation(fs_info->super_copy, 0);
+ return;
+ }
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_subvol:
- kfree(*subvol_name);
- *subvol_name = match_strdup(&args[0]);
- if (!*subvol_name) {
- error = -ENOMEM;
- goto out;
- }
- break;
- case Opt_subvolid:
- error = match_u64(&args[0], &subvolid);
- if (error)
- goto out;
+ if (btrfs_test_opt(fs_info, SPACE_CACHE))
+ return;
- /* we want the original fs_tree */
- if (subvolid == 0)
- subvolid = BTRFS_FS_TREE_OBJECTID;
+ if (btrfs_test_opt(fs_info, NOSPACECACHE))
+ return;
- *subvol_objectid = subvolid;
- break;
- default:
- break;
- }
- }
+ /*
+ * At this point we don't have explicit options set by the user, set
+ * them ourselves based on the state of the file system.
+ */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ else if (btrfs_free_space_cache_v1_active(fs_info))
+ btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
+}
-out:
- kfree(orig);
- return error;
+static void set_device_specific_options(struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_test_opt(fs_info, NOSSD) &&
+ !fs_info->fs_devices->rotating)
+ btrfs_set_opt(fs_info->mount_opt, SSD);
+
+ /*
+ * For devices supporting discard turn on discard=async automatically,
+ * unless it's already set or disabled. This could be turned off by
+ * nodiscard for the same mount.
+ *
+ * The zoned mode piggy backs on the discard functionality for
+ * resetting a zone. There is no reason to delay the zone reset as it is
+ * fast enough. So, do not enable async discard for zoned mode.
+ */
+ if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) ||
+ btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
+ btrfs_test_opt(fs_info, NODISCARD)) &&
+ fs_info->fs_devices->discardable &&
+ !btrfs_is_zoned(fs_info))
+ btrfs_set_opt(fs_info->mount_opt, DISCARD_ASYNC);
}
char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
@@ -1105,10 +929,6 @@ static int btrfs_fill_super(struct super_block *sb,
#endif
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
- sb->s_flags |= SB_POSIXACL;
-#endif
- sb->s_flags |= SB_I_VERSION;
sb->s_iflags |= SB_I_CGROUPWB;
err = super_setup_bdi(sb);
@@ -1291,22 +1111,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
return 0;
}
-static int btrfs_test_super(struct super_block *s, void *data)
-{
- struct btrfs_fs_info *p = data;
- struct btrfs_fs_info *fs_info = btrfs_sb(s);
-
- return fs_info->fs_devices == p->fs_devices;
-}
-
-static int btrfs_set_super(struct super_block *s, void *data)
-{
- int err = set_anon_super(s, data);
- if (!err)
- s->s_fs_info = data;
- return err;
-}
-
/*
* subvolumes are identified by ino 256
*/
@@ -1382,200 +1186,6 @@ out:
return root;
}
-/*
- * Find a superblock for the given device / mount point.
- *
- * Note: This is based on mount_bdev from fs/super.c with a few additions
- * for multiple device setup. Make sure to keep it in sync.
- */
-static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
- int flags, const char *device_name, void *data)
-{
- struct block_device *bdev = NULL;
- struct super_block *s;
- struct btrfs_device *device = NULL;
- struct btrfs_fs_devices *fs_devices = NULL;
- struct btrfs_fs_info *fs_info = NULL;
- void *new_sec_opts = NULL;
- blk_mode_t mode = sb_open_mode(flags);
- int error = 0;
-
- if (data) {
- error = security_sb_eat_lsm_opts(data, &new_sec_opts);
- if (error)
- return ERR_PTR(error);
- }
-
- /*
- * Setup a dummy root and fs_info for test/set super. This is because
- * we don't actually fill this stuff out until open_ctree, but we need
- * then open_ctree will properly initialize the file system specific
- * settings later. btrfs_init_fs_info initializes the static elements
- * of the fs_info (locks and such) to make cleanup easier if we find a
- * superblock with our given fs_devices later on at sget() time.
- */
- fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
- if (!fs_info) {
- error = -ENOMEM;
- goto error_sec_opts;
- }
- btrfs_init_fs_info(fs_info);
-
- fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
- fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
- if (!fs_info->super_copy || !fs_info->super_for_commit) {
- error = -ENOMEM;
- goto error_fs_info;
- }
-
- mutex_lock(&uuid_mutex);
- error = btrfs_parse_device_options(data, mode);
- if (error) {
- mutex_unlock(&uuid_mutex);
- goto error_fs_info;
- }
-
- /*
- * With 'true' passed to btrfs_scan_one_device() (mount time) we expect
- * either a valid device or an error.
- */
- device = btrfs_scan_one_device(device_name, mode, true);
- ASSERT(device != NULL);
- if (IS_ERR(device)) {
- mutex_unlock(&uuid_mutex);
- error = PTR_ERR(device);
- goto error_fs_info;
- }
-
- fs_devices = device->fs_devices;
- fs_info->fs_devices = fs_devices;
-
- error = btrfs_open_devices(fs_devices, mode, fs_type);
- mutex_unlock(&uuid_mutex);
- if (error)
- goto error_fs_info;
-
- if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
- error = -EACCES;
- goto error_close_devices;
- }
-
- bdev = fs_devices->latest_dev->bdev;
- s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
- fs_info);
- if (IS_ERR(s)) {
- error = PTR_ERR(s);
- goto error_close_devices;
- }
-
- if (s->s_root) {
- btrfs_close_devices(fs_devices);
- btrfs_free_fs_info(fs_info);
- if ((flags ^ s->s_flags) & SB_RDONLY)
- error = -EBUSY;
- } else {
- snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
- shrinker_debugfs_rename(s->s_shrink, "sb-%s:%s", fs_type->name,
- s->s_id);
- btrfs_sb(s)->bdev_holder = fs_type;
- error = btrfs_fill_super(s, fs_devices, data);
- }
- if (!error)
- error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL);
- security_free_mnt_opts(&new_sec_opts);
- if (error) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
-
- return dget(s->s_root);
-
-error_close_devices:
- btrfs_close_devices(fs_devices);
-error_fs_info:
- btrfs_free_fs_info(fs_info);
-error_sec_opts:
- security_free_mnt_opts(&new_sec_opts);
- return ERR_PTR(error);
-}
-
-/*
- * Mount function which is called by VFS layer.
- *
- * In order to allow mounting a subvolume directly, btrfs uses mount_subtree()
- * which needs vfsmount* of device's root (/). This means device's root has to
- * be mounted internally in any case.
- *
- * Operation flow:
- * 1. Parse subvol id related options for later use in mount_subvol().
- *
- * 2. Mount device's root (/) by calling vfs_kern_mount().
- *
- * NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the
- * first place. In order to avoid calling btrfs_mount() again, we use
- * different file_system_type which is not registered to VFS by
- * register_filesystem() (btrfs_root_fs_type). As a result,
- * btrfs_mount_root() is called. The return value will be used by
- * mount_subtree() in mount_subvol().
- *
- * 3. Call mount_subvol() to get the dentry of subvolume. Since there is
- * "btrfs subvolume set-default", mount_subvol() is called always.
- */
-static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
- const char *device_name, void *data)
-{
- struct vfsmount *mnt_root;
- struct dentry *root;
- char *subvol_name = NULL;
- u64 subvol_objectid = 0;
- int error = 0;
-
- error = btrfs_parse_subvol_options(data, &subvol_name,
- &subvol_objectid);
- if (error) {
- kfree(subvol_name);
- return ERR_PTR(error);
- }
-
- /* mount device's root (/) */
- mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data);
- if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) {
- if (flags & SB_RDONLY) {
- mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
- flags & ~SB_RDONLY, device_name, data);
- } else {
- mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
- flags | SB_RDONLY, device_name, data);
- if (IS_ERR(mnt_root)) {
- root = ERR_CAST(mnt_root);
- kfree(subvol_name);
- goto out;
- }
-
- down_write(&mnt_root->mnt_sb->s_umount);
- error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL);
- up_write(&mnt_root->mnt_sb->s_umount);
- if (error < 0) {
- root = ERR_PTR(error);
- mntput(mnt_root);
- kfree(subvol_name);
- goto out;
- }
- }
- }
- if (IS_ERR(mnt_root)) {
- root = ERR_CAST(mnt_root);
- kfree(subvol_name);
- goto out;
- }
-
- /* mount_subvol() will free subvol_name and mnt_root */
- root = mount_subvol(subvol_name, subvol_objectid, mnt_root);
-
-out:
- return root;
-}
-
static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
u32 new_pool_size, u32 old_pool_size)
{
@@ -1638,202 +1248,282 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
}
-static int btrfs_remount(struct super_block *sb, int *flags, char *data)
+static int btrfs_remount_rw(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- unsigned old_flags = sb->s_flags;
- unsigned long old_opts = fs_info->mount_opt;
- unsigned long old_compress_type = fs_info->compress_type;
- u64 old_max_inline = fs_info->max_inline;
- u32 old_thread_pool_size = fs_info->thread_pool_size;
- u32 old_metadata_ratio = fs_info->metadata_ratio;
int ret;
- sync_filesystem(sb);
- set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+ if (BTRFS_FS_ERROR(fs_info)) {
+ btrfs_err(fs_info,
+ "remounting read-write after error is not allowed");
+ return -EINVAL;
+ }
- if (data) {
- void *new_sec_opts = NULL;
+ if (fs_info->fs_devices->rw_devices == 0)
+ return -EACCES;
- ret = security_sb_eat_lsm_opts(data, &new_sec_opts);
- if (!ret)
- ret = security_sb_remount(sb, new_sec_opts);
- security_free_mnt_opts(&new_sec_opts);
- if (ret)
- goto restore;
+ if (!btrfs_check_rw_degradable(fs_info, NULL)) {
+ btrfs_warn(fs_info,
+ "too many missing devices, writable remount is not allowed");
+ return -EACCES;
+ }
+
+ if (btrfs_super_log_root(fs_info->super_copy) != 0) {
+ btrfs_warn(fs_info,
+ "mount required to replay tree-log, cannot remount read-write");
+ return -EINVAL;
}
- ret = btrfs_parse_options(fs_info, data, *flags);
+ /*
+ * NOTE: when remounting with a change that does writes, don't put it
+ * anywhere above this point, as we are not sure to be safe to write
+ * until we pass the above checks.
+ */
+ ret = btrfs_start_pre_rw_mount(fs_info);
if (ret)
- goto restore;
+ return ret;
- ret = btrfs_check_features(fs_info, !(*flags & SB_RDONLY));
- if (ret < 0)
- goto restore;
+ btrfs_clear_sb_rdonly(fs_info->sb);
- btrfs_remount_begin(fs_info, old_opts, *flags);
- btrfs_resize_thread_pool(fs_info,
- fs_info->thread_pool_size, old_thread_pool_size);
+ set_bit(BTRFS_FS_OPEN, &fs_info->flags);
- if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
- (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
- (!sb_rdonly(sb) || (*flags & SB_RDONLY))) {
- btrfs_warn(fs_info,
- "remount supports changing free space tree only from ro to rw");
- /* Make sure free space cache options match the state on disk */
- if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
- btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
- }
- if (btrfs_free_space_cache_v1_active(fs_info)) {
- btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE);
- btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
- }
- }
+ /*
+ * If we've gone from readonly -> read-write, we need to get our
+ * sync/async discard lists in the right state.
+ */
+ btrfs_discard_resume(fs_info);
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
- goto out;
+ return 0;
+}
- if (*flags & SB_RDONLY) {
- /*
- * this also happens on 'umount -rf' or on shutdown, when
- * the filesystem is busy.
- */
- cancel_work_sync(&fs_info->async_reclaim_work);
- cancel_work_sync(&fs_info->async_data_reclaim_work);
+static int btrfs_remount_ro(struct btrfs_fs_info *fs_info)
+{
+ /*
+ * This also happens on 'umount -rf' or on shutdown, when the
+ * filesystem is busy.
+ */
+ cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
- btrfs_discard_cleanup(fs_info);
+ btrfs_discard_cleanup(fs_info);
- /* wait for the uuid_scan task to finish */
- down(&fs_info->uuid_tree_rescan_sem);
- /* avoid complains from lockdep et al. */
- up(&fs_info->uuid_tree_rescan_sem);
+ /* Wait for the uuid_scan task to finish */
+ down(&fs_info->uuid_tree_rescan_sem);
+ /* Avoid complains from lockdep et al. */
+ up(&fs_info->uuid_tree_rescan_sem);
- btrfs_set_sb_rdonly(sb);
+ btrfs_set_sb_rdonly(fs_info->sb);
- /*
- * Setting SB_RDONLY will put the cleaner thread to
- * sleep at the next loop if it's already active.
- * If it's already asleep, we'll leave unused block
- * groups on disk until we're mounted read-write again
- * unless we clean them up here.
- */
- btrfs_delete_unused_bgs(fs_info);
+ /*
+ * Setting SB_RDONLY will put the cleaner thread to sleep at the next
+ * loop if it's already active. If it's already asleep, we'll leave
+ * unused block groups on disk until we're mounted read-write again
+ * unless we clean them up here.
+ */
+ btrfs_delete_unused_bgs(fs_info);
- /*
- * The cleaner task could be already running before we set the
- * flag BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock).
- * We must make sure that after we finish the remount, i.e. after
- * we call btrfs_commit_super(), the cleaner can no longer start
- * a transaction - either because it was dropping a dead root,
- * running delayed iputs or deleting an unused block group (the
- * cleaner picked a block group from the list of unused block
- * groups before we were able to in the previous call to
- * btrfs_delete_unused_bgs()).
- */
- wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING,
- TASK_UNINTERRUPTIBLE);
+ /*
+ * The cleaner task could be already running before we set the flag
+ * BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock). We must make
+ * sure that after we finish the remount, i.e. after we call
+ * btrfs_commit_super(), the cleaner can no longer start a transaction
+ * - either because it was dropping a dead root, running delayed iputs
+ * or deleting an unused block group (the cleaner picked a block
+ * group from the list of unused block groups before we were able to
+ * in the previous call to btrfs_delete_unused_bgs()).
+ */
+ wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING, TASK_UNINTERRUPTIBLE);
- /*
- * We've set the superblock to RO mode, so we might have made
- * the cleaner task sleep without running all pending delayed
- * iputs. Go through all the delayed iputs here, so that if an
- * unmount happens without remounting RW we don't end up at
- * finishing close_ctree() with a non-empty list of delayed
- * iputs.
- */
- btrfs_run_delayed_iputs(fs_info);
+ /*
+ * We've set the superblock to RO mode, so we might have made the
+ * cleaner task sleep without running all pending delayed iputs. Go
+ * through all the delayed iputs here, so that if an unmount happens
+ * without remounting RW we don't end up at finishing close_ctree()
+ * with a non-empty list of delayed iputs.
+ */
+ btrfs_run_delayed_iputs(fs_info);
- btrfs_dev_replace_suspend_for_unmount(fs_info);
- btrfs_scrub_cancel(fs_info);
- btrfs_pause_balance(fs_info);
+ btrfs_dev_replace_suspend_for_unmount(fs_info);
+ btrfs_scrub_cancel(fs_info);
+ btrfs_pause_balance(fs_info);
- /*
- * Pause the qgroup rescan worker if it is running. We don't want
- * it to be still running after we are in RO mode, as after that,
- * by the time we unmount, it might have left a transaction open,
- * so we would leak the transaction and/or crash.
- */
- btrfs_qgroup_wait_for_completion(fs_info, false);
+ /*
+ * Pause the qgroup rescan worker if it is running. We don't want it to
+ * be still running after we are in RO mode, as after that, by the time
+ * we unmount, it might have left a transaction open, so we would leak
+ * the transaction and/or crash.
+ */
+ btrfs_qgroup_wait_for_completion(fs_info, false);
- ret = btrfs_commit_super(fs_info);
- if (ret)
- goto restore;
- } else {
- if (BTRFS_FS_ERROR(fs_info)) {
- btrfs_err(fs_info,
- "Remounting read-write after error is not allowed");
- ret = -EINVAL;
- goto restore;
- }
- if (fs_info->fs_devices->rw_devices == 0) {
- ret = -EACCES;
- goto restore;
- }
+ return btrfs_commit_super(fs_info);
+}
- if (!btrfs_check_rw_degradable(fs_info, NULL)) {
- btrfs_warn(fs_info,
- "too many missing devices, writable remount is not allowed");
- ret = -EACCES;
- goto restore;
- }
+static void btrfs_ctx_to_info(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx)
+{
+ fs_info->max_inline = ctx->max_inline;
+ fs_info->commit_interval = ctx->commit_interval;
+ fs_info->metadata_ratio = ctx->metadata_ratio;
+ fs_info->thread_pool_size = ctx->thread_pool_size;
+ fs_info->mount_opt = ctx->mount_opt;
+ fs_info->compress_type = ctx->compress_type;
+ fs_info->compress_level = ctx->compress_level;
+}
- if (btrfs_super_log_root(fs_info->super_copy) != 0) {
- btrfs_warn(fs_info,
- "mount required to replay tree-log, cannot remount read-write");
- ret = -EINVAL;
- goto restore;
- }
+static void btrfs_info_to_ctx(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx)
+{
+ ctx->max_inline = fs_info->max_inline;
+ ctx->commit_interval = fs_info->commit_interval;
+ ctx->metadata_ratio = fs_info->metadata_ratio;
+ ctx->thread_pool_size = fs_info->thread_pool_size;
+ ctx->mount_opt = fs_info->mount_opt;
+ ctx->compress_type = fs_info->compress_type;
+ ctx->compress_level = fs_info->compress_level;
+}
- /*
- * NOTE: when remounting with a change that does writes, don't
- * put it anywhere above this point, as we are not sure to be
- * safe to write until we pass the above checks.
- */
- ret = btrfs_start_pre_rw_mount(fs_info);
- if (ret)
- goto restore;
+#define btrfs_info_if_set(fs_info, old_ctx, opt, fmt, args...) \
+do { \
+ if ((!old_ctx || !btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \
+ btrfs_raw_test_opt(fs_info->mount_opt, opt)) \
+ btrfs_info(fs_info, fmt, ##args); \
+} while (0)
+
+#define btrfs_info_if_unset(fs_info, old_ctx, opt, fmt, args...) \
+do { \
+ if ((old_ctx && btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \
+ !btrfs_raw_test_opt(fs_info->mount_opt, opt)) \
+ btrfs_info(fs_info, fmt, ##args); \
+} while (0)
+
+static void btrfs_emit_options(struct btrfs_fs_info *info,
+ struct btrfs_fs_context *old)
+{
+ btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
+ btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts");
+ btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
+ btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations");
+ btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme");
+ btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers");
+ btrfs_info_if_set(info, old, NOTREELOG, "disabling tree log");
+ btrfs_info_if_set(info, old, NOLOGREPLAY, "disabling log replay at mount time");
+ btrfs_info_if_set(info, old, FLUSHONCOMMIT, "turning on flush-on-commit");
+ btrfs_info_if_set(info, old, DISCARD_SYNC, "turning on sync discard");
+ btrfs_info_if_set(info, old, DISCARD_ASYNC, "turning on async discard");
+ btrfs_info_if_set(info, old, FREE_SPACE_TREE, "enabling free space tree");
+ btrfs_info_if_set(info, old, SPACE_CACHE, "enabling disk space caching");
+ btrfs_info_if_set(info, old, CLEAR_CACHE, "force clearing of disk cache");
+ btrfs_info_if_set(info, old, AUTO_DEFRAG, "enabling auto defrag");
+ btrfs_info_if_set(info, old, FRAGMENT_DATA, "fragmenting data");
+ btrfs_info_if_set(info, old, FRAGMENT_METADATA, "fragmenting metadata");
+ btrfs_info_if_set(info, old, REF_VERIFY, "doing ref verification");
+ btrfs_info_if_set(info, old, USEBACKUPROOT, "trying to use backup root at mount time");
+ btrfs_info_if_set(info, old, IGNOREBADROOTS, "ignoring bad roots");
+ btrfs_info_if_set(info, old, IGNOREDATACSUMS, "ignoring data csums");
+
+ btrfs_info_if_unset(info, old, NODATACOW, "setting datacow");
+ btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations");
+ btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme");
+ btrfs_info_if_unset(info, old, NOBARRIER, "turning off barriers");
+ btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log");
+ btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching");
+ btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree");
+ btrfs_info_if_unset(info, old, AUTO_DEFRAG, "disabling auto defrag");
+ btrfs_info_if_unset(info, old, COMPRESS, "use no compression");
+
+ /* Did the compression settings change? */
+ if (btrfs_test_opt(info, COMPRESS) &&
+ (!old ||
+ old->compress_type != info->compress_type ||
+ old->compress_level != info->compress_level ||
+ (!btrfs_raw_test_opt(old->mount_opt, FORCE_COMPRESS) &&
+ btrfs_raw_test_opt(info->mount_opt, FORCE_COMPRESS)))) {
+ const char *compress_type = btrfs_compress_type2str(info->compress_type);
+
+ btrfs_info(info, "%s %s compression, level %d",
+ btrfs_test_opt(info, FORCE_COMPRESS) ? "force" : "use",
+ compress_type, info->compress_level);
+ }
- btrfs_clear_sb_rdonly(sb);
+ if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
+ btrfs_info(info, "max_inline set to %llu", info->max_inline);
+}
- set_bit(BTRFS_FS_OPEN, &fs_info->flags);
+static int btrfs_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct btrfs_fs_context old_ctx;
+ int ret = 0;
+ bool mount_reconfigure = (fc->s_fs_info != NULL);
- /*
- * If we've gone from readonly -> read/write, we need to get
- * our sync/async discard lists in the right state.
- */
- btrfs_discard_resume(fs_info);
+ btrfs_info_to_ctx(fs_info, &old_ctx);
+
+ /*
+ * This is our "bind mount" trick, we don't want to allow the user to do
+ * anything other than mount a different ro/rw and a different subvol,
+ * all of the mount options should be maintained.
+ */
+ if (mount_reconfigure)
+ ctx->mount_opt = old_ctx.mount_opt;
+
+ sync_filesystem(sb);
+ set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+
+ if (!mount_reconfigure &&
+ !btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags))
+ return -EINVAL;
+
+ ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY));
+ if (ret < 0)
+ return ret;
+
+ btrfs_ctx_to_info(fs_info, ctx);
+ btrfs_remount_begin(fs_info, old_ctx.mount_opt, fc->sb_flags);
+ btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size,
+ old_ctx.thread_pool_size);
+
+ if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
+ (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ (!sb_rdonly(sb) || (fc->sb_flags & SB_RDONLY))) {
+ btrfs_warn(fs_info,
+ "remount supports changing free space tree only from RO to RW");
+ /* Make sure free space cache options match the state on disk. */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ }
+ if (btrfs_free_space_cache_v1_active(fs_info)) {
+ btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
+ }
}
-out:
+
+ ret = 0;
+ if (!sb_rdonly(sb) && (fc->sb_flags & SB_RDONLY))
+ ret = btrfs_remount_ro(fs_info);
+ else if (sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY))
+ ret = btrfs_remount_rw(fs_info);
+ if (ret)
+ goto restore;
+
/*
- * We need to set SB_I_VERSION here otherwise it'll get cleared by VFS,
- * since the absence of the flag means it can be toggled off by remount.
+ * If we set the mask during the parameter parsing VFS would reject the
+ * remount. Here we can set the mask and the value will be updated
+ * appropriately.
*/
- *flags |= SB_I_VERSION;
+ if ((fc->sb_flags & SB_POSIXACL) != (sb->s_flags & SB_POSIXACL))
+ fc->sb_flags_mask |= SB_POSIXACL;
+ btrfs_emit_options(fs_info, &old_ctx);
wake_up_process(fs_info->transaction_kthread);
- btrfs_remount_cleanup(fs_info, old_opts);
+ btrfs_remount_cleanup(fs_info, old_ctx.mount_opt);
btrfs_clear_oneshot_options(fs_info);
clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
return 0;
-
restore:
- /* We've hit an error - don't reset SB_RDONLY */
- if (sb_rdonly(sb))
- old_flags |= SB_RDONLY;
- if (!(old_flags & SB_RDONLY))
- clear_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
- sb->s_flags = old_flags;
- fs_info->mount_opt = old_opts;
- fs_info->compress_type = old_compress_type;
- fs_info->max_inline = old_max_inline;
- btrfs_resize_thread_pool(fs_info,
- old_thread_pool_size, fs_info->thread_pool_size);
- fs_info->metadata_ratio = old_metadata_ratio;
- btrfs_remount_cleanup(fs_info, old_opts);
+ btrfs_ctx_to_info(fs_info, &old_ctx);
+ btrfs_remount_cleanup(fs_info, old_ctx.mount_opt);
clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
-
return ret;
}
@@ -2094,6 +1784,309 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
+static int btrfs_fc_test_super(struct super_block *sb, struct fs_context *fc)
+{
+ struct btrfs_fs_info *p = fc->s_fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+
+ return fs_info->fs_devices == p->fs_devices;
+}
+
+static int btrfs_get_tree_super(struct fs_context *fc)
+{
+ struct btrfs_fs_info *fs_info = fc->s_fs_info;
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct btrfs_fs_devices *fs_devices = NULL;
+ struct block_device *bdev;
+ struct btrfs_device *device;
+ struct super_block *sb;
+ blk_mode_t mode = btrfs_open_mode(fc);
+ int ret;
+
+ btrfs_ctx_to_info(fs_info, ctx);
+ mutex_lock(&uuid_mutex);
+
+ /*
+ * With 'true' passed to btrfs_scan_one_device() (mount time) we expect
+ * either a valid device or an error.
+ */
+ device = btrfs_scan_one_device(fc->source, mode, true);
+ ASSERT(device != NULL);
+ if (IS_ERR(device)) {
+ mutex_unlock(&uuid_mutex);
+ return PTR_ERR(device);
+ }
+
+ fs_devices = device->fs_devices;
+ fs_info->fs_devices = fs_devices;
+
+ ret = btrfs_open_devices(fs_devices, mode, &btrfs_fs_type);
+ mutex_unlock(&uuid_mutex);
+ if (ret)
+ return ret;
+
+ if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
+ ret = -EACCES;
+ goto error;
+ }
+
+ bdev = fs_devices->latest_dev->bdev;
+
+ /*
+ * From now on the error handling is not straightforward.
+ *
+ * If successful, this will transfer the fs_info into the super block,
+ * and fc->s_fs_info will be NULL. However if there's an existing
+ * super, we'll still have fc->s_fs_info populated. If we error
+ * completely out it'll be cleaned up when we drop the fs_context,
+ * otherwise it's tied to the lifetime of the super_block.
+ */
+ sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc);
+ if (IS_ERR(sb)) {
+ ret = PTR_ERR(sb);
+ goto error;
+ }
+
+ set_device_specific_options(fs_info);
+
+ if (sb->s_root) {
+ btrfs_close_devices(fs_devices);
+ if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY)
+ ret = -EBUSY;
+ } else {
+ snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
+ shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id);
+ btrfs_sb(sb)->bdev_holder = &btrfs_fs_type;
+ ret = btrfs_fill_super(sb, fs_devices, NULL);
+ }
+
+ if (ret) {
+ deactivate_locked_super(sb);
+ return ret;
+ }
+
+ btrfs_clear_oneshot_options(fs_info);
+
+ fc->root = dget(sb->s_root);
+ return 0;
+
+error:
+ btrfs_close_devices(fs_devices);
+ return ret;
+}
+
+/*
+ * Ever since commit 0723a0473fb4 ("btrfs: allow mounting btrfs subvolumes
+ * with different ro/rw options") the following works:
+ *
+ * (i) mount /dev/sda3 -o subvol=foo,ro /mnt/foo
+ * (ii) mount /dev/sda3 -o subvol=bar,rw /mnt/bar
+ *
+ * which looks nice and innocent but is actually pretty intricate and deserves
+ * a long comment.
+ *
+ * On another filesystem a subvolume mount is close to something like:
+ *
+ * (iii) # create rw superblock + initial mount
+ * mount -t xfs /dev/sdb /opt/
+ *
+ * # create ro bind mount
+ * mount --bind -o ro /opt/foo /mnt/foo
+ *
+ * # unmount initial mount
+ * umount /opt
+ *
+ * Of course, there's some special subvolume sauce and there's the fact that the
+ * sb->s_root dentry is really swapped after mount_subtree(). But conceptually
+ * it's very close and will help us understand the issue.
+ *
+ * The old mount API didn't cleanly distinguish between a mount being made ro
+ * and a superblock being made ro. The only way to change the ro state of
+ * either object was by passing ms_rdonly. If a new mount was created via
+ * mount(2) such as:
+ *
+ * mount("/dev/sdb", "/mnt", "xfs", ms_rdonly, null);
+ *
+ * the MS_RDONLY flag being specified had two effects:
+ *
+ * (1) MNT_READONLY was raised -> the resulting mount got
+ * @mnt->mnt_flags |= MNT_READONLY raised.
+ *
+ * (2) MS_RDONLY was passed to the filesystem's mount method and the filesystems
+ * made the superblock ro. Note, how SB_RDONLY has the same value as
+ * ms_rdonly and is raised whenever MS_RDONLY is passed through mount(2).
+ *
+ * Creating a subtree mount via (iii) ends up leaving a rw superblock with a
+ * subtree mounted ro.
+ *
+ * But consider the effect on the old mount API on btrfs subvolume mounting
+ * which combines the distinct step in (iii) into a single step.
+ *
+ * By issuing (i) both the mount and the superblock are turned ro. Now when (ii)
+ * is issued the superblock is ro and thus even if the mount created for (ii) is
+ * rw it wouldn't help. Hence, btrfs needed to transition the superblock from ro
+ * to rw for (ii) which it did using an internal remount call.
+ *
+ * IOW, subvolume mounting was inherently complicated due to the ambiguity of
+ * MS_RDONLY in mount(2). Note, this ambiguity has mount(8) always translate
+ * "ro" to MS_RDONLY. IOW, in both (i) and (ii) "ro" becomes MS_RDONLY when
+ * passed by mount(8) to mount(2).
+ *
+ * Enter the new mount API. The new mount API disambiguates making a mount ro
+ * and making a superblock ro.
+ *
+ * (3) To turn a mount ro the MOUNT_ATTR_ONLY flag can be used with either
+ * fsmount() or mount_setattr() this is a pure VFS level change for a
+ * specific mount or mount tree that is never seen by the filesystem itself.
+ *
+ * (4) To turn a superblock ro the "ro" flag must be used with
+ * fsconfig(FSCONFIG_SET_FLAG, "ro"). This option is seen by the filesystem
+ * in fc->sb_flags.
+ *
+ * This disambiguation has rather positive consequences. Mounting a subvolume
+ * ro will not also turn the superblock ro. Only the mount for the subvolume
+ * will become ro.
+ *
+ * So, if the superblock creation request comes from the new mount API the
+ * caller must have explicitly done:
+ *
+ * fsconfig(FSCONFIG_SET_FLAG, "ro")
+ * fsmount/mount_setattr(MOUNT_ATTR_RDONLY)
+ *
+ * IOW, at some point the caller must have explicitly turned the whole
+ * superblock ro and we shouldn't just undo it like we did for the old mount
+ * API. In any case, it lets us avoid the hack in the new mount API.
+ *
+ * Consequently, the remounting hack must only be used for requests originating
+ * from the old mount API and should be marked for full deprecation so it can be
+ * turned off in a couple of years.
+ *
+ * The new mount API has no reason to support this hack.
+ */
+static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc)
+{
+ struct vfsmount *mnt;
+ int ret;
+ const bool ro2rw = !(fc->sb_flags & SB_RDONLY);
+
+ /*
+ * We got an EBUSY because our SB_RDONLY flag didn't match the existing
+ * super block, so invert our setting here and retry the mount so we
+ * can get our vfsmount.
+ */
+ if (ro2rw)
+ fc->sb_flags |= SB_RDONLY;
+ else
+ fc->sb_flags &= ~SB_RDONLY;
+
+ mnt = fc_mount(fc);
+ if (IS_ERR(mnt))
+ return mnt;
+
+ if (!fc->oldapi || !ro2rw)
+ return mnt;
+
+ /* We need to convert to rw, call reconfigure. */
+ fc->sb_flags &= ~SB_RDONLY;
+ down_write(&mnt->mnt_sb->s_umount);
+ ret = btrfs_reconfigure(fc);
+ up_write(&mnt->mnt_sb->s_umount);
+ if (ret) {
+ mntput(mnt);
+ return ERR_PTR(ret);
+ }
+ return mnt;
+}
+
+static int btrfs_get_tree_subvol(struct fs_context *fc)
+{
+ struct btrfs_fs_info *fs_info = NULL;
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct fs_context *dup_fc;
+ struct dentry *dentry;
+ struct vfsmount *mnt;
+
+ /*
+ * Setup a dummy root and fs_info for test/set super. This is because
+ * we don't actually fill this stuff out until open_ctree, but we need
+ * then open_ctree will properly initialize the file system specific
+ * settings later. btrfs_init_fs_info initializes the static elements
+ * of the fs_info (locks and such) to make cleanup easier if we find a
+ * superblock with our given fs_devices later on at sget() time.
+ */
+ fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
+ if (!fs_info)
+ return -ENOMEM;
+
+ fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
+ fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
+ if (!fs_info->super_copy || !fs_info->super_for_commit) {
+ btrfs_free_fs_info(fs_info);
+ return -ENOMEM;
+ }
+ btrfs_init_fs_info(fs_info);
+
+ dup_fc = vfs_dup_fs_context(fc);
+ if (IS_ERR(dup_fc)) {
+ btrfs_free_fs_info(fs_info);
+ return PTR_ERR(dup_fc);
+ }
+
+ /*
+ * When we do the sget_fc this gets transferred to the sb, so we only
+ * need to set it on the dup_fc as this is what creates the super block.
+ */
+ dup_fc->s_fs_info = fs_info;
+
+ /*
+ * We'll do the security settings in our btrfs_get_tree_super() mount
+ * loop, they were duplicated into dup_fc, we can drop the originals
+ * here.
+ */
+ security_free_mnt_opts(&fc->security);
+ fc->security = NULL;
+
+ mnt = fc_mount(dup_fc);
+ if (PTR_ERR_OR_ZERO(mnt) == -EBUSY)
+ mnt = btrfs_reconfigure_for_mount(dup_fc);
+ put_fs_context(dup_fc);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+
+ /*
+ * This free's ->subvol_name, because if it isn't set we have to
+ * allocate a buffer to hold the subvol_name, so we just drop our
+ * reference to it here.
+ */
+ dentry = mount_subvol(ctx->subvol_name, ctx->subvol_objectid, mnt);
+ ctx->subvol_name = NULL;
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ fc->root = dentry;
+ return 0;
+}
+
+static int btrfs_get_tree(struct fs_context *fc)
+{
+ /*
+ * Since we use mount_subtree to mount the default/specified subvol, we
+ * have to do mounts in two steps.
+ *
+ * First pass through we call btrfs_get_tree_subvol(), this is just a
+ * wrapper around fc_mount() to call back into here again, and this time
+ * we'll call btrfs_get_tree_super(). This will do the open_ctree() and
+ * everything to open the devices and file system. Then we return back
+ * with a fully constructed vfsmount in btrfs_get_tree_subvol(), and
+ * from there we can do our mount_subvol() call, which will lookup
+ * whichever subvol we're mounting and setup this fc with the
+ * appropriate dentry for the subvol.
+ */
+ if (fc->s_fs_info)
+ return btrfs_get_tree_super(fc);
+ return btrfs_get_tree_subvol(fc);
+}
+
static void btrfs_kill_super(struct super_block *sb)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -2101,22 +2094,85 @@ static void btrfs_kill_super(struct super_block *sb)
btrfs_free_fs_info(fs_info);
}
-static struct file_system_type btrfs_fs_type = {
- .owner = THIS_MODULE,
- .name = "btrfs",
- .mount = btrfs_mount,
- .kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
-};
+static void btrfs_free_fs_context(struct fs_context *fc)
+{
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct btrfs_fs_info *fs_info = fc->s_fs_info;
+
+ if (fs_info)
+ btrfs_free_fs_info(fs_info);
+
+ if (ctx && refcount_dec_and_test(&ctx->refs)) {
+ kfree(ctx->subvol_name);
+ kfree(ctx);
+ }
+}
+
+static int btrfs_dup_fs_context(struct fs_context *fc, struct fs_context *src_fc)
+{
+ struct btrfs_fs_context *ctx = src_fc->fs_private;
-static struct file_system_type btrfs_root_fs_type = {
- .owner = THIS_MODULE,
- .name = "btrfs",
- .mount = btrfs_mount_root,
- .kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
+ /*
+ * Give a ref to our ctx to this dup, as we want to keep it around for
+ * our original fc so we can have the subvolume name or objectid.
+ *
+ * We unset ->source in the original fc because the dup needs it for
+ * mounting, and then once we free the dup it'll free ->source, so we
+ * need to make sure we're only pointing to it in one fc.
+ */
+ refcount_inc(&ctx->refs);
+ fc->fs_private = ctx;
+ fc->source = src_fc->source;
+ src_fc->source = NULL;
+ return 0;
+}
+
+static const struct fs_context_operations btrfs_fs_context_ops = {
+ .parse_param = btrfs_parse_param,
+ .reconfigure = btrfs_reconfigure,
+ .get_tree = btrfs_get_tree,
+ .dup = btrfs_dup_fs_context,
+ .free = btrfs_free_fs_context,
};
+static int btrfs_init_fs_context(struct fs_context *fc)
+{
+ struct btrfs_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct btrfs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ refcount_set(&ctx->refs, 1);
+ fc->fs_private = ctx;
+ fc->ops = &btrfs_fs_context_ops;
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ btrfs_info_to_ctx(btrfs_sb(fc->root->d_sb), ctx);
+ } else {
+ ctx->thread_pool_size =
+ min_t(unsigned long, num_online_cpus() + 2, 8);
+ ctx->max_inline = BTRFS_DEFAULT_MAX_INLINE;
+ ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+ }
+
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ fc->sb_flags |= SB_POSIXACL;
+#endif
+ fc->sb_flags |= SB_I_VERSION;
+
+ return 0;
+}
+
+static struct file_system_type btrfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "btrfs",
+ .init_fs_context = btrfs_init_fs_context,
+ .parameters = btrfs_fs_parameters,
+ .kill_sb = btrfs_kill_super,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
+ };
+
MODULE_ALIAS_FS("btrfs");
static int btrfs_control_open(struct inode *inode, struct file *file)
@@ -2328,7 +2384,6 @@ static const struct super_operations btrfs_super_ops = {
.destroy_inode = btrfs_destroy_inode,
.free_inode = btrfs_free_inode,
.statfs = btrfs_statfs,
- .remount_fs = btrfs_remount,
.freeze_fs = btrfs_freeze,
.unfreeze_fs = btrfs_unfreeze,
};
diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h
index 8dbb909b364f..f18253ca280d 100644
--- a/fs/btrfs/super.h
+++ b/fs/btrfs/super.h
@@ -3,11 +3,12 @@
#ifndef BTRFS_SUPER_H
#define BTRFS_SUPER_H
-int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
- unsigned long new_flags);
+bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt,
+ unsigned long flags);
int btrfs_sync_fs(struct super_block *sb, int wait);
char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
u64 subvol_objectid);
+void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info);
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index e6b51fb3ddc1..84c05246ffd8 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1783,6 +1783,10 @@ static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj,
unsigned long long limit;
limit = memparse(buf, &endptr);
+ /* There could be trailing '\n', also catch any typos after the value. */
+ endptr = skip_spaces(endptr);
+ if (*endptr != 0)
+ return -EINVAL;
WRITE_ONCE(device->scrub_speed_max, limit);
return len;
}
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index ca09cf9afce8..709c6cc9706a 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -28,6 +28,7 @@ const char *test_error[] = {
[TEST_ALLOC_INODE] = "cannot allocate inode",
[TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group",
[TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map",
+ [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map",
};
static const struct super_operations btrfs_test_super_ops = {
@@ -102,7 +103,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info)
if (!dev)
return ERR_PTR(-ENOMEM);
- extent_io_tree_init(NULL, &dev->alloc_state, 0);
+ extent_io_tree_init(fs_info, &dev->alloc_state, 0);
INIT_LIST_HEAD(&dev->dev_list);
list_add(&dev->dev_list, &fs_info->fs_devices->devices);
@@ -185,7 +186,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->buffer_lock);
- btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ btrfs_mapping_tree_free(fs_info);
list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
dev_list) {
btrfs_free_dummy_device(dev);
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 7a2d7ffbe30e..dc2f2ab15fa5 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -23,6 +23,7 @@ enum {
TEST_ALLOC_INODE,
TEST_ALLOC_BLOCK_GROUP,
TEST_ALLOC_EXTENT_MAP,
+ TEST_ALLOC_CHUNK_MAP,
};
extern const char *test_error[];
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 1cc86af97dc6..25b3349595e0 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -652,7 +652,7 @@ static void dump_eb_and_memory_contents(struct extent_buffer *eb, void *memory,
const char *test_name)
{
for (int i = 0; i < eb->len; i++) {
- struct page *page = eb->pages[i >> PAGE_SHIFT];
+ struct page *page = folio_page(eb->folios[i >> PAGE_SHIFT], 0);
void *addr = page_address(page) + offset_in_page(i);
if (memcmp(addr, memory + i, 1) != 0) {
@@ -668,7 +668,7 @@ static int verify_eb_and_memory(struct extent_buffer *eb, void *memory,
const char *test_name)
{
for (int i = 0; i < (eb->len >> PAGE_SHIFT); i++) {
- void *eb_addr = page_address(eb->pages[i]);
+ void *eb_addr = folio_address(eb->folios[i]);
if (memcmp(memory + (i << PAGE_SHIFT), eb_addr, PAGE_SIZE) != 0) {
dump_eb_and_memory_contents(eb, memory, test_name);
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 29bdd08b241f..253cce7ffecf 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -25,7 +25,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
#ifdef CONFIG_BTRFS_DEBUG
if (refcount_read(&em->refs) != 1) {
test_err(
-"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d",
+"em leak: em (start %llu len %llu block_start %llu block_len %llu) refs %d",
em->start, em->len, em->block_start,
em->block_len, refcount_read(&em->refs));
@@ -73,7 +73,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
em->block_start = 0;
em->block_len = SZ_16K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 16K)");
@@ -94,7 +94,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
em->block_start = SZ_32K; /* avoid merging */
em->block_len = SZ_4K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [16K, 20K)");
@@ -121,9 +121,14 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
test_err("case1 [%llu %llu]: ret %d", start, start + len, ret);
goto out;
}
- if (em &&
- (em->start != 0 || extent_map_end(em) != SZ_16K ||
- em->block_start != 0 || em->block_len != SZ_16K)) {
+ if (!em) {
+ test_err("case1 [%llu %llu]: no extent map returned",
+ start, start + len);
+ ret = -ENOENT;
+ goto out;
+ }
+ if (em->start != 0 || extent_map_end(em) != SZ_16K ||
+ em->block_start != 0 || em->block_len != SZ_16K) {
test_err(
"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
start, start + len, ret, em->start, em->len,
@@ -161,7 +166,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
em->block_start = EXTENT_MAP_INLINE;
em->block_len = (u64)-1;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 1K)");
@@ -182,7 +187,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
em->block_start = SZ_4K;
em->block_len = SZ_4K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [4K, 8K)");
@@ -209,9 +214,13 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
test_err("case2 [0 1K]: ret %d", ret);
goto out;
}
- if (em &&
- (em->start != 0 || extent_map_end(em) != SZ_1K ||
- em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) {
+ if (!em) {
+ test_err("case2 [0 1K]: no extent map returned");
+ ret = -ENOENT;
+ goto out;
+ }
+ if (em->start != 0 || extent_map_end(em) != SZ_1K ||
+ em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1) {
test_err(
"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
ret, em->start, em->len, em->block_start,
@@ -244,7 +253,7 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
em->block_start = SZ_4K;
em->block_len = SZ_4K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [4K, 8K)");
@@ -268,19 +277,24 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
if (ret) {
- test_err("case3 [0x%llx 0x%llx): ret %d",
+ test_err("case3 [%llu %llu): ret %d",
start, start + len, ret);
goto out;
}
+ if (!em) {
+ test_err("case3 [%llu %llu): no extent map returned",
+ start, start + len);
+ ret = -ENOENT;
+ goto out;
+ }
/*
* Since bytes within em are contiguous, em->block_start is identical to
* em->start.
*/
- if (em &&
- (start < em->start || start + len > extent_map_end(em) ||
- em->start != em->block_start || em->len != em->block_len)) {
+ if (start < em->start || start + len > extent_map_end(em) ||
+ em->start != em->block_start || em->len != em->block_len) {
test_err(
-"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
+"case3 [%llu %llu): ret %d em (start %llu len %llu block_start %llu block_len %llu)",
start, start + len, ret, em->start, em->len,
em->block_start, em->block_len);
ret = -EINVAL;
@@ -343,7 +357,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
em->block_start = 0;
em->block_len = SZ_8K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 8K)");
@@ -364,7 +378,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
em->block_start = SZ_16K; /* avoid merging */
em->block_len = 24 * SZ_1K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [8K, 32K)");
@@ -387,14 +401,20 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
if (ret) {
- test_err("case4 [0x%llx 0x%llx): ret %d",
- start, len, ret);
+ test_err("case4 [%llu %llu): ret %d",
+ start, start + len, ret);
goto out;
}
- if (em && (start < em->start || start + len > extent_map_end(em))) {
+ if (!em) {
+ test_err("case4 [%llu %llu): no extent map returned",
+ start, start + len);
+ ret = -ENOENT;
+ goto out;
+ }
+ if (start < em->start || start + len > extent_map_end(em)) {
test_err(
-"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
- start, len, ret, em->start, em->len, em->block_start,
+"case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu block_start %llu block_len %llu)",
+ start, start + len, ret, em->start, em->len, em->block_start,
em->block_len);
ret = -EINVAL;
}
@@ -443,7 +463,8 @@ static int test_case_4(struct btrfs_fs_info *fs_info,
return ret;
}
-static int add_compressed_extent(struct extent_map_tree *em_tree,
+static int add_compressed_extent(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree,
u64 start, u64 len, u64 block_start)
{
struct extent_map *em;
@@ -459,9 +480,9 @@ static int add_compressed_extent(struct extent_map_tree *em_tree,
em->len = len;
em->block_start = block_start;
em->block_len = SZ_4K;
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
free_extent_map(em);
if (ret < 0) {
@@ -567,7 +588,7 @@ static int validate_range(struct extent_map_tree *em_tree, int index)
* They'll have the EXTENT_FLAG_COMPRESSED flag set to keep the em tree from
* merging the em's.
*/
-static int test_case_5(void)
+static int test_case_5(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree;
struct inode *inode;
@@ -585,35 +606,35 @@ static int test_case_5(void)
em_tree = &BTRFS_I(inode)->extent_tree;
/* [0, 12k) */
- ret = add_compressed_extent(em_tree, 0, SZ_4K * 3, 0);
+ ret = add_compressed_extent(fs_info, em_tree, 0, SZ_4K * 3, 0);
if (ret) {
test_err("cannot add extent range [0, 12K)");
goto out;
}
/* [12k, 24k) */
- ret = add_compressed_extent(em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K);
+ ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
}
/* [24k, 36k) */
- ret = add_compressed_extent(em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K);
+ ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
}
/* [36k, 40k) */
- ret = add_compressed_extent(em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3);
+ ret = add_compressed_extent(fs_info, em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
}
/* [40k, 64k) */
- ret = add_compressed_extent(em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K);
+ ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
@@ -665,11 +686,11 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em
struct extent_map *em = NULL;
int ret;
- ret = add_compressed_extent(em_tree, 0, SZ_4K, 0);
+ ret = add_compressed_extent(fs_info, em_tree, 0, SZ_4K, 0);
if (ret)
goto out;
- ret = add_compressed_extent(em_tree, SZ_4K, SZ_4K, 0);
+ ret = add_compressed_extent(fs_info, em_tree, SZ_4K, SZ_4K, 0);
if (ret)
goto out;
@@ -713,7 +734,7 @@ out:
* true would mess up the start/end calculations and subsequent splits would be
* incorrect.
*/
-static int test_case_7(void)
+static int test_case_7(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
@@ -742,9 +763,9 @@ static int test_case_7(void)
em->len = SZ_16K;
em->block_start = 0;
em->block_len = SZ_4K;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->flags |= EXTENT_FLAG_PINNED;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("couldn't add extent map");
@@ -765,7 +786,7 @@ static int test_case_7(void)
em->block_start = SZ_32K;
em->block_len = SZ_16K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("couldn't add extent map");
@@ -859,33 +880,21 @@ struct rmap_test_vector {
static int test_rmap_block(struct btrfs_fs_info *fs_info,
struct rmap_test_vector *test)
{
- struct extent_map *em;
- struct map_lookup *map = NULL;
+ struct btrfs_chunk_map *map;
u64 *logical = NULL;
int i, out_ndaddrs, out_stripe_len;
int ret;
- em = alloc_extent_map();
- if (!em) {
- test_std_err(TEST_ALLOC_EXTENT_MAP);
- return -ENOMEM;
- }
-
- map = kmalloc(map_lookup_size(test->num_stripes), GFP_KERNEL);
+ map = btrfs_alloc_chunk_map(test->num_stripes, GFP_KERNEL);
if (!map) {
- kfree(em);
- test_std_err(TEST_ALLOC_EXTENT_MAP);
+ test_std_err(TEST_ALLOC_CHUNK_MAP);
return -ENOMEM;
}
- set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
/* Start at 4GiB logical address */
- em->start = SZ_4G;
- em->len = test->data_stripe_size * test->num_data_stripes;
- em->block_len = em->len;
- em->orig_block_len = test->data_stripe_size;
- em->map_lookup = map;
-
+ map->start = SZ_4G;
+ map->chunk_len = test->data_stripe_size * test->num_data_stripes;
+ map->stripe_size = test->data_stripe_size;
map->num_stripes = test->num_stripes;
map->type = test->raid_type;
@@ -901,15 +910,13 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
map->stripes[i].physical = test->data_stripe_phys_start[i];
}
- write_lock(&fs_info->mapping_tree.lock);
- ret = add_extent_mapping(&fs_info->mapping_tree, em, 0);
- write_unlock(&fs_info->mapping_tree.lock);
+ ret = btrfs_add_chunk_map(fs_info, map);
if (ret) {
- test_err("error adding block group mapping to mapping tree");
+ test_err("error adding chunk map to mapping tree");
goto out_free;
}
- ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1),
+ ret = btrfs_rmap_block(fs_info, map->start, btrfs_sb_offset(1),
&logical, &out_ndaddrs, &out_stripe_len);
if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) {
test_err("didn't rmap anything but expected %d",
@@ -938,14 +945,8 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
ret = 0;
out:
- write_lock(&fs_info->mapping_tree.lock);
- remove_extent_mapping(&fs_info->mapping_tree, em);
- write_unlock(&fs_info->mapping_tree.lock);
- /* For us */
- free_extent_map(em);
+ btrfs_remove_chunk_map(fs_info, map);
out_free:
- /* For the tree */
- free_extent_map(em);
kfree(logical);
return ret;
}
@@ -1022,13 +1023,13 @@ int btrfs_test_extent_map(void)
ret = test_case_4(fs_info, em_tree);
if (ret)
goto out;
- ret = test_case_5();
+ ret = test_case_5(fs_info);
if (ret)
goto out;
ret = test_case_6(fs_info, em_tree);
if (ret)
goto out;
- ret = test_case_7();
+ ret = test_case_7(fs_info);
if (ret)
goto out;
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 492d69d2fa73..9957de9f7806 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -211,9 +211,9 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
}
-static unsigned long prealloc_only = 0;
-static unsigned long compressed_only = 0;
-static unsigned long vacancy_only = 0;
+static u32 prealloc_only = 0;
+static u32 compressed_only = 0;
+static u32 vacancy_only = 0;
static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
{
@@ -305,7 +305,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
/*
@@ -332,7 +332,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
offset = em->start + em->len;
@@ -355,7 +355,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != em->start) {
@@ -383,7 +383,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != em->start) {
@@ -412,7 +412,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
offset = em->start + em->len;
@@ -434,7 +434,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != orig_start) {
@@ -468,7 +468,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != prealloc_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
prealloc_only, em->flags);
goto out;
}
@@ -497,7 +497,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != prealloc_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
prealloc_only, em->flags);
goto out;
}
@@ -527,7 +527,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != orig_start) {
@@ -560,7 +560,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != prealloc_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
prealloc_only, em->flags);
goto out;
}
@@ -595,7 +595,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != compressed_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
compressed_only, em->flags);
goto out;
}
@@ -604,9 +604,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start, em->orig_start);
goto out;
}
- if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, em->compress_type);
+ BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
goto out;
}
offset = em->start + em->len;
@@ -629,7 +629,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != compressed_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
compressed_only, em->flags);
goto out;
}
@@ -638,9 +638,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start, em->orig_start);
goto out;
}
- if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, em->compress_type);
+ BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
goto out;
}
disk_bytenr = em->block_start;
@@ -664,7 +664,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != em->start) {
@@ -692,7 +692,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != compressed_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
compressed_only, em->flags);
goto out;
}
@@ -701,9 +701,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start, orig_start);
goto out;
}
- if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, em->compress_type);
+ BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
goto out;
}
offset = em->start + em->len;
@@ -726,7 +726,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != em->start) {
@@ -758,7 +758,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != vacancy_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
vacancy_only, em->flags);
goto out;
}
@@ -786,7 +786,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != em->start) {
@@ -866,7 +866,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != vacancy_only) {
- test_err("wrong flags, wanted %lu, have %lu", vacancy_only,
+ test_err("wrong flags, wanted %u, have %u", vacancy_only,
em->flags);
goto out;
}
@@ -888,7 +888,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, wanted 0 got %lu",
+ test_err("unexpected flags set, wanted 0 got %u",
em->flags);
goto out;
}
@@ -1095,8 +1095,8 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize)
test_msg("running inode tests");
- set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
- set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
+ compressed_only |= EXTENT_FLAG_COMPRESS_ZLIB;
+ prealloc_only |= EXTENT_FLAG_PREALLOC;
ret = test_btrfs_get_extent(sectorsize, nodesize);
if (ret)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5b3333ceef04..bf8e64c766b6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -564,56 +564,22 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
u64 num_bytes,
u64 *delayed_refs_bytes)
{
- struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
- u64 extra_delayed_refs_bytes = 0;
- u64 bytes;
+ u64 bytes = num_bytes + *delayed_refs_bytes;
int ret;
/*
- * If there's a gap between the size of the delayed refs reserve and
- * its reserved space, than some tasks have added delayed refs or bumped
- * its size otherwise (due to block group creation or removal, or block
- * group item update). Also try to allocate that gap in order to prevent
- * using (and possibly abusing) the global reserve when committing the
- * transaction.
- */
- if (flush == BTRFS_RESERVE_FLUSH_ALL &&
- !btrfs_block_rsv_full(delayed_refs_rsv)) {
- spin_lock(&delayed_refs_rsv->lock);
- if (delayed_refs_rsv->size > delayed_refs_rsv->reserved)
- extra_delayed_refs_bytes = delayed_refs_rsv->size -
- delayed_refs_rsv->reserved;
- spin_unlock(&delayed_refs_rsv->lock);
- }
-
- bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes;
-
- /*
* We want to reserve all the bytes we may need all at once, so we only
* do 1 enospc flushing cycle per transaction start.
*/
ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
- if (ret == 0) {
- if (extra_delayed_refs_bytes > 0)
- btrfs_migrate_to_delayed_refs_rsv(fs_info,
- extra_delayed_refs_bytes);
- return 0;
- }
-
- if (extra_delayed_refs_bytes > 0) {
- bytes -= extra_delayed_refs_bytes;
- ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
- if (ret == 0)
- return 0;
- }
/*
* If we are an emergency flush, which can steal from the global block
* reserve, then attempt to not reserve space for the delayed refs, as
* we will consume space for them from the global block reserve.
*/
- if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+ if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
bytes -= *delayed_refs_bytes;
*delayed_refs_bytes = 0;
ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
@@ -1868,7 +1834,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
key.offset = (u64)-1;
- pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
+ pending->snap = btrfs_get_new_fs_root(fs_info, objectid, &pending->anon_dev);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
pending->snap = NULL;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 50fdc69fdddf..6eccf8496486 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1436,7 +1436,7 @@ static int check_extent_item(struct extent_buffer *leaf,
if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) {
extent_err(leaf, slot,
"inline ref item overflows extent item, ptr %lu iref size %u end %lu",
- ptr, inline_type, end);
+ ptr, btrfs_extent_inline_ref_size(inline_type), end);
return -EUCLEAN;
}
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index 3c2a02a72f64..14b9fbe82da4 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -22,7 +22,7 @@ struct btrfs_tree_parent_check {
/*
* Expected transid, can be 0 to skip the check, but such skip
- * should only be utlized for backref walk related code.
+ * should only be utilized for backref walk related code.
*/
u64 transid;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7d6729d9fd2f..331fc7429952 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2575,7 +2575,6 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans,
ret = btrfs_pin_reserved_extent(trans, eb);
if (ret)
return ret;
- btrfs_redirty_list_add(trans->transaction, eb);
} else {
unaccount_log_buffer(eb->fs_info, eb->start);
}
@@ -4520,7 +4519,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
int ret = 0;
if (inode->flags & BTRFS_INODE_NODATASUM ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ (em->flags & EXTENT_FLAG_PREALLOC) ||
em->block_start == EXTENT_MAP_HOLE)
return 0;
@@ -4583,7 +4582,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
return 0;
/* If we're compressed we have to save the entire range of csums. */
- if (em->compress_type) {
+ if (extent_map_is_compressed(em)) {
csum_offset = 0;
csum_len = max(em->block_len, em->orig_block_len);
} else {
@@ -4623,18 +4622,20 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item fi = { 0 };
struct extent_buffer *leaf;
struct btrfs_key key;
+ enum btrfs_compression_type compress_type;
u64 extent_offset = em->start - em->orig_start;
u64 block_len;
int ret;
btrfs_set_stack_file_extent_generation(&fi, trans->transid);
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ if (em->flags & EXTENT_FLAG_PREALLOC)
btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
else
btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
block_len = max(em->block_len, em->orig_block_len);
- if (em->compress_type != BTRFS_COMPRESS_NONE) {
+ compress_type = extent_map_compression(em);
+ if (compress_type != BTRFS_COMPRESS_NONE) {
btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
@@ -4646,7 +4647,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_offset(&fi, extent_offset);
btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
- btrfs_set_stack_file_extent_compression(&fi, em->compress_type);
+ btrfs_set_stack_file_extent_compression(&fi, compress_type);
ret = log_extent_csums(trans, inode, log, em, ctx);
if (ret)
@@ -4859,13 +4860,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
continue;
/* We log prealloc extents beyond eof later. */
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
+ if ((em->flags & EXTENT_FLAG_PREALLOC) &&
em->start >= i_size_read(&inode->vfs_inode))
continue;
/* Need a ref to keep it from getting evicted from cache */
refcount_inc(&em->refs);
- set_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ em->flags |= EXTENT_FLAG_LOGGING;
list_add_tail(&em->list, &extents);
num++;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f627674b37db..d67785be2c77 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -41,6 +41,17 @@
BTRFS_BLOCK_GROUP_RAID10 | \
BTRFS_BLOCK_GROUP_RAID56_MASK)
+struct btrfs_io_geometry {
+ u32 stripe_index;
+ u32 stripe_nr;
+ int mirror_num;
+ int num_stripes;
+ u64 stripe_offset;
+ u64 raid56_full_stripe_start;
+ int max_errors;
+ enum btrfs_map_op op;
+};
+
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
.sub_stripes = 2,
@@ -1742,19 +1753,18 @@ out:
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *em_tree;
- struct extent_map *em;
struct rb_node *n;
u64 ret = 0;
- em_tree = &fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- n = rb_last(&em_tree->map.rb_root);
+ read_lock(&fs_info->mapping_tree_lock);
+ n = rb_last(&fs_info->mapping_tree.rb_root);
if (n) {
- em = rb_entry(n, struct extent_map, rb_node);
- ret = em->start + em->len;
+ struct btrfs_chunk_map *map;
+
+ map = rb_entry(n, struct btrfs_chunk_map, rb_node);
+ ret = map->start + map->chunk_len;
}
- read_unlock(&em_tree->lock);
+ read_unlock(&fs_info->mapping_tree_lock);
return ret;
}
@@ -2986,6 +2996,81 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
return ret;
}
+struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct rb_node *node = fs_info->mapping_tree.rb_root.rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *orig_prev;
+ struct btrfs_chunk_map *map;
+ struct btrfs_chunk_map *prev_map = NULL;
+
+ while (node) {
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ prev = node;
+ prev_map = map;
+
+ if (logical < map->start) {
+ node = node->rb_left;
+ } else if (logical >= map->start + map->chunk_len) {
+ node = node->rb_right;
+ } else {
+ refcount_inc(&map->refs);
+ return map;
+ }
+ }
+
+ if (!prev)
+ return NULL;
+
+ orig_prev = prev;
+ while (prev && logical >= prev_map->start + prev_map->chunk_len) {
+ prev = rb_next(prev);
+ prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+ }
+
+ if (!prev) {
+ prev = orig_prev;
+ prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+ while (prev && logical < prev_map->start) {
+ prev = rb_prev(prev);
+ prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+ }
+ }
+
+ if (prev) {
+ u64 end = logical + length;
+
+ /*
+ * Caller can pass a U64_MAX length when it wants to get any
+ * chunk starting at an offset of 'logical' or higher, so deal
+ * with underflow by resetting the end offset to U64_MAX.
+ */
+ if (end < logical)
+ end = U64_MAX;
+
+ if (end > prev_map->start &&
+ logical < prev_map->start + prev_map->chunk_len) {
+ refcount_inc(&prev_map->refs);
+ return prev_map;
+ }
+ }
+
+ return NULL;
+}
+
+struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct btrfs_chunk_map *map;
+
+ read_lock(&fs_info->mapping_tree_lock);
+ map = btrfs_find_chunk_map_nolock(fs_info, logical, length);
+ read_unlock(&fs_info->mapping_tree_lock);
+
+ return map;
+}
+
/*
* Find the mapping containing the given logical extent.
*
@@ -2994,38 +3079,35 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
*
* Return: Chunk mapping or ERR_PTR.
*/
-struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length)
+struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
{
- struct extent_map_tree *em_tree;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
- em_tree = &fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, length);
- read_unlock(&em_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, logical, length);
- if (!em) {
+ if (unlikely(!map)) {
btrfs_crit(fs_info,
"unable to find chunk map for logical %llu length %llu",
logical, length);
return ERR_PTR(-EINVAL);
}
- if (em->start > logical || em->start + em->len <= logical) {
+ if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) {
btrfs_crit(fs_info,
"found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
- logical, logical + length, em->start, em->start + em->len);
- free_extent_map(em);
+ logical, logical + length, map->start,
+ map->start + map->chunk_len);
+ btrfs_free_chunk_map(map);
return ERR_PTR(-EINVAL);
}
- /* callers are responsible for dropping em's ref. */
- return em;
+ /* Callers are responsible for dropping the reference. */
+ return map;
}
static int remove_chunk_item(struct btrfs_trans_handle *trans,
- struct map_lookup *map, u64 chunk_offset)
+ struct btrfs_chunk_map *map, u64 chunk_offset)
{
int i;
@@ -3050,23 +3132,21 @@ static int remove_chunk_item(struct btrfs_trans_handle *trans,
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 dev_extent_len = 0;
int i, ret = 0;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
- if (IS_ERR(em)) {
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(map)) {
/*
* This is a logic error, but we don't want to just rely on the
* user having built with ASSERT enabled, so if ASSERT doesn't
* do anything we still error out.
*/
ASSERT(0);
- return PTR_ERR(em);
+ return PTR_ERR(map);
}
- map = em->map_lookup;
/*
* First delete the device extent items from the devices btree.
@@ -3169,7 +3249,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
goto out;
}
- trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
+ trace_btrfs_chunk_free(fs_info, map, chunk_offset, map->chunk_len);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
@@ -3188,7 +3268,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
*/
btrfs_trans_release_chunk_metadata(trans);
- ret = btrfs_remove_block_group(trans, chunk_offset, em);
+ ret = btrfs_remove_block_group(trans, map);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3200,7 +3280,7 @@ out:
trans->removing_chunk = false;
}
/* once for us */
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -5347,24 +5427,131 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
}
}
+static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int bits)
+{
+ for (int i = 0; i < map->num_stripes; i++) {
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
+ struct btrfs_device *device = stripe->dev;
+
+ set_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + map->stripe_size - 1,
+ bits | EXTENT_NOWAIT, NULL);
+ }
+}
+
+static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
+{
+ for (int i = 0; i < map->num_stripes; i++) {
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
+ struct btrfs_device *device = stripe->dev;
+
+ __clear_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + map->stripe_size - 1,
+ bits | EXTENT_NOWAIT,
+ NULL, NULL);
+ }
+}
+
+void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
+{
+ write_lock(&fs_info->mapping_tree_lock);
+ rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
+ RB_CLEAR_NODE(&map->rb_node);
+ chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
+ write_unlock(&fs_info->mapping_tree_lock);
+
+ /* Once for the tree reference. */
+ btrfs_free_chunk_map(map);
+}
+
+EXPORT_FOR_TESTS
+int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ bool leftmost = true;
+
+ write_lock(&fs_info->mapping_tree_lock);
+ p = &fs_info->mapping_tree.rb_root.rb_node;
+ while (*p) {
+ struct btrfs_chunk_map *entry;
+
+ parent = *p;
+ entry = rb_entry(parent, struct btrfs_chunk_map, rb_node);
+
+ if (map->start < entry->start) {
+ p = &(*p)->rb_left;
+ } else if (map->start > entry->start) {
+ p = &(*p)->rb_right;
+ leftmost = false;
+ } else {
+ write_unlock(&fs_info->mapping_tree_lock);
+ return -EEXIST;
+ }
+ }
+ rb_link_node(&map->rb_node, parent, p);
+ rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost);
+ chunk_map_device_set_bits(map, CHUNK_ALLOCATED);
+ chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
+ write_unlock(&fs_info->mapping_tree_lock);
+
+ return 0;
+}
+
+EXPORT_FOR_TESTS
+struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp)
+{
+ struct btrfs_chunk_map *map;
+
+ map = kmalloc(btrfs_chunk_map_size(num_stripes), gfp);
+ if (!map)
+ return NULL;
+
+ refcount_set(&map->refs, 1);
+ RB_CLEAR_NODE(&map->rb_node);
+
+ return map;
+}
+
+struct btrfs_chunk_map *btrfs_clone_chunk_map(struct btrfs_chunk_map *map, gfp_t gfp)
+{
+ const int size = btrfs_chunk_map_size(map->num_stripes);
+ struct btrfs_chunk_map *clone;
+
+ clone = kmemdup(map, size, gfp);
+ if (!clone)
+ return NULL;
+
+ refcount_set(&clone->refs, 1);
+ RB_CLEAR_NODE(&clone->rb_node);
+
+ return clone;
+}
+
static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
struct alloc_chunk_ctl *ctl,
struct btrfs_device_info *devices_info)
{
struct btrfs_fs_info *info = trans->fs_info;
- struct map_lookup *map = NULL;
- struct extent_map_tree *em_tree;
+ struct btrfs_chunk_map *map;
struct btrfs_block_group *block_group;
- struct extent_map *em;
u64 start = ctl->start;
u64 type = ctl->type;
int ret;
int i;
int j;
- map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
+ map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS);
if (!map)
return ERR_PTR(-ENOMEM);
+
+ map->start = start;
+ map->chunk_len = ctl->chunk_size;
+ map->stripe_size = ctl->stripe_size;
+ map->type = type;
+ map->io_align = BTRFS_STRIPE_LEN;
+ map->io_width = BTRFS_STRIPE_LEN;
+ map->sub_stripes = ctl->sub_stripes;
map->num_stripes = ctl->num_stripes;
for (i = 0; i < ctl->ndevs; ++i) {
@@ -5375,41 +5562,22 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
j * ctl->stripe_size;
}
}
- map->io_align = BTRFS_STRIPE_LEN;
- map->io_width = BTRFS_STRIPE_LEN;
- map->type = type;
- map->sub_stripes = ctl->sub_stripes;
trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
- em = alloc_extent_map();
- if (!em) {
- kfree(map);
- return ERR_PTR(-ENOMEM);
- }
- set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
- em->map_lookup = map;
- em->start = start;
- em->len = ctl->chunk_size;
- em->block_start = 0;
- em->block_len = em->len;
- em->orig_block_len = ctl->stripe_size;
-
- em_tree = &info->mapping_tree;
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_chunk_map(info, map);
if (ret) {
- write_unlock(&em_tree->lock);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ERR_PTR(ret);
}
- write_unlock(&em_tree->lock);
block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size);
- if (IS_ERR(block_group))
- goto error_del_extent;
+ if (IS_ERR(block_group)) {
+ btrfs_remove_chunk_map(info, map);
+ return block_group;
+ }
- for (i = 0; i < map->num_stripes; i++) {
+ for (int i = 0; i < map->num_stripes; i++) {
struct btrfs_device *dev = map->stripes[i].dev;
btrfs_device_set_bytes_used(dev,
@@ -5422,23 +5590,10 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
atomic64_sub(ctl->stripe_size * map->num_stripes,
&info->free_chunk_space);
- free_extent_map(em);
check_raid56_incompat_flag(info, type);
check_raid1c34_incompat_flag(info, type);
return block_group;
-
-error_del_extent:
- write_lock(&em_tree->lock);
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
-
- /* One for our allocation */
- free_extent_map(em);
- /* One for the tree reference */
- free_extent_map(em);
-
- return block_group;
}
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
@@ -5514,8 +5669,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_key key;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
size_t item_size;
int i;
int ret;
@@ -5544,14 +5698,13 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
*/
lockdep_assert_held(&fs_info->chunk_mutex);
- em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+ map = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
+ if (IS_ERR(map)) {
+ ret = PTR_ERR(map);
btrfs_abort_transaction(trans, ret);
return ret;
}
- map = em->map_lookup;
item_size = btrfs_chunk_item_size(map->num_stripes);
chunk = kzalloc(item_size, GFP_NOFS);
@@ -5608,7 +5761,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
out:
kfree(chunk);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -5653,7 +5806,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
return 0;
}
-static inline int btrfs_chunk_max_errors(struct map_lookup *map)
+static inline int btrfs_chunk_max_errors(struct btrfs_chunk_map *map)
{
const int index = btrfs_bg_flags_to_raid_index(map->type);
@@ -5662,17 +5815,15 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map)
bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
int miss_ndevs = 0;
int i;
bool ret = true;
- em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
- if (IS_ERR(em))
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(map))
return false;
- map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
if (test_bit(BTRFS_DEV_STATE_MISSING,
&map->stripes[i].dev->dev_state)) {
@@ -5693,38 +5844,37 @@ bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
if (miss_ndevs > btrfs_chunk_max_errors(map))
ret = false;
end:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
-void btrfs_mapping_tree_free(struct extent_map_tree *tree)
+void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info)
{
- struct extent_map *em;
+ write_lock(&fs_info->mapping_tree_lock);
+ while (!RB_EMPTY_ROOT(&fs_info->mapping_tree.rb_root)) {
+ struct btrfs_chunk_map *map;
+ struct rb_node *node;
- while (1) {
- write_lock(&tree->lock);
- em = lookup_extent_mapping(tree, 0, (u64)-1);
- if (em)
- remove_extent_mapping(tree, em);
- write_unlock(&tree->lock);
- if (!em)
- break;
- /* once for us */
- free_extent_map(em);
- /* once for the tree */
- free_extent_map(em);
+ node = rb_first_cached(&fs_info->mapping_tree);
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
+ RB_CLEAR_NODE(&map->rb_node);
+ chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
+ /* Once for the tree ref. */
+ btrfs_free_chunk_map(map);
+ cond_resched_rwlock_write(&fs_info->mapping_tree_lock);
}
+ write_unlock(&fs_info->mapping_tree_lock);
}
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
enum btrfs_raid_types index;
int ret = 1;
- em = btrfs_get_chunk_map(fs_info, logical, len);
- if (IS_ERR(em))
+ map = btrfs_get_chunk_map(fs_info, logical, len);
+ if (IS_ERR(map))
/*
* We could return errors for these cases, but that could get
* ugly and we'd probably do the same thing which is just not do
@@ -5733,7 +5883,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
*/
return 1;
- map = em->map_lookup;
index = btrfs_bg_flags_to_raid_index(map->type);
/* Non-RAID56, use their ncopies from btrfs_raid_array. */
@@ -5750,53 +5899,49 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
* stripe under reconstruction.
*/
ret = map->num_stripes;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
unsigned long len = fs_info->sectorsize;
if (!btrfs_fs_incompat(fs_info, RAID56))
return len;
- em = btrfs_get_chunk_map(fs_info, logical, len);
+ map = btrfs_get_chunk_map(fs_info, logical, len);
- if (!WARN_ON(IS_ERR(em))) {
- map = em->map_lookup;
+ if (!WARN_ON(IS_ERR(map))) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
len = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
}
return len;
}
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
int ret = 0;
if (!btrfs_fs_incompat(fs_info, RAID56))
return 0;
- em = btrfs_get_chunk_map(fs_info, logical, len);
+ map = btrfs_get_chunk_map(fs_info, logical, len);
- if(!WARN_ON(IS_ERR(em))) {
- map = em->map_lookup;
+ if (!WARN_ON(IS_ERR(map))) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
ret = 1;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
}
return ret;
}
static int find_live_mirror(struct btrfs_fs_info *fs_info,
- struct map_lookup *map, int first,
+ struct btrfs_chunk_map *map, int first,
int dev_replace_is_ongoing)
{
int i;
@@ -5903,8 +6048,7 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length_ret,
u32 *num_stripes)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
struct btrfs_discard_stripe *stripes;
u64 length = *length_ret;
u64 offset;
@@ -5922,11 +6066,9 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
int ret;
int i;
- em = btrfs_get_chunk_map(fs_info, logical, length);
- if (IS_ERR(em))
- return ERR_CAST(em);
-
- map = em->map_lookup;
+ map = btrfs_get_chunk_map(fs_info, logical, length);
+ if (IS_ERR(map))
+ return ERR_CAST(map);
/* we don't discard raid56 yet */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
@@ -5934,8 +6076,8 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
goto out_free_map;
}
- offset = logical - em->start;
- length = min_t(u64, em->start + em->len - logical, length);
+ offset = logical - map->start;
+ length = min_t(u64, map->start + map->chunk_len - logical, length);
*length_ret = length;
/*
@@ -6032,10 +6174,10 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
}
}
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return stripes;
out_free_map:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ERR_PTR(ret);
}
@@ -6133,17 +6275,16 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
bioc->replace_nr_stripes = nr_extra_stripes;
}
-static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
- u64 offset, u32 *stripe_nr, u64 *stripe_offset,
- u64 *full_stripe_start)
+static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset,
+ struct btrfs_io_geometry *io_geom)
{
/*
* Stripe_nr is the stripe where this block falls. stripe_offset is
* the offset of this block in its stripe.
*/
- *stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
- *stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
- ASSERT(*stripe_offset < U32_MAX);
+ io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
+ io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
+ ASSERT(io_geom->stripe_offset < U32_MAX);
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
unsigned long full_stripe_len =
@@ -6158,18 +6299,17 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
* to go rounddown(), not round_down(), as nr_data_stripes is
* not ensured to be power of 2.
*/
- *full_stripe_start =
- btrfs_stripe_nr_to_offset(
- rounddown(*stripe_nr, nr_data_stripes(map)));
+ io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset(
+ rounddown(io_geom->stripe_nr, nr_data_stripes(map)));
- ASSERT(*full_stripe_start + full_stripe_len > offset);
- ASSERT(*full_stripe_start <= offset);
+ ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset);
+ ASSERT(io_geom->raid56_full_stripe_start <= offset);
/*
* For writes to RAID56, allow to write a full stripe set, but
* no straddling of stripe sets.
*/
- if (op == BTRFS_MAP_WRITE)
- return full_stripe_len - (offset - *full_stripe_start);
+ if (io_geom->op == BTRFS_MAP_WRITE)
+ return full_stripe_len - (offset - io_geom->raid56_full_stripe_start);
}
/*
@@ -6177,26 +6317,180 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
* a single disk).
*/
if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK)
- return BTRFS_STRIPE_LEN - *stripe_offset;
+ return BTRFS_STRIPE_LEN - io_geom->stripe_offset;
return U64_MAX;
}
-static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
- u64 logical, u64 *length, struct btrfs_io_stripe *dst,
- struct map_lookup *map, u32 stripe_index,
- u64 stripe_offset, u64 stripe_nr)
+static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 *length, struct btrfs_io_stripe *dst,
+ struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
{
- dst->dev = map->stripes[stripe_index].dev;
+ dst->dev = map->stripes[io_geom->stripe_index].dev;
- if (op == BTRFS_MAP_READ && btrfs_need_stripe_tree_update(fs_info, map->type))
+ if (io_geom->op == BTRFS_MAP_READ &&
+ btrfs_need_stripe_tree_update(fs_info, map->type))
return btrfs_get_raid_extent_offset(fs_info, logical, length,
- map->type, stripe_index, dst);
+ map->type,
+ io_geom->stripe_index, dst);
- dst->physical = map->stripes[stripe_index].physical +
- stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
+ dst->physical = map->stripes[io_geom->stripe_index].physical +
+ io_geom->stripe_offset +
+ btrfs_stripe_nr_to_offset(io_geom->stripe_nr);
return 0;
}
+static bool is_single_device_io(struct btrfs_fs_info *fs_info,
+ const struct btrfs_io_stripe *smap,
+ const struct btrfs_chunk_map *map,
+ int num_alloc_stripes,
+ enum btrfs_map_op op, int mirror_num)
+{
+ if (!smap)
+ return false;
+
+ if (num_alloc_stripes != 1)
+ return false;
+
+ if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ)
+ return false;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)
+ return false;
+
+ return true;
+}
+
+static void map_blocks_raid0(const struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
+{
+ io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes;
+ io_geom->stripe_nr /= map->num_stripes;
+ if (io_geom->op == BTRFS_MAP_READ)
+ io_geom->mirror_num = 1;
+}
+
+static void map_blocks_raid1(struct btrfs_fs_info *fs_info,
+ struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom,
+ bool dev_replace_is_ongoing)
+{
+ if (io_geom->op != BTRFS_MAP_READ) {
+ io_geom->num_stripes = map->num_stripes;
+ return;
+ }
+
+ if (io_geom->mirror_num) {
+ io_geom->stripe_index = io_geom->mirror_num - 1;
+ return;
+ }
+
+ io_geom->stripe_index = find_live_mirror(fs_info, map, 0,
+ dev_replace_is_ongoing);
+ io_geom->mirror_num = io_geom->stripe_index + 1;
+}
+
+static void map_blocks_dup(const struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
+{
+ if (io_geom->op != BTRFS_MAP_READ) {
+ io_geom->num_stripes = map->num_stripes;
+ return;
+ }
+
+ if (io_geom->mirror_num) {
+ io_geom->stripe_index = io_geom->mirror_num - 1;
+ return;
+ }
+
+ io_geom->mirror_num = 1;
+}
+
+static void map_blocks_raid10(struct btrfs_fs_info *fs_info,
+ struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom,
+ bool dev_replace_is_ongoing)
+{
+ u32 factor = map->num_stripes / map->sub_stripes;
+ int old_stripe_index;
+
+ io_geom->stripe_index = (io_geom->stripe_nr % factor) * map->sub_stripes;
+ io_geom->stripe_nr /= factor;
+
+ if (io_geom->op != BTRFS_MAP_READ) {
+ io_geom->num_stripes = map->sub_stripes;
+ return;
+ }
+
+ if (io_geom->mirror_num) {
+ io_geom->stripe_index += io_geom->mirror_num - 1;
+ return;
+ }
+
+ old_stripe_index = io_geom->stripe_index;
+ io_geom->stripe_index = find_live_mirror(fs_info, map,
+ io_geom->stripe_index,
+ dev_replace_is_ongoing);
+ io_geom->mirror_num = io_geom->stripe_index - old_stripe_index + 1;
+}
+
+static void map_blocks_raid56_write(struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom,
+ u64 logical, u64 *length)
+{
+ int data_stripes = nr_data_stripes(map);
+
+ /*
+ * Needs full stripe mapping.
+ *
+ * Push stripe_nr back to the start of the full stripe For those cases
+ * needing a full stripe, @stripe_nr is the full stripe number.
+ *
+ * Originally we go raid56_full_stripe_start / full_stripe_len, but
+ * that can be expensive. Here we just divide @stripe_nr with
+ * @data_stripes.
+ */
+ io_geom->stripe_nr /= data_stripes;
+
+ /* RAID[56] write or recovery. Return all stripes */
+ io_geom->num_stripes = map->num_stripes;
+ io_geom->max_errors = btrfs_chunk_max_errors(map);
+
+ /* Return the length to the full stripe end. */
+ *length = min(logical + *length,
+ io_geom->raid56_full_stripe_start + map->start +
+ btrfs_stripe_nr_to_offset(data_stripes)) -
+ logical;
+ io_geom->stripe_index = 0;
+ io_geom->stripe_offset = 0;
+}
+
+static void map_blocks_raid56_read(struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
+{
+ int data_stripes = nr_data_stripes(map);
+
+ ASSERT(io_geom->mirror_num <= 1);
+ /* Just grab the data stripe directly. */
+ io_geom->stripe_index = io_geom->stripe_nr % data_stripes;
+ io_geom->stripe_nr /= data_stripes;
+
+ /* We distribute the parity blocks across stripes. */
+ io_geom->stripe_index =
+ (io_geom->stripe_nr + io_geom->stripe_index) % map->num_stripes;
+
+ if (io_geom->op == BTRFS_MAP_READ && io_geom->mirror_num < 1)
+ io_geom->mirror_num = 1;
+}
+
+static void map_blocks_single(const struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
+{
+ io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes;
+ io_geom->stripe_nr /= map->num_stripes;
+ io_geom->mirror_num = io_geom->stripe_index + 1;
+}
+
/*
* Map one logical range to one or more physical ranges.
*
@@ -6237,43 +6531,37 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
struct btrfs_io_context **bioc_ret,
struct btrfs_io_stripe *smap, int *mirror_num_ret)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
+ struct btrfs_io_geometry io_geom = { 0 };
u64 map_offset;
- u64 stripe_offset;
- u32 stripe_nr;
- u32 stripe_index;
- int data_stripes;
int i;
int ret = 0;
- int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
- int num_stripes;
int num_copies;
- int max_errors = 0;
struct btrfs_io_context *bioc = NULL;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int dev_replace_is_ongoing = 0;
u16 num_alloc_stripes;
- u64 raid56_full_stripe_start = (u64)-1;
u64 max_len;
ASSERT(bioc_ret);
+ io_geom.mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
+ io_geom.num_stripes = 1;
+ io_geom.stripe_index = 0;
+ io_geom.op = op;
+
num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize);
- if (mirror_num > num_copies)
+ if (io_geom.mirror_num > num_copies)
return -EINVAL;
- em = btrfs_get_chunk_map(fs_info, logical, *length);
- if (IS_ERR(em))
- return PTR_ERR(em);
+ map = btrfs_get_chunk_map(fs_info, logical, *length);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
- map = em->map_lookup;
- data_stripes = nr_data_stripes(map);
-
- map_offset = logical - em->start;
- max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr,
- &stripe_offset, &raid56_full_stripe_start);
- *length = min_t(u64, em->len - map_offset, max_len);
+ map_offset = logical - map->start;
+ io_geom.raid56_full_stripe_start = (u64)-1;
+ max_len = btrfs_max_io_len(map, map_offset, &io_geom);
+ *length = min_t(u64, map->chunk_len - map_offset, max_len);
down_read(&dev_replace->rwsem);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
@@ -6284,107 +6572,46 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
if (!dev_replace_is_ongoing)
up_read(&dev_replace->rwsem);
- num_stripes = 1;
- stripe_index = 0;
- if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- stripe_index = stripe_nr % map->num_stripes;
- stripe_nr /= map->num_stripes;
- if (op == BTRFS_MAP_READ)
- mirror_num = 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
- if (op != BTRFS_MAP_READ) {
- num_stripes = map->num_stripes;
- } else if (mirror_num) {
- stripe_index = mirror_num - 1;
- } else {
- stripe_index = find_live_mirror(fs_info, map, 0,
- dev_replace_is_ongoing);
- mirror_num = stripe_index + 1;
- }
-
- } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- if (op != BTRFS_MAP_READ) {
- num_stripes = map->num_stripes;
- } else if (mirror_num) {
- stripe_index = mirror_num - 1;
- } else {
- mirror_num = 1;
- }
-
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- u32 factor = map->num_stripes / map->sub_stripes;
-
- stripe_index = (stripe_nr % factor) * map->sub_stripes;
- stripe_nr /= factor;
-
- if (op != BTRFS_MAP_READ)
- num_stripes = map->sub_stripes;
- else if (mirror_num)
- stripe_index += mirror_num - 1;
- else {
- int old_stripe_index = stripe_index;
- stripe_index = find_live_mirror(fs_info, map,
- stripe_index,
- dev_replace_is_ongoing);
- mirror_num = stripe_index - old_stripe_index + 1;
- }
-
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- if (op != BTRFS_MAP_READ || mirror_num > 1) {
- /*
- * Needs full stripe mapping.
- *
- * Push stripe_nr back to the start of the full stripe
- * For those cases needing a full stripe, @stripe_nr
- * is the full stripe number.
- *
- * Originally we go raid56_full_stripe_start / full_stripe_len,
- * but that can be expensive. Here we just divide
- * @stripe_nr with @data_stripes.
- */
- stripe_nr /= data_stripes;
-
- /* RAID[56] write or recovery. Return all stripes */
- num_stripes = map->num_stripes;
- max_errors = btrfs_chunk_max_errors(map);
-
- /* Return the length to the full stripe end */
- *length = min(logical + *length,
- raid56_full_stripe_start + em->start +
- btrfs_stripe_nr_to_offset(data_stripes)) -
- logical;
- stripe_index = 0;
- stripe_offset = 0;
- } else {
- ASSERT(mirror_num <= 1);
- /* Just grab the data stripe directly. */
- stripe_index = stripe_nr % data_stripes;
- stripe_nr /= data_stripes;
-
- /* We distribute the parity blocks across stripes */
- stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
- if (op == BTRFS_MAP_READ && mirror_num < 1)
- mirror_num = 1;
- }
- } else {
+ switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ case BTRFS_BLOCK_GROUP_RAID0:
+ map_blocks_raid0(map, &io_geom);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID1:
+ case BTRFS_BLOCK_GROUP_RAID1C3:
+ case BTRFS_BLOCK_GROUP_RAID1C4:
+ map_blocks_raid1(fs_info, map, &io_geom, dev_replace_is_ongoing);
+ break;
+ case BTRFS_BLOCK_GROUP_DUP:
+ map_blocks_dup(map, &io_geom);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID10:
+ map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID5:
+ case BTRFS_BLOCK_GROUP_RAID6:
+ if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)
+ map_blocks_raid56_write(map, &io_geom, logical, length);
+ else
+ map_blocks_raid56_read(map, &io_geom);
+ break;
+ default:
/*
* After this, stripe_nr is the number of stripes on this
* device we have to walk to find the data, and stripe_index is
* the number of our device in the stripe array
*/
- stripe_index = stripe_nr % map->num_stripes;
- stripe_nr /= map->num_stripes;
- mirror_num = stripe_index + 1;
+ map_blocks_single(map, &io_geom);
+ break;
}
- if (stripe_index >= map->num_stripes) {
+ if (io_geom.stripe_index >= map->num_stripes) {
btrfs_crit(fs_info,
"stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
- stripe_index, map->num_stripes);
+ io_geom.stripe_index, map->num_stripes);
ret = -EINVAL;
goto out;
}
- num_alloc_stripes = num_stripes;
+ num_alloc_stripes = io_geom.num_stripes;
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
op != BTRFS_MAP_READ)
/*
@@ -6401,14 +6628,11 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* physical block information on the stack instead of allocating an
* I/O context structure.
*/
- if (smap && num_alloc_stripes == 1 &&
- !(btrfs_need_stripe_tree_update(fs_info, map->type) &&
- op != BTRFS_MAP_READ) &&
- !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) {
- ret = set_io_stripe(fs_info, op, logical, length, smap, map,
- stripe_index, stripe_offset, stripe_nr);
+ if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op,
+ io_geom.mirror_num)) {
+ ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom);
if (mirror_num_ret)
- *mirror_num_ret = mirror_num;
+ *mirror_num_ret = io_geom.mirror_num;
*bioc_ret = NULL;
goto out;
}
@@ -6428,7 +6652,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* It's still mostly the same as other profiles, just with extra rotation.
*/
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
- (op != BTRFS_MAP_READ || mirror_num > 1)) {
+ (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)) {
/*
* For RAID56 @stripe_nr is already the number of full stripes
* before us, which is also the rotation value (needs to modulo
@@ -6437,28 +6661,31 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* In this case, we just add @stripe_nr with @i, then do the
* modulo, to reduce one modulo call.
*/
- bioc->full_stripe_logical = em->start +
- btrfs_stripe_nr_to_offset(stripe_nr * data_stripes);
- for (int i = 0; i < num_stripes; i++) {
- ret = set_io_stripe(fs_info, op, logical, length,
- &bioc->stripes[i], map,
- (i + stripe_nr) % num_stripes,
- stripe_offset, stripe_nr);
- if (ret < 0)
- break;
+ bioc->full_stripe_logical = map->start +
+ btrfs_stripe_nr_to_offset(io_geom.stripe_nr *
+ nr_data_stripes(map));
+ for (int i = 0; i < io_geom.num_stripes; i++) {
+ struct btrfs_io_stripe *dst = &bioc->stripes[i];
+ u32 stripe_index;
+
+ stripe_index = (i + io_geom.stripe_nr) % io_geom.num_stripes;
+ dst->dev = map->stripes[stripe_index].dev;
+ dst->physical =
+ map->stripes[stripe_index].physical +
+ io_geom.stripe_offset +
+ btrfs_stripe_nr_to_offset(io_geom.stripe_nr);
}
} else {
/*
* For all other non-RAID56 profiles, just copy the target
* stripe into the bioc.
*/
- for (i = 0; i < num_stripes; i++) {
- ret = set_io_stripe(fs_info, op, logical, length,
- &bioc->stripes[i], map, stripe_index,
- stripe_offset, stripe_nr);
+ for (i = 0; i < io_geom.num_stripes; i++) {
+ ret = set_io_stripe(fs_info, logical, length,
+ &bioc->stripes[i], map, &io_geom);
if (ret < 0)
break;
- stripe_index++;
+ io_geom.stripe_index++;
}
}
@@ -6469,18 +6696,18 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
}
if (op != BTRFS_MAP_READ)
- max_errors = btrfs_chunk_max_errors(map);
+ io_geom.max_errors = btrfs_chunk_max_errors(map);
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
op != BTRFS_MAP_READ) {
handle_ops_on_dev_replace(op, bioc, dev_replace, logical,
- &num_stripes, &max_errors);
+ &io_geom.num_stripes, &io_geom.max_errors);
}
*bioc_ret = bioc;
- bioc->num_stripes = num_stripes;
- bioc->max_errors = max_errors;
- bioc->mirror_num = mirror_num;
+ bioc->num_stripes = io_geom.num_stripes;
+ bioc->max_errors = io_geom.max_errors;
+ bioc->mirror_num = io_geom.mirror_num;
out:
if (dev_replace_is_ongoing) {
@@ -6488,7 +6715,7 @@ out:
/* Unlock and let waiting writers proceed */
up_read(&dev_replace->rwsem);
}
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -6660,12 +6887,11 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
devid, uuid);
}
-u64 btrfs_calc_stripe_length(const struct extent_map *em)
+u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map)
{
- const struct map_lookup *map = em->map_lookup;
const int data_stripes = calc_data_stripes(map->type, map->num_stripes);
- return div_u64(em->len, data_stripes);
+ return div_u64(map->chunk_len, data_stripes);
}
#if BITS_PER_LONG == 32
@@ -6734,9 +6960,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
{
BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = leaf->fs_info;
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct map_lookup *map;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
u64 logical;
u64 length;
u64 devid;
@@ -6770,35 +6994,22 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
return ret;
}
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, logical, 1);
- read_unlock(&map_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, logical, 1);
/* already mapped? */
- if (em && em->start <= logical && em->start + em->len > logical) {
- free_extent_map(em);
+ if (map && map->start <= logical && map->start + map->chunk_len > logical) {
+ btrfs_free_chunk_map(map);
return 0;
- } else if (em) {
- free_extent_map(em);
+ } else if (map) {
+ btrfs_free_chunk_map(map);
}
- em = alloc_extent_map();
- if (!em)
- return -ENOMEM;
- map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
- if (!map) {
- free_extent_map(em);
+ map = btrfs_alloc_chunk_map(num_stripes, GFP_NOFS);
+ if (!map)
return -ENOMEM;
- }
-
- set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
- em->map_lookup = map;
- em->start = logical;
- em->len = length;
- em->orig_start = 0;
- em->block_start = 0;
- em->block_len = em->len;
+ map->start = logical;
+ map->chunk_len = length;
map->num_stripes = num_stripes;
map->io_width = btrfs_chunk_io_width(leaf, chunk);
map->io_align = btrfs_chunk_io_align(leaf, chunk);
@@ -6813,7 +7024,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
*/
map->sub_stripes = btrfs_raid_array[index].sub_stripes;
map->verified_stripes = 0;
- em->orig_block_len = btrfs_calc_stripe_length(em);
+ map->stripe_size = btrfs_calc_stripe_length(map);
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
@@ -6829,7 +7040,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
devid, uuid);
if (IS_ERR(map->stripes[i].dev)) {
ret = PTR_ERR(map->stripes[i].dev);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
}
@@ -6838,15 +7049,12 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
&(map->stripes[i].dev->dev_state));
}
- write_lock(&map_tree->lock);
- ret = add_extent_mapping(map_tree, em, 0);
- write_unlock(&map_tree->lock);
+ ret = btrfs_add_chunk_map(fs_info, map);
if (ret < 0) {
btrfs_err(fs_info,
"failed to add chunk map, start=%llu len=%llu: %d",
- em->start, em->len, ret);
+ map->start, map->chunk_len, ret);
}
- free_extent_map(em);
return ret;
}
@@ -7156,26 +7364,21 @@ out_short_read:
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev)
{
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- u64 next_start = 0;
+ struct btrfs_chunk_map *map;
+ u64 next_start;
bool ret = true;
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, 0, (u64)-1);
- read_unlock(&map_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, 0, U64_MAX);
/* No chunk at all? Return false anyway */
- if (!em) {
+ if (!map) {
ret = false;
goto out;
}
- while (em) {
- struct map_lookup *map;
+ while (map) {
int missing = 0;
int max_tolerated;
int i;
- map = em->map_lookup;
max_tolerated =
btrfs_get_num_tolerated_disk_barrier_failures(
map->type);
@@ -7193,18 +7396,15 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
if (!failing_dev)
btrfs_warn(fs_info,
"chunk %llu missing %d devices, max tolerance is %d for writable mount",
- em->start, missing, max_tolerated);
- free_extent_map(em);
+ map->start, missing, max_tolerated);
+ btrfs_free_chunk_map(map);
ret = false;
goto out;
}
- next_start = extent_map_end(em);
- free_extent_map(em);
+ next_start = map->start + map->chunk_len;
+ btrfs_free_chunk_map(map);
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, next_start,
- (u64)(-1) - next_start);
- read_unlock(&map_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start);
}
out:
return ret;
@@ -7697,20 +7897,15 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
u64 physical_offset, u64 physical_len)
{
struct btrfs_dev_lookup_args args = { .devid = devid };
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
struct btrfs_device *dev;
u64 stripe_len;
bool found = false;
int ret = 0;
int i;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- read_unlock(&em_tree->lock);
-
- if (!em) {
+ map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
+ if (!map) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
physical_offset, devid);
@@ -7718,12 +7913,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
goto out;
}
- map = em->map_lookup;
- stripe_len = btrfs_calc_stripe_length(em);
+ stripe_len = btrfs_calc_stripe_length(map);
if (physical_len != stripe_len) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
- physical_offset, devid, em->start, physical_len,
+ physical_offset, devid, map->start, physical_len,
stripe_len);
ret = -EUCLEAN;
goto out;
@@ -7746,7 +7940,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
if (map->verified_stripes >= map->num_stripes) {
btrfs_err(fs_info,
"too many dev extents for chunk %llu found",
- em->start);
+ map->start);
ret = -EUCLEAN;
goto out;
}
@@ -7792,32 +7986,30 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
out:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
struct rb_node *node;
int ret = 0;
- read_lock(&em_tree->lock);
- for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
- em = rb_entry(node, struct extent_map, rb_node);
- if (em->map_lookup->num_stripes !=
- em->map_lookup->verified_stripes) {
+ read_lock(&fs_info->mapping_tree_lock);
+ for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
+ struct btrfs_chunk_map *map;
+
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ if (map->num_stripes != map->verified_stripes) {
btrfs_err(fs_info,
"chunk %llu has missing dev extent, have %d expect %d",
- em->start, em->map_lookup->verified_stripes,
- em->map_lookup->num_stripes);
+ map->start, map->verified_stripes, map->num_stripes);
ret = -EUCLEAN;
goto out;
}
}
out:
- read_unlock(&em_tree->lock);
+ read_unlock(&fs_info->mapping_tree_lock);
return ret;
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 9cc374864a79..53f87f398da7 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -426,7 +426,8 @@ struct btrfs_discard_stripe {
struct btrfs_io_context {
refcount_t refs;
struct btrfs_fs_info *fs_info;
- u64 map_type; /* get from map_lookup->type */
+ /* Taken from struct btrfs_chunk_map::type. */
+ u64 map_type;
struct bio *orig_bio;
atomic_t error;
u16 max_errors;
@@ -529,18 +530,32 @@ struct btrfs_raid_attr {
extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
-struct map_lookup {
+struct btrfs_chunk_map {
+ struct rb_node rb_node;
+ /* For mount time dev extent verification. */
+ int verified_stripes;
+ refcount_t refs;
+ u64 start;
+ u64 chunk_len;
+ u64 stripe_size;
u64 type;
int io_align;
int io_width;
int num_stripes;
int sub_stripes;
- int verified_stripes; /* For mount time dev extent verification */
struct btrfs_io_stripe stripes[];
};
-#define map_lookup_size(n) (sizeof(struct map_lookup) + \
- (sizeof(struct btrfs_io_stripe) * (n)))
+#define btrfs_chunk_map_size(n) (sizeof(struct btrfs_chunk_map) + \
+ (sizeof(struct btrfs_io_stripe) * (n)))
+
+static inline void btrfs_free_chunk_map(struct btrfs_chunk_map *map)
+{
+ if (map && refcount_dec_and_test(&map->refs)) {
+ ASSERT(RB_EMPTY_NODE(&map->rb_node));
+ kfree(map);
+ }
+}
struct btrfs_balance_args;
struct btrfs_balance_progress;
@@ -598,7 +613,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
}
/*
- * Do the type safe converstion from stripe_nr to offset inside the chunk.
+ * Do the type safe conversion from stripe_nr to offset inside the chunk.
*
* @stripe_nr is u32, with left shift it can overflow u32 for chunks larger
* than 4G. This does the proper type cast to avoid overflow.
@@ -624,7 +639,7 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
u64 type);
-void btrfs_mapping_tree_free(struct extent_map_tree *tree);
+void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
blk_mode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
@@ -680,13 +695,25 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical);
-u64 btrfs_calc_stripe_length(const struct extent_map *em);
+u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map);
int btrfs_nr_parity_stripes(u64 type);
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
-struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp);
+int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map);
+#endif
+
+struct btrfs_chunk_map *btrfs_clone_chunk_map(struct btrfs_chunk_map *map, gfp_t gfp);
+struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
+struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
+struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
+void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map);
void btrfs_release_disk_super(struct btrfs_super_block *super);
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3cf236fb40a4..6287763fdccc 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -382,6 +382,53 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
return btrfs_setxattr_trans(inode, name, buffer, size, flags);
}
+static int btrfs_xattr_handler_get_security(const struct xattr_handler *handler,
+ struct dentry *unused,
+ struct inode *inode,
+ const char *name, void *buffer,
+ size_t size)
+{
+ int ret;
+ bool is_cap = false;
+
+ name = xattr_full_name(handler, name);
+
+ /*
+ * security.capability doesn't cache the results, so calls into us
+ * constantly to see if there's a capability xattr. Cache the result
+ * here in order to avoid wasting time doing lookups for xattrs we know
+ * don't exist.
+ */
+ if (strcmp(name, XATTR_NAME_CAPS) == 0) {
+ is_cap = true;
+ if (test_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags))
+ return -ENODATA;
+ }
+
+ ret = btrfs_getxattr(inode, name, buffer, size);
+ if (ret == -ENODATA && is_cap)
+ set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+ return ret;
+}
+
+static int btrfs_xattr_handler_set_security(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *unused,
+ struct inode *inode,
+ const char *name,
+ const void *buffer,
+ size_t size, int flags)
+{
+ if (btrfs_root_readonly(BTRFS_I(inode)->root))
+ return -EROFS;
+
+ name = xattr_full_name(handler, name);
+ if (strcmp(name, XATTR_NAME_CAPS) == 0)
+ clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
+ return btrfs_setxattr_trans(inode, name, buffer, size, flags);
+}
+
static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
struct mnt_idmap *idmap,
struct dentry *unused, struct inode *inode,
@@ -420,8 +467,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
static const struct xattr_handler btrfs_security_xattr_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .get = btrfs_xattr_handler_get,
- .set = btrfs_xattr_handler_set,
+ .get = btrfs_xattr_handler_get_security,
+ .set = btrfs_xattr_handler_set_security,
};
static const struct xattr_handler btrfs_trusted_xattr_handler = {
@@ -473,6 +520,10 @@ static int btrfs_initxattrs(struct inode *inode,
}
strcpy(name, XATTR_SECURITY_PREFIX);
strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+
+ if (strcmp(name, XATTR_NAME_CAPS) == 0)
+ clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
err = btrfs_setxattr(trans, inode, name, xattr->value,
xattr->value_len, 0);
kfree(name);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 6c231a116a29..8da66ea699e8 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -121,7 +121,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
- out_page = alloc_page(GFP_NOFS);
+ out_page = btrfs_alloc_compr_page();
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
@@ -200,7 +200,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS);
+ out_page = btrfs_alloc_compr_page();
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
@@ -236,7 +236,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS);
+ out_page = btrfs_alloc_compr_page();
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
@@ -354,18 +354,13 @@ done:
}
int zlib_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0;
int wbits = MAX_WBITS;
- unsigned long bytes_left;
- unsigned long total_out = 0;
- unsigned long pg_offset = 0;
-
- destlen = min_t(unsigned long, destlen, PAGE_SIZE);
- bytes_left = destlen;
+ unsigned long to_copy;
workspace->strm.next_in = data_in;
workspace->strm.avail_in = srclen;
@@ -390,60 +385,30 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in,
return -EIO;
}
- while (bytes_left > 0) {
- unsigned long buf_start;
- unsigned long buf_offset;
- unsigned long bytes;
-
- ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
- if (ret != Z_OK && ret != Z_STREAM_END)
- break;
-
- buf_start = total_out;
- total_out = workspace->strm.total_out;
-
- if (total_out == buf_start) {
- ret = -EIO;
- break;
- }
-
- if (total_out <= start_byte)
- goto next;
-
- if (total_out > start_byte && buf_start < start_byte)
- buf_offset = start_byte - buf_start;
- else
- buf_offset = 0;
-
- bytes = min(PAGE_SIZE - pg_offset,
- PAGE_SIZE - (buf_offset % PAGE_SIZE));
- bytes = min(bytes, bytes_left);
+ /*
+ * Everything (in/out buf) should be at most one sector, there should
+ * be no need to switch any input/output buffer.
+ */
+ ret = zlib_inflate(&workspace->strm, Z_FINISH);
+ to_copy = min(workspace->strm.total_out, destlen);
+ if (ret != Z_STREAM_END)
+ goto out;
- memcpy_to_page(dest_page, pg_offset,
- workspace->buf + buf_offset, bytes);
+ memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy);
- pg_offset += bytes;
- bytes_left -= bytes;
-next:
- workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = workspace->buf_size;
- }
-
- if (ret != Z_STREAM_END && bytes_left != 0)
+out:
+ if (unlikely(to_copy != destlen)) {
+ pr_warn_ratelimited("BTRFS: infalte failed, decompressed=%lu expected=%zu\n",
+ to_copy, destlen);
ret = -EIO;
- else
+ } else {
ret = 0;
+ }
zlib_inflateEnd(&workspace->strm);
- /*
- * this should only happen if zlib returned fewer bytes than we
- * expected. btrfs_get_block is responsible for zeroing from the
- * end of the inline extent (destlen) to the end of the page
- */
- if (pg_offset < destlen) {
- memzero_page(dest_page, pg_offset, destlen - pg_offset);
- }
+ if (unlikely(to_copy < destlen))
+ memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy);
return ret;
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 188378ca19c7..5f750fa53a2b 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -578,26 +578,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
kvfree(zones);
- switch (bdev_zoned_model(bdev)) {
- case BLK_ZONED_HM:
+ if (bdev_is_zoned(bdev)) {
model = "host-managed zoned";
emulated = "";
- break;
- case BLK_ZONED_HA:
- model = "host-aware zoned";
- emulated = "";
- break;
- case BLK_ZONED_NONE:
+ } else {
model = "regular";
emulated = "emulated ";
- break;
- default:
- /* Just in case */
- btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
- bdev_zoned_model(bdev),
- rcu_str_deref(device->name));
- ret = -EOPNOTSUPP;
- goto out_free_zone_info;
}
btrfs_info_in_rcu(fs_info,
@@ -609,9 +595,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
out:
kvfree(zones);
-out_free_zone_info:
btrfs_destroy_dev_zone_info(device);
-
return ret;
}
@@ -688,8 +672,7 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
struct btrfs_device *device;
list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
- if (device->bdev &&
- bdev_zoned_model(device->bdev) == BLK_ZONED_HM) {
+ if (device->bdev && bdev_is_zoned(device->bdev)) {
btrfs_err(fs_info,
"zoned: mode not enabled but zoned device found: %pg",
device->bdev);
@@ -781,7 +764,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
* Check mount options here, because we might change fs_info->zoned
* from fs_info->zone_size.
*/
- ret = btrfs_check_mountopts_zoned(fs_info);
+ ret = btrfs_check_mountopts_zoned(fs_info, &fs_info->mount_opt);
if (ret)
return ret;
@@ -789,7 +772,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
return 0;
}
-int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt)
{
if (!btrfs_is_zoned(info))
return 0;
@@ -798,18 +781,21 @@ int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
* Space cache writing is not COWed. Disable that to avoid write errors
* in sequential zones.
*/
- if (btrfs_test_opt(info, SPACE_CACHE)) {
+ if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
btrfs_err(info, "zoned: space cache v1 is not supported");
return -EINVAL;
}
- if (btrfs_test_opt(info, NODATACOW)) {
+ if (btrfs_raw_test_opt(*mount_opt, NODATACOW)) {
btrfs_err(info, "zoned: NODATACOW not supported");
return -EINVAL;
}
- btrfs_clear_and_info(info, DISCARD_ASYNC,
- "zoned: async discard ignored and disabled for zoned mode");
+ if (btrfs_raw_test_opt(*mount_opt, DISCARD_ASYNC)) {
+ btrfs_info(info,
+ "zoned: async discard ignored and disabled for zoned mode");
+ btrfs_clear_opt(*mount_opt, DISCARD_ASYNC);
+ }
return 0;
}
@@ -1290,7 +1276,7 @@ struct zone_info {
static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
struct zone_info *info, unsigned long *active,
- struct map_lookup *map)
+ struct btrfs_chunk_map *map)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct btrfs_device *device = map->stripes[zone_idx].dev;
@@ -1393,7 +1379,7 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
}
static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct zone_info *zone_info,
unsigned long *active)
{
@@ -1435,7 +1421,7 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
}
static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct zone_info *zone_info,
unsigned long *active)
{
@@ -1483,7 +1469,7 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
}
static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct zone_info *zone_info,
unsigned long *active)
{
@@ -1515,7 +1501,7 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
}
static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct zone_info *zone_info,
unsigned long *active)
{
@@ -1552,9 +1538,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 logical = cache->start;
u64 length = cache->length;
struct zone_info *zone_info = NULL;
@@ -1575,17 +1559,11 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
return -EIO;
}
- /* Get the chunk mapping */
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, length);
- read_unlock(&em_tree->lock);
-
- if (!em)
+ map = btrfs_find_chunk_map(fs_info, logical, length);
+ if (!map)
return -EINVAL;
- map = em->map_lookup;
-
- cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
+ cache->physical_map = btrfs_clone_chunk_map(map, GFP_NOFS);
if (!cache->physical_map) {
ret = -ENOMEM;
goto out;
@@ -1661,6 +1639,15 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
out:
+ /* Reject non SINGLE data profiles without RST */
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) &&
+ (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
+ !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
+ }
+
if (cache->alloc_offset > cache->zone_capacity) {
btrfs_err(fs_info,
"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
@@ -1687,12 +1674,12 @@ out:
spin_unlock(&fs_info->zone_active_bgs_lock);
}
} else {
- kfree(cache->physical_map);
+ btrfs_free_chunk_map(cache->physical_map);
cache->physical_map = NULL;
}
bitmap_free(active);
kfree(zone_info);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -1715,22 +1702,6 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
cache->zone_unusable = unusable;
}
-void btrfs_redirty_list_add(struct btrfs_transaction *trans,
- struct extent_buffer *eb)
-{
- if (!btrfs_is_zoned(eb->fs_info) ||
- btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN))
- return;
-
- ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
-
- memzero_extent_buffer(eb, 0, eb->len);
- set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
- set_extent_buffer_dirty(eb);
- set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1,
- EXTENT_DIRTY, NULL);
-}
-
bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
@@ -2082,7 +2053,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
bool btrfs_zone_activate(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
struct btrfs_device *device;
u64 physical;
const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA);
@@ -2094,6 +2065,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
map = block_group->physical_map;
+ spin_lock(&fs_info->zone_active_bgs_lock);
spin_lock(&block_group->lock);
if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
ret = true;
@@ -2106,7 +2078,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
goto out_unlock;
}
- spin_lock(&fs_info->zone_active_bgs_lock);
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_zoned_device_info *zinfo;
int reserved = 0;
@@ -2126,20 +2097,17 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
*/
if (atomic_read(&zinfo->active_zones_left) <= reserved) {
ret = false;
- spin_unlock(&fs_info->zone_active_bgs_lock);
goto out_unlock;
}
if (!btrfs_dev_set_active_zone(device, physical)) {
/* Cannot activate the zone */
ret = false;
- spin_unlock(&fs_info->zone_active_bgs_lock);
goto out_unlock;
}
if (!is_data)
zinfo->reserved_active_zones--;
}
- spin_unlock(&fs_info->zone_active_bgs_lock);
/* Successfully activated all the zones */
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
@@ -2147,8 +2115,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
/* For the active block group list */
btrfs_get_block_group(block_group);
-
- spin_lock(&fs_info->zone_active_bgs_lock);
list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
spin_unlock(&fs_info->zone_active_bgs_lock);
@@ -2156,6 +2122,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
out_unlock:
spin_unlock(&block_group->lock);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
return ret;
}
@@ -2194,7 +2161,7 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
const bool is_metadata = (block_group->flags &
(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
int ret = 0;
@@ -2643,7 +2610,7 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
/* Release reservation for currently active block groups. */
spin_lock(&fs_info->zone_active_bgs_lock);
list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
- struct map_lookup *map = block_group->physical_map;
+ struct btrfs_chunk_map *map = block_group->physical_map;
if (!(block_group->flags &
(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index b9cec523b778..f573bda496fb 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -45,7 +45,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev);
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
-int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
+int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt);
int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
u64 *bytenr_ret);
int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
@@ -59,8 +59,6 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size);
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new);
void btrfs_calc_zone_unusable(struct btrfs_block_group *cache);
-void btrfs_redirty_list_add(struct btrfs_transaction *trans,
- struct extent_buffer *eb);
bool btrfs_use_zone_append(struct btrfs_bio *bbio);
void btrfs_record_physical_zoned(struct btrfs_bio *bbio);
int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
@@ -123,7 +121,8 @@ static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info)
return -EOPNOTSUPP;
}
-static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info,
+ unsigned long *mount_opt)
{
return 0;
}
@@ -180,9 +179,6 @@ static inline int btrfs_load_block_group_zone_info(
static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { }
-static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans,
- struct extent_buffer *eb) { }
-
static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
return false;
@@ -323,8 +319,8 @@ static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_i
(bdev_zone_sectors(bdev) << SECTOR_SHIFT);
}
- /* Do not allow Host Manged zoned device */
- return bdev_zoned_model(bdev) != BLK_ZONED_HM;
+ /* Do not allow Host Managed zoned device. */
+ return !bdev_is_zoned(bdev);
}
static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 pos)
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 5511766485cd..0d66db8bc1d4 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -410,9 +410,8 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
-
/* Allocate and map in the output buffer */
- out_page = alloc_page(GFP_NOFS);
+ out_page = btrfs_alloc_compr_page();
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
@@ -457,7 +456,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS);
+ out_page = btrfs_alloc_compr_page();
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
@@ -514,7 +513,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS);
+ out_page = btrfs_alloc_compr_page();
if (out_page == NULL) {
ret = -ENOMEM;
goto out;