aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/extent_io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r--fs/btrfs/extent_io.c170
1 files changed, 117 insertions, 53 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0d16ed19e84b..38ac638665a6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1537,8 +1537,8 @@ out:
}
/**
- * find_first_clear_extent_bit - finds the first range that has @bits not set
- * and that starts after @start
+ * find_first_clear_extent_bit - find the first range that has @bits not set.
+ * This range could start before @start.
*
* @tree - the tree to search
* @start - the offset at/after which the found extent should start
@@ -1562,28 +1562,72 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
/* Find first extent with bits cleared */
while (1) {
node = __etree_search(tree, start, &next, &prev, NULL, NULL);
- if (!node) {
+ if (!node && !next && !prev) {
+ /*
+ * Tree is completely empty, send full range and let
+ * caller deal with it
+ */
+ *start_ret = 0;
+ *end_ret = -1;
+ goto out;
+ } else if (!node && !next) {
+ /*
+ * We are past the last allocated chunk, set start at
+ * the end of the last extent.
+ */
+ state = rb_entry(prev, struct extent_state, rb_node);
+ *start_ret = state->end + 1;
+ *end_ret = -1;
+ goto out;
+ } else if (!node) {
node = next;
- if (!node) {
+ }
+ /*
+ * At this point 'node' either contains 'start' or start is
+ * before 'node'
+ */
+ state = rb_entry(node, struct extent_state, rb_node);
+
+ if (in_range(start, state->start, state->end - state->start + 1)) {
+ if (state->state & bits) {
/*
- * We are past the last allocated chunk,
- * set start at the end of the last extent. The
- * device alloc tree should never be empty so
- * prev is always set.
+ * |--range with bits sets--|
+ * |
+ * start
*/
- ASSERT(prev);
- state = rb_entry(prev, struct extent_state, rb_node);
- *start_ret = state->end + 1;
- *end_ret = -1;
- goto out;
+ start = state->end + 1;
+ } else {
+ /*
+ * 'start' falls within a range that doesn't
+ * have the bits set, so take its start as
+ * the beginning of the desired range
+ *
+ * |--range with bits cleared----|
+ * |
+ * start
+ */
+ *start_ret = state->start;
+ break;
}
- }
- state = rb_entry(node, struct extent_state, rb_node);
- if (in_range(start, state->start, state->end - state->start + 1) &&
- (state->state & bits)) {
- start = state->end + 1;
} else {
- *start_ret = start;
+ /*
+ * |---prev range---|---hole/unset---|---node range---|
+ * |
+ * start
+ *
+ * or
+ *
+ * |---hole/unset--||--first node--|
+ * 0 |
+ * start
+ */
+ if (prev) {
+ state = rb_entry(prev, struct extent_state,
+ rb_node);
+ *start_ret = state->end + 1;
+ } else {
+ *start_ret = 0;
+ }
break;
}
}
@@ -1838,7 +1882,7 @@ static int __process_pages_contig(struct address_space *mapping,
if (page_ops & PAGE_SET_PRIVATE2)
SetPagePrivate2(pages[i]);
- if (pages[i] == locked_page) {
+ if (locked_page && pages[i] == locked_page) {
put_page(pages[i]);
pages_locked++;
continue;
@@ -1858,7 +1902,8 @@ static int __process_pages_contig(struct address_space *mapping,
if (!PageDirty(pages[i]) ||
pages[i]->mapping != mapping) {
unlock_page(pages[i]);
- put_page(pages[i]);
+ for (; i < ret; i++)
+ put_page(pages[i]);
err = -EAGAIN;
goto out;
}
@@ -3901,6 +3946,11 @@ int btree_write_cache_pages(struct address_space *mapping,
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
+ /*
+ * Start from the beginning does not need to cycle over the
+ * range, mark it as scanned.
+ */
+ scanned = (index == 0);
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
@@ -3918,7 +3968,6 @@ retry:
tag))) {
unsigned i;
- scanned = 1;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
@@ -4043,6 +4092,11 @@ static int extent_write_cache_pages(struct address_space *mapping,
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
+ /*
+ * Start from the beginning does not need to cycle over the
+ * range, mark it as scanned.
+ */
+ scanned = (index == 0);
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
@@ -4076,11 +4130,10 @@ retry:
&index, end, tag))) {
unsigned i;
- scanned = 1;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- done_index = page->index;
+ done_index = page->index + 1;
/*
* At this point we hold neither the i_pages lock nor
* the page lock: the page may be truncated or
@@ -4115,16 +4168,6 @@ retry:
ret = __extent_writepage(page, wbc, epd);
if (ret < 0) {
- /*
- * done_index is set past this page,
- * so media errors will not choke
- * background writeout for the entire
- * file. This has consequences for
- * range_cyclic semantics (ie. it may
- * not be suitable for data integrity
- * writeout).
- */
- done_index = page->index + 1;
done = 1;
break;
}
@@ -4146,7 +4189,16 @@ retry:
*/
scanned = 1;
index = 0;
- goto retry;
+
+ /*
+ * If we're looping we could run into a page that is locked by a
+ * writer and that writer could be waiting on writeback for a
+ * page in our current bio, and thus deadlock, so flush the
+ * write bio here.
+ */
+ ret = flush_write_bio(epd);
+ if (!ret)
+ goto retry;
}
if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
@@ -4397,6 +4449,8 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
/* once for us */
free_extent_map(em);
+
+ cond_resched(); /* Allow large-extent preemption. */
}
}
return try_release_extent_state(tree, page, mask);
@@ -4930,25 +4984,28 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
int refs;
- /* the ref bit is tricky. We have to make sure it is set
- * if we have the buffer dirty. Otherwise the
- * code to free a buffer can end up dropping a dirty
- * page
+ /*
+ * The TREE_REF bit is first set when the extent_buffer is added
+ * to the radix tree. It is also reset, if unset, when a new reference
+ * is created by find_extent_buffer.
*
- * Once the ref bit is set, it won't go away while the
- * buffer is dirty or in writeback, and it also won't
- * go away while we have the reference count on the
- * eb bumped.
+ * It is only cleared in two cases: freeing the last non-tree
+ * reference to the extent_buffer when its STALE bit is set or
+ * calling releasepage when the tree reference is the only reference.
*
- * We can't just set the ref bit without bumping the
- * ref on the eb because free_extent_buffer might
- * see the ref bit and try to clear it. If this happens
- * free_extent_buffer might end up dropping our original
- * ref by mistake and freeing the page before we are able
- * to add one more ref.
+ * In both cases, care is taken to ensure that the extent_buffer's
+ * pages are not under io. However, releasepage can be concurrently
+ * called with creating new references, which is prone to race
+ * conditions between the calls to check_buffer_tree_ref in those
+ * codepaths and clearing TREE_REF in try_release_extent_buffer.
*
- * So bump the ref count first, then set the bit. If someone
- * beat us to it, drop the ref we added.
+ * The actual lifetime of the extent_buffer in the radix tree is
+ * adequately protected by the refcount, but the TREE_REF bit and
+ * its corresponding reference are not. To protect against this
+ * class of races, we call check_buffer_tree_ref from the codepaths
+ * which trigger io after they set eb->io_pages. Note that once io is
+ * initiated, TREE_REF can no longer be cleared, so that is the
+ * moment at which any such race is best fixed.
*/
refs = atomic_read(&eb->refs);
if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
@@ -5025,12 +5082,14 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
eb = alloc_dummy_extent_buffer(fs_info, start);
if (!eb)
- return NULL;
+ return ERR_PTR(-ENOMEM);
eb->fs_info = fs_info;
again:
ret = radix_tree_preload(GFP_NOFS);
- if (ret)
+ if (ret) {
+ exists = ERR_PTR(ret);
goto free_eb;
+ }
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
start >> PAGE_SHIFT, eb);
@@ -5396,6 +5455,11 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = 0;
atomic_set(&eb->io_pages, num_reads);
+ /*
+ * It is possible for releasepage to clear the TREE_REF bit before we
+ * set io_pages. See check_buffer_tree_ref for a more detailed comment.
+ */
+ check_buffer_tree_ref(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];