diff options
Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r-- | fs/btrfs/extent_io.c | 170 |
1 files changed, 117 insertions, 53 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0d16ed19e84b..38ac638665a6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1537,8 +1537,8 @@ out: } /** - * find_first_clear_extent_bit - finds the first range that has @bits not set - * and that starts after @start + * find_first_clear_extent_bit - find the first range that has @bits not set. + * This range could start before @start. * * @tree - the tree to search * @start - the offset at/after which the found extent should start @@ -1562,28 +1562,72 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, /* Find first extent with bits cleared */ while (1) { node = __etree_search(tree, start, &next, &prev, NULL, NULL); - if (!node) { + if (!node && !next && !prev) { + /* + * Tree is completely empty, send full range and let + * caller deal with it + */ + *start_ret = 0; + *end_ret = -1; + goto out; + } else if (!node && !next) { + /* + * We are past the last allocated chunk, set start at + * the end of the last extent. + */ + state = rb_entry(prev, struct extent_state, rb_node); + *start_ret = state->end + 1; + *end_ret = -1; + goto out; + } else if (!node) { node = next; - if (!node) { + } + /* + * At this point 'node' either contains 'start' or start is + * before 'node' + */ + state = rb_entry(node, struct extent_state, rb_node); + + if (in_range(start, state->start, state->end - state->start + 1)) { + if (state->state & bits) { /* - * We are past the last allocated chunk, - * set start at the end of the last extent. The - * device alloc tree should never be empty so - * prev is always set. + * |--range with bits sets--| + * | + * start */ - ASSERT(prev); - state = rb_entry(prev, struct extent_state, rb_node); - *start_ret = state->end + 1; - *end_ret = -1; - goto out; + start = state->end + 1; + } else { + /* + * 'start' falls within a range that doesn't + * have the bits set, so take its start as + * the beginning of the desired range + * + * |--range with bits cleared----| + * | + * start + */ + *start_ret = state->start; + break; } - } - state = rb_entry(node, struct extent_state, rb_node); - if (in_range(start, state->start, state->end - state->start + 1) && - (state->state & bits)) { - start = state->end + 1; } else { - *start_ret = start; + /* + * |---prev range---|---hole/unset---|---node range---| + * | + * start + * + * or + * + * |---hole/unset--||--first node--| + * 0 | + * start + */ + if (prev) { + state = rb_entry(prev, struct extent_state, + rb_node); + *start_ret = state->end + 1; + } else { + *start_ret = 0; + } break; } } @@ -1838,7 +1882,7 @@ static int __process_pages_contig(struct address_space *mapping, if (page_ops & PAGE_SET_PRIVATE2) SetPagePrivate2(pages[i]); - if (pages[i] == locked_page) { + if (locked_page && pages[i] == locked_page) { put_page(pages[i]); pages_locked++; continue; @@ -1858,7 +1902,8 @@ static int __process_pages_contig(struct address_space *mapping, if (!PageDirty(pages[i]) || pages[i]->mapping != mapping) { unlock_page(pages[i]); - put_page(pages[i]); + for (; i < ret; i++) + put_page(pages[i]); err = -EAGAIN; goto out; } @@ -3901,6 +3946,11 @@ int btree_write_cache_pages(struct address_space *mapping, if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; + /* + * Start from the beginning does not need to cycle over the + * range, mark it as scanned. + */ + scanned = (index == 0); } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; @@ -3918,7 +3968,6 @@ retry: tag))) { unsigned i; - scanned = 1; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -4043,6 +4092,11 @@ static int extent_write_cache_pages(struct address_space *mapping, if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; + /* + * Start from the beginning does not need to cycle over the + * range, mark it as scanned. + */ + scanned = (index == 0); } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; @@ -4076,11 +4130,10 @@ retry: &index, end, tag))) { unsigned i; - scanned = 1; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - done_index = page->index; + done_index = page->index + 1; /* * At this point we hold neither the i_pages lock nor * the page lock: the page may be truncated or @@ -4115,16 +4168,6 @@ retry: ret = __extent_writepage(page, wbc, epd); if (ret < 0) { - /* - * done_index is set past this page, - * so media errors will not choke - * background writeout for the entire - * file. This has consequences for - * range_cyclic semantics (ie. it may - * not be suitable for data integrity - * writeout). - */ - done_index = page->index + 1; done = 1; break; } @@ -4146,7 +4189,16 @@ retry: */ scanned = 1; index = 0; - goto retry; + + /* + * If we're looping we could run into a page that is locked by a + * writer and that writer could be waiting on writeback for a + * page in our current bio, and thus deadlock, so flush the + * write bio here. + */ + ret = flush_write_bio(epd); + if (!ret) + goto retry; } if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) @@ -4397,6 +4449,8 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) /* once for us */ free_extent_map(em); + + cond_resched(); /* Allow large-extent preemption. */ } } return try_release_extent_state(tree, page, mask); @@ -4930,25 +4984,28 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, static void check_buffer_tree_ref(struct extent_buffer *eb) { int refs; - /* the ref bit is tricky. We have to make sure it is set - * if we have the buffer dirty. Otherwise the - * code to free a buffer can end up dropping a dirty - * page + /* + * The TREE_REF bit is first set when the extent_buffer is added + * to the radix tree. It is also reset, if unset, when a new reference + * is created by find_extent_buffer. * - * Once the ref bit is set, it won't go away while the - * buffer is dirty or in writeback, and it also won't - * go away while we have the reference count on the - * eb bumped. + * It is only cleared in two cases: freeing the last non-tree + * reference to the extent_buffer when its STALE bit is set or + * calling releasepage when the tree reference is the only reference. * - * We can't just set the ref bit without bumping the - * ref on the eb because free_extent_buffer might - * see the ref bit and try to clear it. If this happens - * free_extent_buffer might end up dropping our original - * ref by mistake and freeing the page before we are able - * to add one more ref. + * In both cases, care is taken to ensure that the extent_buffer's + * pages are not under io. However, releasepage can be concurrently + * called with creating new references, which is prone to race + * conditions between the calls to check_buffer_tree_ref in those + * codepaths and clearing TREE_REF in try_release_extent_buffer. * - * So bump the ref count first, then set the bit. If someone - * beat us to it, drop the ref we added. + * The actual lifetime of the extent_buffer in the radix tree is + * adequately protected by the refcount, but the TREE_REF bit and + * its corresponding reference are not. To protect against this + * class of races, we call check_buffer_tree_ref from the codepaths + * which trigger io after they set eb->io_pages. Note that once io is + * initiated, TREE_REF can no longer be cleared, so that is the + * moment at which any such race is best fixed. */ refs = atomic_read(&eb->refs); if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) @@ -5025,12 +5082,14 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, return eb; eb = alloc_dummy_extent_buffer(fs_info, start); if (!eb) - return NULL; + return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; again: ret = radix_tree_preload(GFP_NOFS); - if (ret) + if (ret) { + exists = ERR_PTR(ret); goto free_eb; + } spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, start >> PAGE_SHIFT, eb); @@ -5396,6 +5455,11 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; atomic_set(&eb->io_pages, num_reads); + /* + * It is possible for releasepage to clear the TREE_REF bit before we + * set io_pages. See check_buffer_tree_ref for a more detailed comment. + */ + check_buffer_tree_ref(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; |