diff options
Diffstat (limited to 'block/bio.c')
-rw-r--r-- | block/bio.c | 331 |
1 files changed, 199 insertions, 132 deletions
diff --git a/block/bio.c b/block/bio.c index 4db1008309ed..67bba12d273b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public Licens - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- - * */ #include <linux/mm.h> #include <linux/swap.h> @@ -647,26 +634,68 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) } EXPORT_SYMBOL(bio_clone_fast); +static inline bool page_is_mergeable(const struct bio_vec *bv, + struct page *page, unsigned int len, unsigned int off, + bool *same_page) +{ + phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + + bv->bv_offset + bv->bv_len - 1; + phys_addr_t page_addr = page_to_phys(page); + + if (vec_end_addr + 1 != page_addr + off) + return false; + if (xen_domain() && !xen_biovec_phys_mergeable(bv, page)) + return false; + + *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); + if (!*same_page && pfn_to_page(PFN_DOWN(vec_end_addr)) + 1 != page) + return false; + return true; +} + +/* + * Check if the @page can be added to the current segment(@bv), and make + * sure to call it only if page_is_mergeable(@bv, @page) is true + */ +static bool can_add_page_to_seg(struct request_queue *q, + struct bio_vec *bv, struct page *page, unsigned len, + unsigned offset) +{ + unsigned long mask = queue_segment_boundary(q); + phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset; + phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; + + if ((addr1 | mask) != (addr2 | mask)) + return false; + + if (bv->bv_len + len > queue_max_segment_size(q)) + return false; + + return true; +} + /** - * bio_add_pc_page - attempt to add page to bio + * __bio_add_pc_page - attempt to add page to passthrough bio * @q: the target queue * @bio: destination bio * @page: page to add * @len: vec entry length * @offset: vec entry offset + * @put_same_page: put the page if it is same with last added page * * Attempt to add a page to the bio_vec maplist. This can fail for a * number of reasons, such as the bio being full or target block device * limitations. The target block device must allow bio's up to PAGE_SIZE, * so it is always possible to add a single page to an empty bio. * - * This should only be used by REQ_PC bios. + * This should only be used by passthrough bios. */ -int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page - *page, unsigned int len, unsigned int offset) +static int __bio_add_pc_page(struct request_queue *q, struct bio *bio, + struct page *page, unsigned int len, unsigned int offset, + bool put_same_page) { - int retried_segments = 0; struct bio_vec *bvec; + bool same_page = false; /* * cloned bio must not modify vec list @@ -677,18 +706,14 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page if (((bio->bi_iter.bi_size + len) >> 9) > queue_max_hw_sectors(q)) return 0; - /* - * For filesystems with a blocksize smaller than the pagesize - * we will often be called with the same page as last time and - * a consecutive offset. Optimize this special case. - */ if (bio->bi_vcnt > 0) { - struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; + bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; - if (page == prev->bv_page && - offset == prev->bv_offset + prev->bv_len) { - prev->bv_len += len; - bio->bi_iter.bi_size += len; + if (page == bvec->bv_page && + offset == bvec->bv_offset + bvec->bv_len) { + if (put_same_page) + put_page(page); + bvec->bv_len += len; goto done; } @@ -696,72 +721,59 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. */ - if (bvec_gap_to_prev(q, prev, offset)) + if (bvec_gap_to_prev(q, bvec, offset)) return 0; + + if (page_is_mergeable(bvec, page, len, offset, &same_page) && + can_add_page_to_seg(q, bvec, page, len, offset)) { + bvec->bv_len += len; + goto done; + } } - if (bio_full(bio)) + if (bio_full(bio, len)) + return 0; + + if (bio->bi_phys_segments >= queue_max_segments(q)) return 0; - /* - * setup the new entry, we might clear it again later if we - * cannot add the page - */ bvec = &bio->bi_io_vec[bio->bi_vcnt]; bvec->bv_page = page; bvec->bv_len = len; bvec->bv_offset = offset; bio->bi_vcnt++; - bio->bi_phys_segments++; - bio->bi_iter.bi_size += len; - - /* - * Perform a recount if the number of segments is greater - * than queue_max_segments(q). - */ - - while (bio->bi_phys_segments > queue_max_segments(q)) { - - if (retried_segments) - goto failed; - - retried_segments = 1; - blk_recount_segments(q, bio); - } - - /* If we may be able to merge these biovecs, force a recount */ - if (bio->bi_vcnt > 1 && biovec_phys_mergeable(q, bvec - 1, bvec)) - bio_clear_flag(bio, BIO_SEG_VALID); - done: + bio->bi_iter.bi_size += len; + bio->bi_phys_segments = bio->bi_vcnt; + bio_set_flag(bio, BIO_SEG_VALID); return len; +} - failed: - bvec->bv_page = NULL; - bvec->bv_len = 0; - bvec->bv_offset = 0; - bio->bi_vcnt--; - bio->bi_iter.bi_size -= len; - blk_recount_segments(q, bio); - return 0; +int bio_add_pc_page(struct request_queue *q, struct bio *bio, + struct page *page, unsigned int len, unsigned int offset) +{ + return __bio_add_pc_page(q, bio, page, len, offset, false); } EXPORT_SYMBOL(bio_add_pc_page); /** * __bio_try_merge_page - try appending data to an existing bvec. * @bio: destination bio - * @page: page to add + * @page: start page to add * @len: length of the data to add - * @off: offset of the data in @page + * @off: offset of the data relative to @page + * @same_page: return if the segment has been merged inside the same page * * Try to add the data at @page + @off to the last bvec of @bio. This is a * a useful optimisation for file systems with a block size smaller than the * page size. * + * Warn if (@len, @off) crosses pages in case that @same_page is true. + * * Return %true on success or %false on failure. */ bool __bio_try_merge_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int off) + unsigned int len, unsigned int off, bool *same_page) { if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return false; @@ -769,7 +781,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page, if (bio->bi_vcnt > 0) { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; - if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) { + if (page_is_mergeable(bv, page, len, off, same_page)) { bv->bv_len += len; bio->bi_iter.bi_size += len; return true; @@ -780,11 +792,11 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page, EXPORT_SYMBOL_GPL(__bio_try_merge_page); /** - * __bio_add_page - add page to a bio in a new segment + * __bio_add_page - add page(s) to a bio in a new segment * @bio: destination bio - * @page: page to add - * @len: length of the data to add - * @off: offset of the data in @page + * @page: start page to add + * @len: length of the data to add, may cross pages + * @off: offset of the data relative to @page, may cross pages * * Add the data at @page + @off to @bio as a new bvec. The caller must ensure * that @bio has space for another bvec. @@ -795,7 +807,7 @@ void __bio_add_page(struct bio *bio, struct page *page, struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); - WARN_ON_ONCE(bio_full(bio)); + WARN_ON_ONCE(bio_full(bio, len)); bv->bv_page = page; bv->bv_offset = off; @@ -807,20 +819,22 @@ void __bio_add_page(struct bio *bio, struct page *page, EXPORT_SYMBOL_GPL(__bio_add_page); /** - * bio_add_page - attempt to add page to bio + * bio_add_page - attempt to add page(s) to bio * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset + * @page: start page to add + * @len: vec entry length, may cross pages + * @offset: vec entry offset relative to @page, may cross pages * - * Attempt to add a page to the bio_vec maplist. This will only fail + * Attempt to add page(s) to the bio_vec maplist. This will only fail * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio. */ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { - if (!__bio_try_merge_page(bio, page, len, offset)) { - if (bio_full(bio)) + bool same_page = false; + + if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { + if (bio_full(bio, len)) return 0; __bio_add_page(bio, page, len, offset); } @@ -828,6 +842,42 @@ int bio_add_page(struct bio *bio, struct page *page, } EXPORT_SYMBOL(bio_add_page); +static void bio_get_pages(struct bio *bio) +{ + struct bvec_iter_all iter_all; + struct bio_vec *bvec; + + bio_for_each_segment_all(bvec, bio, iter_all) + get_page(bvec->bv_page); +} + +static void bio_release_pages(struct bio *bio) +{ + struct bvec_iter_all iter_all; + struct bio_vec *bvec; + + bio_for_each_segment_all(bvec, bio, iter_all) + put_page(bvec->bv_page); +} + +static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) +{ + const struct bio_vec *bv = iter->bvec; + unsigned int len; + size_t size; + + if (WARN_ON_ONCE(iter->iov_offset > bv->bv_len)) + return -EINVAL; + + len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count); + size = bio_add_page(bio, bv->bv_page, len, + bv->bv_offset + iter->iov_offset); + if (unlikely(size != len)) + return -EINVAL; + iov_iter_advance(iter, size); + return 0; +} + #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) /** @@ -846,6 +896,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; + bool same_page = false; ssize_t size, left; unsigned len, i; size_t offset; @@ -866,8 +917,15 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) struct page *page = pages[i]; len = min_t(size_t, PAGE_SIZE - offset, left); - if (WARN_ON_ONCE(bio_add_page(bio, page, len, offset) != len)) - return -EINVAL; + + if (__bio_try_merge_page(bio, page, len, offset, &same_page)) { + if (same_page) + put_page(page); + } else { + if (WARN_ON_ONCE(bio_full(bio, len))) + return -EINVAL; + __bio_add_page(bio, page, len, offset); + } offset = 0; } @@ -876,30 +934,46 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) } /** - * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio + * bio_iov_iter_get_pages - add user or kernel pages to a bio * @bio: bio to add pages to - * @iter: iov iterator describing the region to be mapped + * @iter: iov iterator describing the region to be added + * + * This takes either an iterator pointing to user memory, or one pointing to + * kernel pages (BVEC iterator). If we're adding user pages, we pin them and + * map them into the kernel. On IO completion, the caller should put those + * pages. If we're adding kernel pages, and the caller told us it's safe to + * do so, we just have to add the pages to the bio directly. We don't grab an + * extra reference to those pages (the user should already have that), and we + * don't put the page on IO completion. The caller needs to check if the bio is + * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be + * released. * - * Pins pages from *iter and appends them to @bio's bvec array. The - * pages will have to be released using put_page() when done. * The function tries, but does not guarantee, to pin as many pages as - * fit into the bio, or are requested in *iter, whatever is smaller. - * If MM encounters an error pinning the requested pages, it stops. - * Error is returned only if 0 pages could be pinned. + * fit into the bio, or are requested in *iter, whatever is smaller. If + * MM encounters an error pinning the requested pages, it stops. Error + * is returned only if 0 pages could be pinned. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { - unsigned short orig_vcnt = bio->bi_vcnt; + const bool is_bvec = iov_iter_is_bvec(iter); + int ret; - do { - int ret = __bio_iov_iter_get_pages(bio, iter); + if (WARN_ON_ONCE(bio->bi_vcnt)) + return -EINVAL; - if (unlikely(ret)) - return bio->bi_vcnt > orig_vcnt ? 0 : ret; + do { + if (is_bvec) + ret = __bio_iov_bvec_add_pages(bio, iter); + else + ret = __bio_iov_iter_get_pages(bio, iter); + } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); - } while (iov_iter_count(iter) && !bio_full(bio)); + if (iov_iter_bvec_no_ref(iter)) + bio_set_flag(bio, BIO_NO_PAGE_REF); + else if (is_bvec) + bio_get_pages(bio); - return 0; + return bio->bi_vcnt ? 0 : ret; } static void submit_bio_wait_endio(struct bio *bio) @@ -1070,10 +1144,10 @@ static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, */ static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter) { - int i; struct bio_vec *bvec; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter_all) { ssize_t ret; ret = copy_page_from_iter(bvec->bv_page, @@ -1101,10 +1175,10 @@ static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter) */ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) { - int i; struct bio_vec *bvec; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter_all) { ssize_t ret; ret = copy_page_to_iter(bvec->bv_page, @@ -1125,9 +1199,9 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; - int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, iter_all) __free_page(bvec->bv_page); } EXPORT_SYMBOL(bio_free_pages); @@ -1238,8 +1312,11 @@ struct bio *bio_copy_user_iov(struct request_queue *q, } } - if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) + if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { + if (!map_data) + __free_page(page); break; + } len -= bytes; offset = 0; @@ -1295,6 +1372,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, struct bio *bio; int ret; struct bio_vec *bvec; + struct bvec_iter_all iter_all; if (!iov_iter_count(iter)) return ERR_PTR(-EINVAL); @@ -1324,21 +1402,14 @@ struct bio *bio_map_user_iov(struct request_queue *q, for (j = 0; j < npages; j++) { struct page *page = pages[j]; unsigned int n = PAGE_SIZE - offs; - unsigned short prev_bi_vcnt = bio->bi_vcnt; if (n > bytes) n = bytes; - if (!bio_add_pc_page(q, bio, page, n, offs)) + if (!__bio_add_pc_page(q, bio, page, n, offs, + true)) break; - /* - * check if vector was merged with previous - * drop page reference if needed - */ - if (bio->bi_vcnt == prev_bi_vcnt) - put_page(page); - added += n; bytes -= n; offs = 0; @@ -1368,7 +1439,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, return bio; out_unmap: - bio_for_each_segment_all(bvec, bio, j) { + bio_for_each_segment_all(bvec, bio, iter_all) { put_page(bvec->bv_page); } bio_put(bio); @@ -1378,12 +1449,12 @@ struct bio *bio_map_user_iov(struct request_queue *q, static void __bio_unmap_user(struct bio *bio) { struct bio_vec *bvec; - int i; + struct bvec_iter_all iter_all; /* * make sure we dirty pages we wrote to */ - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter_all) { if (bio_data_dir(bio) == READ) set_page_dirty_lock(bvec->bv_page); @@ -1474,9 +1545,9 @@ static void bio_copy_kern_endio_read(struct bio *bio) { char *p = bio->bi_private; struct bio_vec *bvec; - int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter_all) { memcpy(p, page_address(bvec->bv_page), bvec->bv_len); p += bvec->bv_len; } @@ -1584,23 +1655,14 @@ cleanup: void bio_set_pages_dirty(struct bio *bio) { struct bio_vec *bvec; - int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter_all) { if (!PageCompound(bvec->bv_page)) set_page_dirty_lock(bvec->bv_page); } } -static void bio_release_pages(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - - bio_for_each_segment_all(bvec, bio, i) - put_page(bvec->bv_page); -} - /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. * If they are, then fine. If, however, some pages are clean then they must @@ -1634,7 +1696,8 @@ static void bio_dirty_fn(struct work_struct *work) next = bio->bi_private; bio_set_pages_dirty(bio); - bio_release_pages(bio); + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) + bio_release_pages(bio); bio_put(bio); } } @@ -1643,14 +1706,15 @@ void bio_check_pages_dirty(struct bio *bio) { struct bio_vec *bvec; unsigned long flags; - int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter_all) { if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page)) goto defer; } - bio_release_pages(bio); + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) + bio_release_pages(bio); bio_put(bio); return; defer: @@ -2132,6 +2196,9 @@ static int __init init_bio(void) bio_slab_nr = 0; bio_slabs = kcalloc(bio_slab_max, sizeof(struct bio_slab), GFP_KERNEL); + + BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET); + if (!bio_slabs) panic("bio: can't allocate bios\n"); |